Talk:Ns urlencode
Revision as of 18:11, 7 October 2005 by Dossy (talk | contribs) (Moved content from Discussion of ns urlencode to this discussion page)
RHS 21Oct2004
The command ns_urlencode currently doesn't do what its name implies. It encodes everything as if it were a query value, which means that you can do:
set url "http://www.aolserver.com/a/page/name?"
set queryList {}
foreach {key value} {key1 val1 key2 val2 ... ... keyN valN} {
    lappend queryList [ns_urlencode $key]=[ns_urlencode $value]
}
append url [join $queryList &]
# http://www.aolserver.com/a/page/name?key1=val1&key2=val2&%2e%2e%2e=%2e%2e%2e&keyN=valN
You cannot, however, do the following and have it behave correctly:
set url [ns_urlencode {http://www.aolserver.com/a/page/name?key1=value1&key2=value2&....&keyN=valueN}]
# http%3a%2f%2fwww%2eaolserver%2ecom%2fa%2fpage%2fname%3fkey1%3dvalue1%26key2%3dvalue2%26%2e%2e%2e%2e%26keyN%3dvalueN
I'd like to change ns_urlencode (and ns_urldecode, which has its own issues) to be able to handle actual URLs (or, more precisely, URIs). The discussions I've had with Dossy on the subject led to two possible solutions:
1. Add flags to ns_urlencode (and ns_urldecode) for -url and -query, which would change the behavior to either "treat this as a full url, and escape it as needed" or "treat this as a query element, and escape it as needed". The default behavior would be that of -query (the same as it is now).
2. Add a new command ns_url (with subcommands encode and decode) which would take the same flags, but whose default would be to treat the input as if -url was specified. The original commands would then be deprecated (but still available for a long time).
Questions:
- How to encode things. I would assume the default encoding mechanism would follow the rules of HTTP (i.e., use + for a space in the query section, but %20 in the resource section)?
- Is it worthwhile implementing any other schemes, or should they all just follow the http lead?
- Should the scheme be added as a flag, determined from the url (which may or may not have the scheme) or both?
- Are there any test cases out there for url encoding? I searched, but couldn't find any.
The code I have currently:
# File: url.tcl
# Create the ::ns::url namespace so the encode* helper procs below can be
# defined inside it.
namespace eval ::ns::url {
}
# ns_urlencode ?-charset charset? ?-query | -url | -html? data
#
#   Percent-encodes data.
#
#   Options (any number, in any order, before the final data argument):
#     -charset charset  Encoding used to turn data into bytes before
#                       escaping (default utf-8).
#     -query            Escape the whole string as a single query value
#                       (the default).
#     -url              Treat data as a full URL and escape the resource,
#                       query string and fragment parts separately.
#     -html             Same as -url, then replace &, <, > and " with HTML
#                       character entities so the result can be embedded
#                       in an HTML page.
#
#   Returns the encoded string.
#   Raises an error when no data argument is given or an unknown option is
#   supplied.
proc ns_urlencode {args} {
    if { [llength $args] == 0 } {
        error "wrong # args: should be \"ns_urlencode ... data\""
    }

    set charset utf-8
    set mode query

    # Everything except the last argument must be an option.
    while { [llength $args] > 1 } {
        switch -exact -- [lindex $args 0] {
            -charset {
                set charset [lindex $args 1]
                set args [lrange $args 2 end]
            }
            -query {
                set mode query
                set args [lrange $args 1 end]
            }
            -url {
                set mode url
                set args [lrange $args 1 end]
            }
            -html {
                set mode html
                set args [lrange $args 1 end]
            }
            default {
                error "invalid option \"[lindex $args 0]\": should be \"ns_urlencode ... data\""
            }
        }
    }

    # "-charset value" with nothing after it consumes every argument;
    # report the missing data instead of silently encoding "".
    if { [llength $args] != 1 } {
        error "wrong # args: should be \"ns_urlencode ... data\""
    }

    # Turn the Tcl string into the bytes of the requested charset; the
    # helpers below escape one byte at a time.
    set url [encoding convertto $charset [lindex $args 0]]

    switch -exact -- $mode {
        query {
            set result [::ns::url::encodeQuery $url]
        }
        url {
            set result [::ns::url::encodeUrl $url]
        }
        html {
            # Entity-escape the characters that are special in HTML.
            set result [string map [list \
                & {&amp;} \
                < {&lt;} \
                > {&gt;} \
                \" {&quot;}] [::ns::url::encodeUrl $url]]
        }
        default {
            error "should not happen"
        }
    }
    return $result
}
# ns_urldecode ?-charset charset? data
#
#   Decodes a percent-encoded string: every %XX escape becomes the byte
#   with that hex value and every + becomes a space. The resulting bytes
#   are then interpreted in the given charset (default utf-8).
#
#   Returns the decoded string.
#   Raises an error when no data argument is given or an unknown option is
#   supplied.
proc ns_urldecode {args} {
    if { [llength $args] == 0 } {
        error "wrong # args: should be \"ns_urldecode ... data\""
    }

    set charset utf-8

    # Everything except the last argument must be an option.
    while { [llength $args] > 1 } {
        switch -exact -- [lindex $args 0] {
            -charset {
                set charset [lindex $args 1]
                set args [lrange $args 2 end]
            }
            default {
                error "invalid option \"[lindex $args 0]\": should be \"ns_urldecode ... data\""
            }
        }
    }

    # "-charset value" with nothing after it consumes every argument;
    # report the missing data instead of silently decoding "".
    if { [llength $args] != 1 } {
        error "wrong # args: should be \"ns_urldecode ... data\""
    }
    set url [lindex $args 0]

    # Build one substitution table and apply it in a single pass, so the
    # output of one replacement is never decoded a second time
    # (%2523 -> %23, not #).
    set mapList {+ { }}
    foreach {str code} [regexp -all -inline {%([0-9a-fA-F]{2})} $url] {
        lappend mapList $str [format %c [scan $code %x]]
    }
    set url [string map $mapList $url]

    return [encoding convertfrom $charset $url]
}
# ::ns::url::encodeQuery url
#
#   Escapes a string as a single query key or value: ASCII letters and
#   digits pass through, a space becomes +, and every other character
#   becomes %xx (lowercase hex, always at least two digits).
#
#   Returns the escaped string.
proc ::ns::url::encodeQuery {url} {
    set output ""
    foreach elem [split $url ""] {
        if { [string equal " " $elem] } {
            append output +
        } elseif { [regexp -- {^[a-zA-Z0-9]$} $elem] } {
            append output $elem
        } else {
            scan $elem %c code
            # %02x, not %x: character codes below 0x10 must still yield a
            # two-digit escape (%0a rather than the invalid %a).
            append output [format %%%02x $code]
        }
    }
    return $output
}
# ::ns::url::encodeResource url
#
#   Escapes the resource part of a URL (scheme, authority and path).
#   Letters, digits, the mark characters -_.!~*'() and the characters
#   : @ & = + $ , / # pass through; everything else (including a space,
#   which becomes %20) is written as %xx in lowercase hex with at least
#   two digits. When the input holds more than one #, all but the last
#   are escaped as %23.
#
#   Returns the escaped string.
proc ::ns::url::encodeResource {url} {
    set output ""
    # alphanum | mark | other | special, merged into one character class.
    set safeRE {^[-a-zA-Z0-9_.!~*'():@&=+$,/#]$}

    # Number of # characters still ahead; only the final one stays literal.
    set fragCount [regexp -all -- {#} $url]

    foreach elem [split $url ""] {
        if { ![regexp -- $safeRE $elem] } {
            scan $elem %c code
            # %02x, not %x: character codes below 0x10 must still yield a
            # two-digit escape (%09 rather than the invalid %9).
            set elem [format %%%02x $code]
        } elseif { [string equal "#" $elem] && ($fragCount > 1) } {
            set elem %23
            incr fragCount -1
        }
        append output $elem
    }
    return $output
}
# ::ns::url::encodeQueryString query
#
#   Escapes a complete query string (the text after the ?). The string is
#   split on &; within each element the first = separates key from value
#   and is kept literal, while key and value are each escaped with
#   encodeQuery. An element without = is escaped as a whole.
#
#   Returns the escaped elements joined with &.
proc ::ns::url::encodeQueryString {query} {
    set outList {}
    foreach elem [split $query &] {
        set index [string first = $elem]
        if { $index >= 0 } {
            set key [string range $elem 0 [expr {$index - 1}]]
            set value [string range $elem [expr {$index + 1}] end]
            # Only the first = is a separator; later ones belong to the
            # value and are escaped along with it.
            lappend outList "[encodeQuery $key]=[encodeQuery $value]"
        } else {
            lappend outList [encodeQuery $elem]
        }
    }
    return [join $outList &]
}
# ::ns::url::encodeFragment url
#
#   Escapes a fragment identifier (the text after the #). Only letters,
#   digits and the mark characters -_.!~*'() pass through; every other
#   character, reserved ones included, becomes %xx in lowercase hex with
#   at least two digits.
#
#   Returns the escaped string.
proc ::ns::url::encodeFragment {url} {
    set output ""
    # One anchored class. The original "^alphanum|mark$" bound ^ and $ to
    # different alternatives, which only worked because each element is a
    # single character.
    set safeRE {^[-0-9a-zA-Z_.!~*'()]$}

    # Should we throw an error for an invalid fragment?
    # Are we even sure what an invalid fragment is?
    foreach elem [split $url ""] {
        if { ![regexp -- $safeRE $elem] } {
            scan $elem %c code
            # %02x, not %x: character codes below 0x10 must still yield a
            # two-digit escape.
            set elem [format %%%02x $code]
        }
        append output $elem
    }
    return $output
}
# ::ns::url::encodeUrl url
#
#   Escapes a complete URL. The URL is split into its resource part
#   (scheme, authority and path), an optional ?query and an optional
#   #fragment; each part is escaped by encodeResource, encodeQueryString
#   and encodeFragment respectively, and the ? and # separators are kept.
#   An empty query or fragment still keeps its separator.
#
#   Returns the escaped URL.
#   Raises "Invalid url" if the URL cannot be split.
proc ::ns::url::encodeUrl {url} {
    # URI-splitting expression with one extra group wrapped around the
    # scheme/authority/path so that it can be captured as a whole.
    #   resource = scheme + authority + path
    #   query    = "?..." including the ?,  p7 = the text after the ?
    #   fragment = "#..." including the #,  p9 = the text after the #
    set urlRegexp {^((([^:/?#]+):)?(//([^/?#]*))?([^?#]*))(\?([^#]*))?(#(.*))?}
    if { ![regexp $urlRegexp $url -> \
            resource p1 p2 p3 p4 p5 query p7 fragment p9] } {
        error "Invalid url"
    }

    set result [::ns::url::encodeResource $resource]

    # regexp sets every match variable (to "" when its group did not
    # participate), so a length test is all that is needed.
    if { [string length $query] } {
        append result ?[::ns::url::encodeQueryString $p7]
    }
    if { [string length $fragment] } {
        append result #[::ns::url::encodeFragment $p9]
    }
    return $result
}
# ########################################
# File: url.test
package require tcltest
namespace import tcltest::*
source url.tcl
# ========================================
# Tests for ::ns::url::encodeQueryString, which splits a query string on &
# and escapes each key and value.
test encodeQueryString-1.1 {Break up the query string, and encode each section} -body {
    ::ns::url::encodeQueryString {arg1=value1&arg2=value2}
} -result "arg1=value1&arg2=value2"

test encodeQueryString-1.2 {With an equal that needs escaping} -body {
    ::ns::url::encodeQueryString {arg1=value11=value12&arg2=value2}
} -result "arg1=value11%3dvalue12&arg2=value2"
# ========================================
# Tests for ns_urlencode in its default (query) mode and in -url mode.
# Every test has a unique name, so -match/-skip filters and failure
# reports identify exactly one test.
test encode-1.1 {
    Basic encoding of url, in query mode
} -body {
    ns_urlencode http://www.aolserver.com/
} -result {http%3a%2f%2fwww%2eaolserver%2ecom%2f}

test encode-1.2 {
    A non-ASCII character is encoded as its utf-8 bytes by default
} -body {
    # 0xe9 is the iso8859-1 for lowercase accented 'e'
    set data [ns_urlencode [format %c 0xe9]]
} -result {%c3%a9}

test encode-2.1 {
    Test that query mode encodes the entire string as a query
} -body {
    ns_urlencode "http://www.aolserver.com/bob?this is a=query"
} -result {http%3a%2f%2fwww%2eaolserver%2ecom%2fbob%3fthis+is+a%3dquery}

test encode-3.1 {
    Mode url should not encode the url part with query rules
} -body {
    ns_urlencode -url http://www.aolserver.com/bob
} -result {http://www.aolserver.com/bob}

test encode-3.2 {
    Url mode, with a query component
} -body {
    ns_urlencode -url {http://www.aolserver.com/bob?arg1 2}
} -result {http://www.aolserver.com/bob?arg1+2}

test encode-3.3 {
    A space in the url part is escaped as %20, instead of a plus
} -body {
    ns_urlencode -url {http://www.aolserver.com/p1 p2}
} -result {http://www.aolserver.com/p1%20p2}

test encode-3.4 {
    The first equal sign of a query element need not be escaped
} -body {
    ns_urlencode -url "page?arg1=value1"
} -result {page?arg1=value1}

# Was a second "encode-3.4"; renumbered so the name is unique.
test encode-3.5 {
    The second equal sign in a query component needs to be escaped
} -body {
    ns_urlencode -url "page?arg1=value1=value2"
} -result {page?arg1=value1%3dvalue2}

test encode-3.6 {
    Multiple query elements get quoted ok
} -body {
    ns_urlencode -url "page?arg1=value1&arg2=value 2"
} -result {page?arg1=value1&arg2=value+2}
## Fragments
# Fragment handling in -url mode: the first # starts the fragment, and
# everything after it is escaped with the fragment rules.
test encode-5.1 {Http, with fragment} -body {
    ns_urlencode -url {http://a.server.com/page#label}
} -result "http://a.server.com/page#label"

test encode-5.2 {Http, with fragment and a # that needs escaping} -body {
    ns_urlencode -url {http://a.server.com/page#notlabel#label}
} -result "http://a.server.com/page#notlabel%23label"

test encode-5.3 {Anything after the # is encoded as a fragment identifier} -body {
    ns_urlencode -url {http://a.server.com/page#sign?arg1=a}
} -result {http://a.server.com/page#sign%3farg1%3da}
## Mode: html
# Used to encode a url for use in an html page
# Otherwise, performs the same encodings as -url
# The expected value must contain the &amp; entity: -html exists to make
# the encoded URL safe to embed in HTML.
test encode-6.1 {
    Html mode, the & should be replaced by &amp;
} -body {
    ns_urlencode -html "page?arg1=value1&arg2=value 2"
} -result {page?arg1=value1&amp;arg2=value+2}
## Special case tests
# Special cases in -url mode: unreserved marks, empty query and fragment
# parts, and user@host:port authorities.
test encode-7.1 {No encoding needed for ~} -body {
    ns_urlencode -url {ftp://a.server.com/~joe home%page}
} -result "ftp://a.server.com/~joe%20home%25page"

test encode-7.2 {Empty query should still include the ?} -body {
    ns_urlencode -url {http://a.server.com/page?}
} -result "http://a.server.com/page?"

test encode-7.3 {Empty fragment should still include the #} -body {
    ns_urlencode -url {http://a.server.com/page#}
} -result "http://a.server.com/page#"

test encode-7.4 {Handle username@host:port} -body {
    ns_urlencode -url {ftp://rseeger@a.server.com:80/page}
} -result "ftp://rseeger@a.server.com:80/page"
# ====================
# Error-path tests for ns_urlencode. Each test has its own name so that
# filters and failure reports identify exactly one test.
test encode-1.1E {
    Throw an error if no args are supplied
} -body {
    ns_urlencode
} -returnCodes error -match glob -result {wrong # args: should be *}

# Was a second "encode-1.1E"; renumbered so the name is unique.
test encode-1.2E {
    Throw an error if invalid option is specified
} -body {
    ns_urlencode -notvalid bob
} -returnCodes error -match glob -result {invalid option "-notvalid": should be *}
# ========================================
# Tests for ns_urldecode.
test decode-1.1 {Basic decoding of url} -body {
    ns_urldecode {http%3a%2f%2fwww%2eaolserver%2ecom%2f}
} -result "http://www.aolserver.com/"

# %23 is the code for #, and %2523 is that escape encoded once more; a
# single decode must stop at %23.
test decode-1.2 {Don't decode characters twice} -body {
    ns_urldecode "%2523"
} -result "%23"

test decode-2.1 {Decode with charset: utf-8 (the default)} -body {
    ns_urldecode -charset utf-8 {%c3%a9}
} -result [format %c 0xe9]

test decode-2.2 {Decode with charset: iso8859-1} -body {
    string length [ns_urldecode -charset iso8859-1 {%c3%a9}]
} -result 2