/**
 * Provides a method to encode any string into a URL-safe
 * form.
 * Non-ASCII characters are first encoded as sequences of
 * two or three bytes, using the UTF-8 algorithm, before being
 * encoded as %HH escapes.
 *
 * Created: 17 April 1997
 * Author: Bert Bos <bert@w3.org>
 *
 * URLUTF8Encoder: http://www.w3.org/International/URLUTF8Encoder.java
 *
 * Copyright © 1997 World Wide Web Consortium, (Massachusetts
 * Institute of Technology, European Research Consortium for
 * Informatics and Mathematics, Keio University). All Rights Reserved.
 * This work is distributed under the W3C® Software License [1] in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.
 *
 * [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
 */
23
|
package eu.etaxonomy.cdm.common;
|
24
|
|
25
|
/**
 * Percent-encodes strings for use in URLs and decodes them again, using
 * UTF-8 as the byte representation of non-ASCII characters.
 *
 * <p>Both methods are static and keep no state between calls.
 */
public class UrlUtf8Coder{

    /**
     * Lookup table of percent escapes: {@code hex[b]} is the lower-case
     * three-character escape {@code "%xx"} for the byte value {@code b}
     * (0..255).
     */
    final static String[] hex = {
        "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07",
        "%08", "%09", "%0a", "%0b", "%0c", "%0d", "%0e", "%0f",
        "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17",
        "%18", "%19", "%1a", "%1b", "%1c", "%1d", "%1e", "%1f",
        "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27",
        "%28", "%29", "%2a", "%2b", "%2c", "%2d", "%2e", "%2f",
        "%30", "%31", "%32", "%33", "%34", "%35", "%36", "%37",
        "%38", "%39", "%3a", "%3b", "%3c", "%3d", "%3e", "%3f",
        "%40", "%41", "%42", "%43", "%44", "%45", "%46", "%47",
        "%48", "%49", "%4a", "%4b", "%4c", "%4d", "%4e", "%4f",
        "%50", "%51", "%52", "%53", "%54", "%55", "%56", "%57",
        "%58", "%59", "%5a", "%5b", "%5c", "%5d", "%5e", "%5f",
        "%60", "%61", "%62", "%63", "%64", "%65", "%66", "%67",
        "%68", "%69", "%6a", "%6b", "%6c", "%6d", "%6e", "%6f",
        "%70", "%71", "%72", "%73", "%74", "%75", "%76", "%77",
        "%78", "%79", "%7a", "%7b", "%7c", "%7d", "%7e", "%7f",
        "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87",
        "%88", "%89", "%8a", "%8b", "%8c", "%8d", "%8e", "%8f",
        "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97",
        "%98", "%99", "%9a", "%9b", "%9c", "%9d", "%9e", "%9f",
        "%a0", "%a1", "%a2", "%a3", "%a4", "%a5", "%a6", "%a7",
        "%a8", "%a9", "%aa", "%ab", "%ac", "%ad", "%ae", "%af",
        "%b0", "%b1", "%b2", "%b3", "%b4", "%b5", "%b6", "%b7",
        "%b8", "%b9", "%ba", "%bb", "%bc", "%bd", "%be", "%bf",
        "%c0", "%c1", "%c2", "%c3", "%c4", "%c5", "%c6", "%c7",
        "%c8", "%c9", "%ca", "%cb", "%cc", "%cd", "%ce", "%cf",
        "%d0", "%d1", "%d2", "%d3", "%d4", "%d5", "%d6", "%d7",
        "%d8", "%d9", "%da", "%db", "%dc", "%dd", "%de", "%df",
        "%e0", "%e1", "%e2", "%e3", "%e4", "%e5", "%e6", "%e7",
        "%e8", "%e9", "%ea", "%eb", "%ec", "%ed", "%ee", "%ef",
        "%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
        "%f8", "%f9", "%fa", "%fb", "%fc", "%fd", "%fe", "%ff"
    };

    /**
     * Encode a string to a URL-safe form based on "x-www-form-urlencoded",
     * enhanced with the UTF-8-in-URL proposal. This is what happens:
     *
     * <ul>
     * <li><p>The ASCII characters 'a' through 'z', 'A' through 'Z',
     *        and '0' through '9' remain the same.
     *
     * <li><p>The unreserved characters - _ . ! ~ * ' ( ) remain the same.
     *
     * <li><p>The space character ' ' is converted into "%20" (not into a
     *        plus sign), for use according to
     *        http://www.doi.org/doi_handbook/2_Numbering.html#2.5.2.4
     *
     * <li><p>All other ASCII characters are converted into the
     *        3-character string "%xy", where xy is
     *        the two-digit hexadecimal representation of the character
     *        code
     *
     * <li><p>All non-ASCII characters are encoded in two steps: first
     *        to a sequence of 2 to 4 bytes, using the UTF-8 algorithm;
     *        secondly each of these bytes is encoded as "%xx".
     *        A valid surrogate pair is combined into one code point and
     *        encoded as a single 4-byte sequence; an unpaired surrogate
     *        is encoded on its own as a 3-byte sequence.
     * </ul>
     *
     * @param s The string to be encoded
     * @return The encoded string
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String encode(String s){

        int len = s.length();
        StringBuilder sbuf = new StringBuilder(len);
        for (int i = 0; i < len; i++) {
            int ch = s.charAt(i);
            if (isUnescaped(ch)) {                      // alphanumerics and unreserved marks
                sbuf.append((char) ch);
            } else if (ch <= 0x007f) {                  // other ASCII, including space -> %20
                sbuf.append(hex[ch]);
            } else if (ch <= 0x07FF) {                  // non-ASCII <= 0x7FF: 2 bytes
                sbuf.append(hex[0xc0 | (ch >> 6)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            } else if (Character.isHighSurrogate((char) ch)
                    && i + 1 < len
                    && Character.isLowSurrogate(s.charAt(i + 1))) {
                // Surrogate pair: encode the combined code point as ONE
                // 4-byte sequence; encoding each half separately would not
                // be valid UTF-8.
                int cp = Character.toCodePoint((char) ch, s.charAt(++i));
                sbuf.append(hex[0xf0 | (cp >> 18)]);
                sbuf.append(hex[0x80 | ((cp >> 12) & 0x3F)]);
                sbuf.append(hex[0x80 | ((cp >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (cp & 0x3F)]);
            } else {                                    // 0x7FF < ch <= 0xFFFF: 3 bytes
                sbuf.append(hex[0xe0 | (ch >> 12)]);
                sbuf.append(hex[0x80 | ((ch >> 6) & 0x3F)]);
                sbuf.append(hex[0x80 | (ch & 0x3F)]);
            }
        }
        return sbuf.toString();
    }

    /*
     * Created: 17 April 1997
     * Author: Bert Bos <bert@w3.org>
     *
     * unescape: http://www.w3.org/International/unescape.java
     *
     * Copyright © 1997 World Wide Web Consortium, (Massachusetts
     * Institute of Technology, European Research Consortium for
     * Informatics and Mathematics, Keio University). All Rights Reserved.
     * This work is distributed under the W3C® Software License [1] in the
     * hope that it will be useful, but WITHOUT ANY WARRANTY; without even
     * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     * PURPOSE.
     *
     * [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
     */
    /**
     * Decode a percent-encoded string whose escaped bytes represent UTF-8.
     *
     * <ul>
     * <li><p>"%xy" with two ASCII hexadecimal digits (either case) yields
     *        the byte 0xXY.
     * <li><p>A '%' that is not followed by two hexadecimal digits is copied
     *        to the output unchanged.
     * <li><p>'+' is decoded as a space.
     * <li><p>Literal characters above U+00FF cannot be bytes and are copied
     *        to the output unchanged. Literal characters in the range
     *        U+0080..U+00FF are treated as raw bytes of a UTF-8 sequence.
     * <li><p>The resulting bytes are decoded as UTF-8. Well-formedness is
     *        not checked; a completed sequence whose value is not a valid
     *        Unicode code point yields U+FFFD.
     * </ul>
     *
     * @param s The string to be decoded
     * @return The decoded string
     * @throws NullPointerException if {@code s} is {@code null}
     */
    public static String unescape(String s) {
        int len = s.length();
        StringBuilder sbuf = new StringBuilder(len);
        int sumb = 0;       // bits collected so far for an incomplete character
        int more = -1;      // continuation bytes still expected; -1 = none pending
        for (int i = 0; i < len; i++) {
            /* Get next byte b from URL segment s */
            int ch = s.charAt(i);
            int b;
            if (ch == '%') {
                boolean complete = i + 2 < len;
                int hb = complete ? hexValue(s.charAt(i + 1)) : -1;
                int lb = complete ? hexValue(s.charAt(i + 2)) : -1;
                if (hb < 0 || lb < 0) {
                    // Truncated or non-hex escape: keep the '%' literally
                    // and let the following characters be processed normally.
                    sbuf.append('%');
                    continue;
                }
                b = (hb << 4) | lb;
                i += 2;
            } else if (ch == '+') {
                b = ' ';
            } else if (ch > 0xFF) {
                // Not representable as a byte, so it cannot be part of a
                // UTF-8 sequence; pass it through as the character it is.
                sbuf.append((char) ch);
                continue;
            } else {
                b = ch;
            }
            /* Decode byte b as UTF-8, sumb collects incomplete chars */
            if ((b & 0xc0) == 0x80) {               // 10xxxxxx (continuation byte)
                sumb = (sumb << 6) | (b & 0x3f);    // Add 6 bits to sumb
                if (--more == 0) {
                    appendDecoded(sbuf, sumb);      // Character complete
                }
            } else if ((b & 0x80) == 0x00) {        // 0xxxxxxx (yields 7 bits)
                sbuf.append((char) b);              // Store in sbuf
            } else if ((b & 0xe0) == 0xc0) {        // 110xxxxx (yields 5 bits)
                sumb = b & 0x1f;
                more = 1;                           // Expect 1 more byte
            } else if ((b & 0xf0) == 0xe0) {        // 1110xxxx (yields 4 bits)
                sumb = b & 0x0f;
                more = 2;                           // Expect 2 more bytes
            } else if ((b & 0xf8) == 0xf0) {        // 11110xxx (yields 3 bits)
                sumb = b & 0x07;
                more = 3;                           // Expect 3 more bytes
            } else if ((b & 0xfc) == 0xf8) {        // 111110xx (yields 2 bits)
                sumb = b & 0x03;
                more = 4;                           // Expect 4 more bytes
            } else /*if ((b & 0xfe) == 0xfc)*/ {    // 1111110x (yields 1 bit)
                sumb = b & 0x01;
                more = 5;                           // Expect 5 more bytes
            }
            /* We don't test if the UTF-8 encoding is well-formed */
        }
        return sbuf.toString();
    }

    /** Tells whether an ASCII character is copied to the encoded output as-is. */
    private static boolean isUnescaped(int ch) {
        if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9')) {
            return true;
        }
        switch (ch) {
            case '-': case '_': case '.': case '!': case '~':
            case '*': case '\'': case '(': case ')':
                return true;
            default:
                return false;
        }
    }

    /** Returns the value (0..15) of an ASCII hexadecimal digit, or -1 if {@code c} is not one. */
    private static int hexValue(char c) {
        if ('0' <= c && c <= '9') {
            return c - '0';
        }
        if ('a' <= c && c <= 'f') {
            return 10 + (c - 'a');
        }
        if ('A' <= c && c <= 'F') {
            return 10 + (c - 'A');
        }
        return -1;
    }

    /**
     * Appends a decoded UTF-8 value: as one or two chars when it is a valid
     * code point, otherwise as the replacement character U+FFFD.
     */
    private static void appendDecoded(StringBuilder sbuf, int codePoint) {
        if (Character.isValidCodePoint(codePoint)) {
            sbuf.appendCodePoint(codePoint);
        } else {
            sbuf.append('\uFFFD');
        }
    }
}
|