1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.google.common.base;
18
19 import com.google.common.annotations.GwtCompatible;
20 import com.google.common.annotations.GwtIncompatible;
21
22 import junit.framework.TestCase;
23
24 import java.util.Arrays;
25 import java.util.HashMap;
26 import java.util.Random;
27
28
29
30
31
32
33
34
35 @GwtCompatible(emulated = true)
36 public class Utf8Test extends TestCase {
37 public void testEncodedLength_validStrings() {
38 assertEquals(0, Utf8.encodedLength(""));
39 assertEquals(11, Utf8.encodedLength("Hello world"));
40 assertEquals(8, Utf8.encodedLength("Résumé"));
41 assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
42 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
43 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
44 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
45 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
46 + "哈都拕人翻譯做好多話。"));
47
48 assertEquals(4, Utf8.encodedLength(
49 newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
50 }
51
52 @GwtIncompatible("StringBuilder.appendCodePoint()")
53 public void testEncodedLength_validStrings2() {
54 HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
55 utf8Lengths.put(0x00, 1);
56 utf8Lengths.put(0x7f, 1);
57 utf8Lengths.put(0x80, 2);
58 utf8Lengths.put(0x7ff, 2);
59 utf8Lengths.put(0x800, 3);
60 utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
61 utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
62 utf8Lengths.put(Character.MAX_CODE_POINT, 4);
63
64 Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
65 StringBuilder sb = new StringBuilder();
66 Random rnd = new Random();
67 for (int trial = 0; trial < 100; trial++) {
68 sb.setLength(0);
69 int utf8Length = 0;
70 for (int i = 0; i < 6; i++) {
71 Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
72 sb.appendCodePoint(randomCodePoint);
73 utf8Length += utf8Lengths.get(randomCodePoint);
74 if (utf8Length != Utf8.encodedLength(sb)) {
75 StringBuilder repro = new StringBuilder();
76 for (int j = 0; j < sb.length(); j++) {
77 repro.append(" " + (int) sb.charAt(j));
78 }
79 assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
80 }
81 }
82 }
83 }
84
85 public void testEncodedLength_invalidStrings() {
86 testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
87 testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
88 testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
89 testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
90 testEncodedLengthFails(
91 newString(
92 Character.MIN_HIGH_SURROGATE,
93 Character.MIN_HIGH_SURROGATE), 0);
94 }
95
96 private static void testEncodedLengthFails(String invalidString,
97 int invalidCodePointIndex) {
98 try {
99 Utf8.encodedLength(invalidString);
100 fail();
101 } catch (IllegalArgumentException expected) {
102 assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
103 expected.getMessage());
104 }
105 }
106
107
108 private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
109 0x007f - 0x0000 + 1;
110
111
112 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
113 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
114
115
116 private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
117 0x07FF - 0x0080 + 1;
118
119
120 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
121
122 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
123
124 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
125
126
127 private static final long THREE_BYTE_SURROGATES = 2 * 1024;
128
129
130 private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
132
133
134 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
135
136 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
137
138 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
139 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
140
141 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
142
143
144 private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
145 0x10FFFF - 0x10000L + 1;
146
147
148 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
149
150 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
151
152 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
153 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
154
155 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
156
157 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
158 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
159 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
160
161 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
162
163
164 @GwtIncompatible("java.nio.charset.Charset")
165 public void testIsWellFormed_1Byte() {
166 testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
167 }
168
169
170 @GwtIncompatible("java.nio.charset.Charset")
171 public void testIsWellFormed_2Bytes() {
172 testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
173 }
174
175
176 @GwtIncompatible("java.nio.charset.Charset")
177 public void testIsWellFormed_3Bytes() {
178 testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
179 }
180
181
182
183
184
185
186 public void testIsWellFormed_4BytesSamples() {
187
188 assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
189
190 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
191 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
192
193 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
194 assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
195 }
196
197
198 public void testSomeSequences() {
199
200 assertWellFormed();
201
202 assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F);
203
204 assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2);
205
206 assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63);
207
208
209 assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
210
211
212 assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
213 0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
214 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
215
216 assertNotWellFormed(-1, 0, -1, 0);
217 }
218
219 public void testShardsHaveExpectedRoundTrippables() {
220
221 long actual = 0;
222 for (long expected : generateFourByteShardsExpectedRunnables()) {
223 actual += expected;
224 }
225 assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
226 }
227
228 private String newString(char... chars) {
229 return new String(chars);
230 }
231
232 private byte[] toByteArray(int... bytes) {
233 byte[] realBytes = new byte[bytes.length];
234 for (int i = 0; i < bytes.length; i++) {
235 realBytes[i] = (byte) bytes[i];
236 }
237 return realBytes;
238 }
239
240 private void assertWellFormed(int... bytes) {
241 assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
242 }
243
244 private void assertNotWellFormed(int... bytes) {
245 assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
246 }
247
248 private static long[] generateFourByteShardsExpectedRunnables() {
249 long[] expected = new long[128];
250
251 for (int i = 0; i <= 63; i++) {
252 expected[i] = 5300224;
253 }
254
255 for (int i = 97; i <= 111; i++) {
256 expected[i] = 2342912;
257 }
258
259 for (int i = 113; i <= 117; i++) {
260 expected[i] = 1048576;
261 }
262
263 expected[112] = 786432;
264 expected[118] = 786432;
265 expected[119] = 1048576;
266 expected[120] = 458752;
267 expected[121] = 524288;
268 expected[122] = 65536;
269
270 return expected;
271 }
272
273
274
275
276
277
278
279
280 @GwtIncompatible("java.nio.charset.Charset")
281 private static void testBytes(int numBytes, long expectedCount) {
282 testBytes(numBytes, expectedCount, 0, -1);
283 }
284
285
286
287
288
289
290
291
292
293
294
295
296 @GwtIncompatible("java.nio.charset.Charset")
297 private static void testBytes(int numBytes, long expectedCount, long start,
298 long lim) {
299 byte[] bytes = new byte[numBytes];
300 if (lim == -1) {
301 lim = 1L << (numBytes * 8);
302 }
303 long countRoundTripped = 0;
304 for (long byteChar = start; byteChar < lim; byteChar++) {
305 long tmpByteChar = byteChar;
306 for (int i = 0; i < numBytes; i++) {
307 bytes[bytes.length - i - 1] = (byte) tmpByteChar;
308 tmpByteChar = tmpByteChar >> 8;
309 }
310 boolean isRoundTrippable = Utf8.isWellFormed(bytes);
311 assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
312 String s = new String(bytes, Charsets.UTF_8);
313 byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
314 boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
315
316 if (bytesEqual != isRoundTrippable) {
317 fail();
318 }
319 if (isRoundTrippable) {
320 countRoundTripped++;
321 }
322 }
323 assertEquals(expectedCount, countRoundTripped);
324 }
325 }