1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package com.google.common.base;
18
19 import com.google.caliper.BeforeExperiment;
20 import com.google.caliper.Benchmark;
21 import com.google.caliper.Param;
22
23 import java.util.Random;
24
25
26
27
28
29
30
31 public class Utf8Benchmark {
32
33 static class MaxCodePoint {
34 final int value;
35
36
37
38
39
40
41 private static int decode(String userFriendly) {
42 try {
43 return Integer.decode(userFriendly);
44 } catch (NumberFormatException ignored) {
45 if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
46
47 return 0x80;
48 } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
49
50
51 return 0x90;
52 } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
53
54 return 0x100;
55 } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
56
57 return 0x800;
58 } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
59
60 return Character.MIN_SUPPLEMENTARY_CODE_POINT;
61 } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
62
63 return Character.MAX_CODE_POINT;
64 } else {
65 throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
66 }
67 }
68 }
69
70 public static MaxCodePoint valueOf(String userFriendly) {
71 return new MaxCodePoint(userFriendly);
72 }
73
74 public MaxCodePoint(String userFriendly) {
75 value = decode(userFriendly);
76 }
77 }
78
79
80
81
82
83
84 @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
85
86 @Param({"100"}) int stringCount;
87 @Param({"16384"}) int charCount;
88 private CharSequence[] seqs;
89 private String[] strings;
90 private byte[][] byteArrays;
91
92
93
94
95
96
97 @BeforeExperiment void setUp() {
98 final long seed = 99;
99 final Random rnd = new Random(seed);
100 seqs = new CharSequence[stringCount];
101 strings = new String[stringCount];
102 byteArrays = new byte[stringCount][];
103 for (int i = 0; i < stringCount; i++) {
104 StringBuilder sb = new StringBuilder();
105 for (int j = 0; j < charCount; j++) {
106 int codePoint;
107
108 do {
109 codePoint = rnd.nextInt(maxCodePoint.value);
110 } while (isSurrogate(codePoint));
111 sb.appendCodePoint(codePoint);
112 }
113 seqs[i] = sb;
114 strings[i] = sb.toString();
115 byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
116 }
117 }
118
119
120
121
122
123
124
125 @Benchmark void isWellFormed(int reps) {
126 for (int i = 0; i < reps; i++) {
127 for (byte[] byteArray : byteArrays) {
128 if (!Utf8.isWellFormed(byteArray)) {
129 throw new Error("unexpected invalid UTF-8");
130 }
131 }
132 }
133 }
134
135
136
137
138
139
140
141 @Benchmark void lengthOfString(int reps) {
142 for (int i = 0; i < reps; i++) {
143 for (String string : strings) {
144 if (Utf8.encodedLength(string) == 1237482374) {
145 throw new Error("Unlikely! We're just defeating the optimizer!");
146 }
147 }
148 }
149 }
150
151
152
153
154
155
156
157 @Benchmark void lengthOfStringBuilder(int reps) {
158 for (int i = 0; i < reps; i++) {
159 for (CharSequence seq : seqs) {
160 if (Utf8.encodedLength(seq) == 1237482374) {
161 throw new Error("Unlikely! We're just defeating the optimizer!");
162 }
163 }
164 }
165 }
166
167
168 private boolean isSurrogate(int c) {
169 return (Character.MIN_HIGH_SURROGATE <= c &&
170 c <= Character.MAX_LOW_SURROGATE);
171 }
172 }