View Javadoc
1   /*
2    * Copyright (C) 2011 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.base;
18  
19  import com.google.caliper.BeforeExperiment;
20  import com.google.caliper.Benchmark;
21  import com.google.caliper.Param;
22  
23  import java.util.Random;
24  
25  /**
26   * Benchmark for the {@link Utf8} class.
27   *
28   *
29   * @author Martin Buchholz
30   */
31  public class Utf8Benchmark {
32  
33    static class MaxCodePoint {
34      final int value;
35  
36      /**
37       * Convert the input string to a code point.  Accepts regular
38       * decimal numerals, hex strings, and some symbolic names
39       * meaningful to humans.
40       */
41      private static int decode(String userFriendly) {
42        try {
43          return Integer.decode(userFriendly);
44        } catch (NumberFormatException ignored) {
45          if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
46            // 1-byte UTF-8 sequences - "American" ASCII text
47            return 0x80;
48          } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
49            // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
50            // sequences - "Western European" text
51            return 0x90;
52          } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
53            // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
54            return 0x100;
55          } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
56            // Mostly 2-byte UTF-8 sequences - "European" text
57            return 0x800;
58          } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
59            // Mostly 3-byte UTF-8 sequences - "Asian" text
60            return Character.MIN_SUPPLEMENTARY_CODE_POINT;
61          } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
62            // Mostly 4-byte UTF-8 sequences - "rare exotic" text
63            return Character.MAX_CODE_POINT;
64          } else {
65            throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
66          }
67        }
68      }
69  
70      public static MaxCodePoint valueOf(String userFriendly) {
71        return new MaxCodePoint(userFriendly);
72      }
73  
74      public MaxCodePoint(String userFriendly) {
75        value = decode(userFriendly);
76      }
77    }
78  
79    /**
80     * The default values of maxCodePoint below provide pretty good
81     * performance models of different kinds of common human text.
82     * @see MaxCodePoint#decode
83     */
84    @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
85  
86    @Param({"100"}) int stringCount;
87    @Param({"16384"}) int charCount;
88    private CharSequence[] seqs;  // actually, all StringBuilders
89    private String[] strings;
90    private byte[][] byteArrays;
91  
92    /**
93     * Compute arrays of valid unicode text, and store it in 3 forms:
94     * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
95     * make it a little harder for the JVM).
96     */
97    @BeforeExperiment void setUp() {
98      final long seed = 99;
99      final Random rnd = new Random(seed);
100     seqs = new CharSequence[stringCount];
101     strings = new String[stringCount];
102     byteArrays = new byte[stringCount][];
103     for (int i = 0; i < stringCount; i++) {
104       StringBuilder sb = new StringBuilder();
105       for (int j = 0; j < charCount; j++) {
106         int codePoint;
107         // discard illegal surrogate "codepoints"
108         do {
109           codePoint = rnd.nextInt(maxCodePoint.value);
110         } while (isSurrogate(codePoint));
111         sb.appendCodePoint(codePoint);
112       }
113       seqs[i] = sb;
114       strings[i] = sb.toString();
115       byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
116     }
117   }
118 
119   /**
120    * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
121    * containing pseudo-randomly-generated codePoints less than {@code
122    * maxCodePoint}.  A constant seed is used, so separate runs perform
123    * identical computations.
124    */
125   @Benchmark void isWellFormed(int reps) {
126     for (int i = 0; i < reps; i++) {
127       for (byte[] byteArray : byteArrays) {
128         if (!Utf8.isWellFormed(byteArray)) {
129           throw new Error("unexpected invalid UTF-8");
130         }
131       }
132     }
133   }
134 
135   /**
136    * Benchmarks {@link Utf8#length} on valid strings containing
137    * pseudo-randomly-generated codePoints less than {@code
138    * maxCodePoint}.  A constant seed is used, so separate runs perform
139    * identical computations.
140    */
141   @Benchmark void lengthOfString(int reps) {
142     for (int i = 0; i < reps; i++) {
143       for (String string : strings) {
144         if (Utf8.encodedLength(string) == 1237482374) {
145           throw new Error("Unlikely! We're just defeating the optimizer!");
146         }
147       }
148     }
149   }
150 
151   /**
152    * Benchmarks {@link Utf8#length} on valid StringBuilders containing
153    * pseudo-randomly-generated codePoints less than {@code
154    * maxCodePoint}.  A constant seed is used, so separate runs perform
155    * identical computations.
156    */
157   @Benchmark void lengthOfStringBuilder(int reps) {
158     for (int i = 0; i < reps; i++) {
159       for (CharSequence seq : seqs) {
160         if (Utf8.encodedLength(seq) == 1237482374) {
161           throw new Error("Unlikely! We're just defeating the optimizer!");
162         }
163       }
164     }
165   }
166 
167   /** Character.isSurrogate was added in Java SE 7. */
168   private boolean isSurrogate(int c) {
169     return (Character.MIN_HIGH_SURROGATE <= c &&
170             c <= Character.MAX_LOW_SURROGATE);
171   }
172 }