View Javadoc
1   /*
2    * Copyright (C) 2013 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  package com.google.common.base;
16  
17  import static com.google.common.base.Preconditions.checkPositionIndexes;
18  
19  import com.google.common.annotations.Beta;
20  import com.google.common.annotations.GwtCompatible;
21  
22  /**
23   * Low-level, high-performance utility methods related to the {@linkplain Charsets#UTF_8 UTF-8}
24   * character encoding. UTF-8 is defined in section D92 of
25   * <a href="http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf">The Unicode Standard Core
26   * Specification, Chapter 3</a>.
27   *
28   * <p>The variant of UTF-8 implemented by this class is the restricted definition of UTF-8
29   * introduced in Unicode 3.1. One implication of this is that it rejects
30   * <a href="http://www.unicode.org/versions/corrigendum1.html">"non-shortest form"</a> byte
31   * sequences, even though the JDK decoder may accept them.
32   *
33   * @author Martin Buchholz
34   * @author Clément Roux
35   * @since 16.0
36   */
37  @Beta
38  @GwtCompatible
39  public final class Utf8 {
40    /**
41     * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
42     * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
43     * both time and space.
44     *
45     * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
46     *     surrogates)
47     */
48    public static int encodedLength(CharSequence sequence) {
49      // Warning to maintainers: this implementation is highly optimized.
50      int utf16Length = sequence.length();
51      int utf8Length = utf16Length;
52      int i = 0;
53  
54      // This loop optimizes for pure ASCII.
55      while (i < utf16Length && sequence.charAt(i) < 0x80) {
56        i++;
57      }
58  
59      // This loop optimizes for chars less than 0x800.
60      for (; i < utf16Length; i++) {
61        char c = sequence.charAt(i);
62        if (c < 0x800) {
63          utf8Length += ((0x7f - c) >>> 31);  // branch free!
64        } else {
65          utf8Length += encodedLengthGeneral(sequence, i);
66          break;
67        }
68      }
69  
70      if (utf8Length < utf16Length) {
71        // Necessary and sufficient condition for overflow because of maximum 3x expansion
72        throw new IllegalArgumentException("UTF-8 length does not fit in int: "
73                                           + (utf8Length + (1L << 32)));
74      }
75      return utf8Length;
76    }
77  
78    private static int encodedLengthGeneral(CharSequence sequence, int start) {
79      int utf16Length = sequence.length();
80      int utf8Length = 0;
81      for (int i = start; i < utf16Length; i++) {
82        char c = sequence.charAt(i);
83        if (c < 0x800) {
84          utf8Length += (0x7f - c) >>> 31; // branch free!
85        } else {
86          utf8Length += 2;
87          // jdk7+: if (Character.isSurrogate(c)) {
88          if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
89            // Check that we have a well-formed surrogate pair.
90            int cp = Character.codePointAt(sequence, i);
91            if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
92              throw new IllegalArgumentException("Unpaired surrogate at index " + i);
93            }
94            i++;
95          }
96        }
97      }
98      return utf8Length;
99    }
100 
101   /**
102    * Returns {@code true} if {@code bytes} is a <i>well-formed</i> UTF-8 byte sequence according to
103    * Unicode 6.0. Note that this is a stronger criterion than simply whether the bytes can be
104    * decoded. For example, some versions of the JDK decoder will accept "non-shortest form" byte
105    * sequences, but encoding never reproduces these. Such byte sequences are <i>not</i> considered
106    * well-formed.
107    *
108    * <p>This method returns {@code true} if and only if {@code Arrays.equals(bytes, new
109    * String(bytes, UTF_8).getBytes(UTF_8))} does, but is more efficient in both time and space.
110    */
111   public static boolean isWellFormed(byte[] bytes) {
112     return isWellFormed(bytes, 0, bytes.length);
113   }
114 
115   /**
116    * Returns whether the given byte array slice is a well-formed UTF-8 byte sequence, as defined by
117    * {@link #isWellFormed(byte[])}. Note that this can be false even when {@code
118    * isWellFormed(bytes)} is true.
119    *
120    * @param bytes the input buffer
121    * @param off the offset in the buffer of the first byte to read
122    * @param len the number of bytes to read from the buffer
123    */
124   public static boolean isWellFormed(byte[] bytes, int off, int len) {
125     int end = off + len;
126     checkPositionIndexes(off, end, bytes.length);
127     // Look for the first non-ASCII character.
128     for (int i = off; i < end; i++) {
129       if (bytes[i] < 0) {
130         return isWellFormedSlowPath(bytes, i, end);
131       }
132     }
133     return true;
134   }
135 
136   private static boolean isWellFormedSlowPath(byte[] bytes, int off, int end) {
137     int index = off;
138     while (true) {
139       int byte1;
140 
141       // Optimize for interior runs of ASCII bytes.
142       do {
143         if (index >= end) {
144           return true;
145         }
146       } while ((byte1 = bytes[index++]) >= 0);
147 
148       if (byte1 < (byte) 0xE0) {
149         // Two-byte form.
150         if (index == end) {
151           return false;
152         }
153         // Simultaneously check for illegal trailing-byte in leading position
154         // and overlong 2-byte form.
155         if (byte1 < (byte) 0xC2 || bytes[index++] > (byte) 0xBF) {
156           return false;
157         }
158       } else if (byte1 < (byte) 0xF0) {
159         // Three-byte form.
160         if (index + 1 >= end) {
161           return false;
162         }
163         int byte2 = bytes[index++];
164         if (byte2 > (byte) 0xBF
165             // Overlong? 5 most significant bits must not all be zero.
166             || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
167             // Check for illegal surrogate codepoints.
168             || (byte1 == (byte) 0xED && (byte) 0xA0 <= byte2)
169             // Third byte trailing-byte test.
170             || bytes[index++] > (byte) 0xBF) {
171           return false;
172         }
173       } else {
174         // Four-byte form.
175         if (index + 2 >= end) {
176           return false;
177         }
178         int byte2 = bytes[index++];
179         if (byte2 > (byte) 0xBF
180             // Check that 1 <= plane <= 16. Tricky optimized form of:
181             // if (byte1 > (byte) 0xF4
182             //     || byte1 == (byte) 0xF0 && byte2 < (byte) 0x90
183             //     || byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
184             || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
185             // Third byte trailing-byte test
186             || bytes[index++] > (byte) 0xBF
187             // Fourth byte trailing-byte test
188             || bytes[index++] > (byte) 0xBF) {
189           return false;
190         }
191       }
192     }
193   }
194 
195   private Utf8() {}
196 }