View Javadoc
1   /*
2    * Copyright (C) 2008 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.net;
18  
19  import static com.google.common.base.Preconditions.checkNotNull;
20  
21  import com.google.common.annotations.Beta;
22  import com.google.common.annotations.GwtCompatible;
23  import com.google.common.escape.UnicodeEscaper;
24  
25  /**
26   * A {@code UnicodeEscaper} that escapes some set of Java characters using a
27   * UTF-8 based percent encoding scheme. The set of safe characters (those which
28   * remain unescaped) can be specified on construction.
29   *
30   * <p>This class is primarily used for creating URI escapers in {@link
31   * UrlEscapers} but can be used directly if required. While URI escapers impose
32   * specific semantics on which characters are considered 'safe', this class has
33   * a minimal set of restrictions.
34   *
35   * <p>When escaping a String, the following rules apply:
36   * <ul>
37   * <li>All specified safe characters remain unchanged.
38   * <li>If {@code plusForSpace} was specified, the space character " " is
39   *     converted into a plus sign {@code "+"}.
40   * <li>All other characters are converted into one or more bytes using UTF-8
41   *     encoding and each byte is then represented by the 3-character string
42   *     "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation
43   *     of the byte value.
44   * </ul>
45   *
46   * <p>For performance reasons the only currently supported character encoding of
47   * this class is UTF-8.
48   *
49   * <p><b>Note:</b> This escaper produces uppercase hexadecimal sequences. From
50   * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
51   * <i>"URI producers and normalizers should use uppercase hexadecimal digits
52   * for all percent-encodings."</i>
53   *
54   * @author David Beaumont
55   * @since 15.0
56   */
57  @Beta
58  @GwtCompatible
59  public final class PercentEscaper extends UnicodeEscaper {
60  
61    // In some escapers spaces are escaped to '+'
62    private static final char[] PLUS_SIGN = { '+' };
63  
64    // Percent escapers output upper case hex digits (uri escapers require this).
65    private static final char[] UPPER_HEX_DIGITS =
66        "0123456789ABCDEF".toCharArray();
67  
68    /**
69     * If true we should convert space to the {@code +} character.
70     */
71    private final boolean plusForSpace;
72  
73    /**
74     * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
75     * true then {@code c} should remain unmodified in the output. If
76     * {@code c > safeOctets.length} then it should be escaped.
77     */
78    private final boolean[] safeOctets;
79  
80    /**
81     * Constructs a percent escaper with the specified safe characters and
82     * optional handling of the space character.
83     *
84     * <p>Not that it is allowed, but not necessarily desirable to specify {@code %}
85     * as a safe character. This has the effect of creating an escaper which has no
86     * well defined inverse but it can be useful when escaping additional characters.
87     *
88     * @param safeChars a non null string specifying additional safe characters
89     *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
90     *        should not be specified here)
91     * @param plusForSpace true if ASCII space should be escaped to {@code +}
92     *        rather than {@code %20}
93     * @throws IllegalArgumentException if any of the parameters were invalid
94     */
95    public PercentEscaper(String safeChars, boolean plusForSpace) {
96      // TODO(user): Switch to static factory methods for creation now that class is final.
97      // TODO(user): Support escapers where alphanumeric chars are not safe.
98      checkNotNull(safeChars);  // eager for GWT.
99      // Avoid any misunderstandings about the behavior of this escaper
100     if (safeChars.matches(".*[0-9A-Za-z].*")) {
101       throw new IllegalArgumentException(
102           "Alphanumeric characters are always 'safe' and should not be " +
103           "explicitly specified");
104     }
105     safeChars += "abcdefghijklmnopqrstuvwxyz" +
106                  "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
107                  "0123456789";
108     // Avoid ambiguous parameters. Safe characters are never modified so if
109     // space is a safe character then setting plusForSpace is meaningless.
110     if (plusForSpace && safeChars.contains(" ")) {
111       throw new IllegalArgumentException(
112           "plusForSpace cannot be specified when space is a 'safe' character");
113     }
114     this.plusForSpace = plusForSpace;
115     this.safeOctets = createSafeOctets(safeChars);
116   }
117 
118   /**
119    * Creates a boolean array with entries corresponding to the character values
120    * specified in safeChars set to true. The array is as small as is required to
121    * hold the given character information.
122    */
123   private static boolean[] createSafeOctets(String safeChars) {
124     int maxChar = -1;
125     char[] safeCharArray = safeChars.toCharArray();
126     for (char c : safeCharArray) {
127       maxChar = Math.max(c, maxChar);
128     }
129     boolean[] octets = new boolean[maxChar + 1];
130     for (char c : safeCharArray) {
131       octets[c] = true;
132     }
133     return octets;
134   }
135 
136   /*
137    * Overridden for performance. For unescaped strings this improved the
138    * performance of the uri escaper from ~760ns to ~400ns as measured by
139    * {@link CharEscapersBenchmark}.
140    */
141   @Override
142   protected int nextEscapeIndex(CharSequence csq, int index, int end) {
143     checkNotNull(csq);
144     for (; index < end; index++) {
145       char c = csq.charAt(index);
146       if (c >= safeOctets.length || !safeOctets[c]) {
147         break;
148       }
149     }
150     return index;
151   }
152 
153   /*
154    * Overridden for performance. For unescaped strings this improved the
155    * performance of the uri escaper from ~400ns to ~170ns as measured by
156    * {@link CharEscapersBenchmark}.
157    */
158   @Override
159   public String escape(String s) {
160     checkNotNull(s);
161     int slen = s.length();
162     for (int index = 0; index < slen; index++) {
163       char c = s.charAt(index);
164       if (c >= safeOctets.length || !safeOctets[c]) {
165         return escapeSlow(s, index);
166       }
167     }
168     return s;
169   }
170 
171   /**
172    * Escapes the given Unicode code point in UTF-8.
173    */
174   @Override
175   protected char[] escape(int cp) {
176     // We should never get negative values here but if we do it will throw an
177     // IndexOutOfBoundsException, so at least it will get spotted.
178     if (cp < safeOctets.length && safeOctets[cp]) {
179       return null;
180     } else if (cp == ' ' && plusForSpace) {
181       return PLUS_SIGN;
182     } else if (cp <= 0x7F) {
183       // Single byte UTF-8 characters
184       // Start with "%--" and fill in the blanks
185       char[] dest = new char[3];
186       dest[0] = '%';
187       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
188       dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
189       return dest;
190     } else if (cp <= 0x7ff) {
191       // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
192       // Start with "%--%--" and fill in the blanks
193       char[] dest = new char[6];
194       dest[0] = '%';
195       dest[3] = '%';
196       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
197       cp >>>= 4;
198       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
199       cp >>>= 2;
200       dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
201       cp >>>= 4;
202       dest[1] = UPPER_HEX_DIGITS[0xC | cp];
203       return dest;
204     } else if (cp <= 0xffff) {
205       // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
206       // Start with "%E-%--%--" and fill in the blanks
207       char[] dest = new char[9];
208       dest[0] = '%';
209       dest[1] = 'E';
210       dest[3] = '%';
211       dest[6] = '%';
212       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
213       cp >>>= 4;
214       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
215       cp >>>= 2;
216       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
217       cp >>>= 4;
218       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
219       cp >>>= 2;
220       dest[2] = UPPER_HEX_DIGITS[cp];
221       return dest;
222     } else if (cp <= 0x10ffff) {
223       char[] dest = new char[12];
224       // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
225       // Start with "%F-%--%--%--" and fill in the blanks
226       dest[0] = '%';
227       dest[1] = 'F';
228       dest[3] = '%';
229       dest[6] = '%';
230       dest[9] = '%';
231       dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
232       cp >>>= 4;
233       dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
234       cp >>>= 2;
235       dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
236       cp >>>= 4;
237       dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
238       cp >>>= 2;
239       dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
240       cp >>>= 4;
241       dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
242       cp >>>= 2;
243       dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
244       return dest;
245     } else {
246       // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
247       throw new IllegalArgumentException(
248           "Invalid unicode character value " + cp);
249     }
250   }
251 }