1 /* 2 * Copyright (C) 2008 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.net; 18 19 import static com.google.common.base.Preconditions.checkNotNull; 20 21 import com.google.common.annotations.Beta; 22 import com.google.common.annotations.GwtCompatible; 23 import com.google.common.escape.UnicodeEscaper; 24 25 /** 26 * A {@code UnicodeEscaper} that escapes some set of Java characters using a 27 * UTF-8 based percent encoding scheme. The set of safe characters (those which 28 * remain unescaped) can be specified on construction. 29 * 30 * <p>This class is primarily used for creating URI escapers in {@link 31 * UrlEscapers} but can be used directly if required. While URI escapers impose 32 * specific semantics on which characters are considered 'safe', this class has 33 * a minimal set of restrictions. 34 * 35 * <p>When escaping a String, the following rules apply: 36 * <ul> 37 * <li>All specified safe characters remain unchanged. 38 * <li>If {@code plusForSpace} was specified, the space character " " is 39 * converted into a plus sign {@code "+"}. 40 * <li>All other characters are converted into one or more bytes using UTF-8 41 * encoding and each byte is then represented by the 3-character string 42 * "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation 43 * of the byte value. 44 * </ul> 45 * 46 * <p>For performance reasons the only currently supported character encoding of 47 * this class is UTF-8. 48 * 49 * <p><b>Note:</b> This escaper produces uppercase hexadecimal sequences. From 50 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br> 51 * <i>"URI producers and normalizers should use uppercase hexadecimal digits 52 * for all percent-encodings."</i> 53 * 54 * @author David Beaumont 55 * @since 15.0 56 */ 57 @Beta 58 @GwtCompatible 59 public final class PercentEscaper extends UnicodeEscaper { 60 61 // In some escapers spaces are escaped to '+' 62 private static final char[] PLUS_SIGN = { '+' }; 63 64 // Percent escapers output upper case hex digits (uri escapers require this). 65 private static final char[] UPPER_HEX_DIGITS = 66 "0123456789ABCDEF".toCharArray(); 67 68 /** 69 * If true we should convert space to the {@code +} character. 70 */ 71 private final boolean plusForSpace; 72 73 /** 74 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is 75 * true then {@code c} should remain unmodified in the output. If 76 * {@code c > safeOctets.length} then it should be escaped. 77 */ 78 private final boolean[] safeOctets; 79 80 /** 81 * Constructs a percent escaper with the specified safe characters and 82 * optional handling of the space character. 83 * 84 * <p>Not that it is allowed, but not necessarily desirable to specify {@code %} 85 * as a safe character. This has the effect of creating an escaper which has no 86 * well defined inverse but it can be useful when escaping additional characters. 87 * 88 * @param safeChars a non null string specifying additional safe characters 89 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and 90 * should not be specified here) 91 * @param plusForSpace true if ASCII space should be escaped to {@code +} 92 * rather than {@code %20} 93 * @throws IllegalArgumentException if any of the parameters were invalid 94 */ 95 public PercentEscaper(String safeChars, boolean plusForSpace) { 96 // TODO(user): Switch to static factory methods for creation now that class is final. 97 // TODO(user): Support escapers where alphanumeric chars are not safe. 98 checkNotNull(safeChars); // eager for GWT. 99 // Avoid any misunderstandings about the behavior of this escaper 100 if (safeChars.matches(".*[0-9A-Za-z].*")) { 101 throw new IllegalArgumentException( 102 "Alphanumeric characters are always 'safe' and should not be " + 103 "explicitly specified"); 104 } 105 safeChars += "abcdefghijklmnopqrstuvwxyz" + 106 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + 107 "0123456789"; 108 // Avoid ambiguous parameters. Safe characters are never modified so if 109 // space is a safe character then setting plusForSpace is meaningless. 110 if (plusForSpace && safeChars.contains(" ")) { 111 throw new IllegalArgumentException( 112 "plusForSpace cannot be specified when space is a 'safe' character"); 113 } 114 this.plusForSpace = plusForSpace; 115 this.safeOctets = createSafeOctets(safeChars); 116 } 117 118 /** 119 * Creates a boolean array with entries corresponding to the character values 120 * specified in safeChars set to true. The array is as small as is required to 121 * hold the given character information. 122 */ 123 private static boolean[] createSafeOctets(String safeChars) { 124 int maxChar = -1; 125 char[] safeCharArray = safeChars.toCharArray(); 126 for (char c : safeCharArray) { 127 maxChar = Math.max(c, maxChar); 128 } 129 boolean[] octets = new boolean[maxChar + 1]; 130 for (char c : safeCharArray) { 131 octets[c] = true; 132 } 133 return octets; 134 } 135 136 /* 137 * Overridden for performance. For unescaped strings this improved the 138 * performance of the uri escaper from ~760ns to ~400ns as measured by 139 * {@link CharEscapersBenchmark}. 140 */ 141 @Override 142 protected int nextEscapeIndex(CharSequence csq, int index, int end) { 143 checkNotNull(csq); 144 for (; index < end; index++) { 145 char c = csq.charAt(index); 146 if (c >= safeOctets.length || !safeOctets[c]) { 147 break; 148 } 149 } 150 return index; 151 } 152 153 /* 154 * Overridden for performance. For unescaped strings this improved the 155 * performance of the uri escaper from ~400ns to ~170ns as measured by 156 * {@link CharEscapersBenchmark}. 157 */ 158 @Override 159 public String escape(String s) { 160 checkNotNull(s); 161 int slen = s.length(); 162 for (int index = 0; index < slen; index++) { 163 char c = s.charAt(index); 164 if (c >= safeOctets.length || !safeOctets[c]) { 165 return escapeSlow(s, index); 166 } 167 } 168 return s; 169 } 170 171 /** 172 * Escapes the given Unicode code point in UTF-8. 173 */ 174 @Override 175 protected char[] escape(int cp) { 176 // We should never get negative values here but if we do it will throw an 177 // IndexOutOfBoundsException, so at least it will get spotted. 178 if (cp < safeOctets.length && safeOctets[cp]) { 179 return null; 180 } else if (cp == ' ' && plusForSpace) { 181 return PLUS_SIGN; 182 } else if (cp <= 0x7F) { 183 // Single byte UTF-8 characters 184 // Start with "%--" and fill in the blanks 185 char[] dest = new char[3]; 186 dest[0] = '%'; 187 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 188 dest[1] = UPPER_HEX_DIGITS[cp >>> 4]; 189 return dest; 190 } else if (cp <= 0x7ff) { 191 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff] 192 // Start with "%--%--" and fill in the blanks 193 char[] dest = new char[6]; 194 dest[0] = '%'; 195 dest[3] = '%'; 196 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 197 cp >>>= 4; 198 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 199 cp >>>= 2; 200 dest[2] = UPPER_HEX_DIGITS[cp & 0xF]; 201 cp >>>= 4; 202 dest[1] = UPPER_HEX_DIGITS[0xC | cp]; 203 return dest; 204 } else if (cp <= 0xffff) { 205 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff] 206 // Start with "%E-%--%--" and fill in the blanks 207 char[] dest = new char[9]; 208 dest[0] = '%'; 209 dest[1] = 'E'; 210 dest[3] = '%'; 211 dest[6] = '%'; 212 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 213 cp >>>= 4; 214 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 215 cp >>>= 2; 216 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 217 cp >>>= 4; 218 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 219 cp >>>= 2; 220 dest[2] = UPPER_HEX_DIGITS[cp]; 221 return dest; 222 } else if (cp <= 0x10ffff) { 223 char[] dest = new char[12]; 224 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff] 225 // Start with "%F-%--%--%--" and fill in the blanks 226 dest[0] = '%'; 227 dest[1] = 'F'; 228 dest[3] = '%'; 229 dest[6] = '%'; 230 dest[9] = '%'; 231 dest[11] = UPPER_HEX_DIGITS[cp & 0xF]; 232 cp >>>= 4; 233 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 234 cp >>>= 2; 235 dest[8] = UPPER_HEX_DIGITS[cp & 0xF]; 236 cp >>>= 4; 237 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 238 cp >>>= 2; 239 dest[5] = UPPER_HEX_DIGITS[cp & 0xF]; 240 cp >>>= 4; 241 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)]; 242 cp >>>= 2; 243 dest[2] = UPPER_HEX_DIGITS[cp & 0x7]; 244 return dest; 245 } else { 246 // If this ever happens it is due to bug in UnicodeEscaper, not bad input. 247 throw new IllegalArgumentException( 248 "Invalid unicode character value " + cp); 249 } 250 } 251 }