View Javadoc
1   /*
2    * Copyright (C) 2009 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    * http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package com.google.common.net;
18  
19  import com.google.common.annotations.Beta;
20  import com.google.common.annotations.GwtCompatible;
21  import com.google.common.escape.Escaper;
22  
23  /**
24   * {@code Escaper} instances suitable for strings to be included in particular
25   * sections of URLs.
26   *
27   * <p>If the resulting URLs are inserted into an HTML or XML document, they will
28   * require additional escaping with {@link com.google.common.html.HtmlEscapers}
29   * or {@link com.google.common.xml.XmlEscapers}.
30   *
31   *
32   * @author David Beaumont
33   * @author Chris Povirk
34   * @since 15.0
35   */
36  @Beta
37  @GwtCompatible
38  public final class UrlEscapers {
39    private UrlEscapers() {}
40  
41    // For each xxxEscaper() method, please add links to external reference pages
42    // that are considered authoritative for the behavior of that escaper.
43  
44    static final String URL_FORM_PARAMETER_OTHER_SAFE_CHARS = "-_.*";
45  
46    static final String URL_PATH_OTHER_SAFE_CHARS_LACKING_PLUS =
47        "-._~" +        // Unreserved characters.
48        "!$'()*,;&=" +  // The subdelim characters (excluding '+').
49        "@:";           // The gendelim characters permitted in paths.
50  
51    /**
52     * Returns an {@link Escaper} instance that escapes strings so they can be
53     * safely included in <a href="http://goo.gl/OQEc8">URL form parameter names
54     * and values</a>. Escaping is performed with the UTF-8 character encoding.
55     * The caller is responsible for <a href="http://goo.gl/i20ms">replacing any
56     * unpaired carriage return or line feed characters with a CR+LF pair</a> on
57     * any non-file inputs before escaping them with this escaper.
58     *
59     * <p>When escaping a String, the following rules apply:
60     * <ul>
61     * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
62     *     through "9" remain the same.
63     * <li>The special characters ".", "-", "*", and "_" remain the same.
64     * <li>The space character " " is converted into a plus sign "+".
65     * <li>All other characters are converted into one or more bytes using UTF-8
66     *     encoding and each byte is then represented by the 3-character string
67     *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
68     *     representation of the byte value.
69     * </ul>
70     *
71     * <p>This escaper is suitable for escaping parameter names and values even
72     * when <a href="http://goo.gl/utn6M">using the non-standard semicolon</a>,
73     * rather than the ampersand, as a parameter delimiter. Nevertheless, we
74     * recommend using the ampersand unless you must interoperate with systems
75     * that require semicolons.
76     *
77     * <p><b>Note:</b> Unlike other escapers, URL escapers produce uppercase
78     * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
79     * RFC 3986</a>:<br>
80     * <i>"URI producers and normalizers should use uppercase hexadecimal digits
81     * for all percent-encodings."</i>
82     *
83     */
84    public static Escaper urlFormParameterEscaper() {
85      return URL_FORM_PARAMETER_ESCAPER;
86    }
87  
88    private static final Escaper URL_FORM_PARAMETER_ESCAPER =
89        new PercentEscaper(URL_FORM_PARAMETER_OTHER_SAFE_CHARS, true);
90  
91    /**
92     * Returns an {@link Escaper} instance that escapes strings so they can be
93     * safely included in <a href="http://goo.gl/swjbR">URL path segments</a>. The
94     * returned escaper escapes all non-ASCII characters, even though <a
95     * href="http://goo.gl/xIJWe">many of these are accepted in modern URLs</a>.
96     * (<a href="http://goo.gl/WMGvZ">If the escaper were to leave these
97     * characters unescaped, they would be escaped by the consumer at parse time,
98     * anyway.</a>) Additionally, the escaper escapes the slash character ("/").
99     * While slashes are acceptable in URL paths, they are considered by the
100    * specification to be separators between "path segments." This implies that,
101    * if you wish for your path to contain slashes, you must escape each segment
102    * separately and then join them.
103    *
104    * <p>When escaping a String, the following rules apply:
105    * <ul>
106    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
107    *     through "9" remain the same.
108    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
109    * <li>The general delimiters "@" and ":" remain the same.
110    * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", "+", ",", ";",
111    *     and "=" remain the same.
112    * <li>The space character " " is converted into %20.
113    * <li>All other characters are converted into one or more bytes using UTF-8
114    *     encoding and each byte is then represented by the 3-character string
115    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
116    *     representation of the byte value.
117    * </ul>
118    *
119    * <p><b>Note:</b> Unlike other escapers, URL escapers produce uppercase
120    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
121    * RFC 3986</a>:<br>
122    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
123    * for all percent-encodings."</i>
124    */
125   public static Escaper urlPathSegmentEscaper() {
126     return URL_PATH_SEGMENT_ESCAPER;
127   }
128 
129   private static final Escaper URL_PATH_SEGMENT_ESCAPER =
130       new PercentEscaper(URL_PATH_OTHER_SAFE_CHARS_LACKING_PLUS + "+", false);
131 
132   /**
133    * Returns an {@link Escaper} instance that escapes strings so they can be
134    * safely included in a <a href="http://goo.gl/xXEq4p">URL fragment</a>. The
135    * returned escaper escapes all non-ASCII characters, even though <a
136    * href="http://goo.gl/xIJWe">many of these are accepted in modern URLs</a>.
137    * (<a href="http://goo.gl/WMGvZ">If the escaper were to leave these
138    * characters unescaped, they would be escaped by the consumer at parse time,
139    * anyway.</a>)
140    *
141    * <p>When escaping a String, the following rules apply:
142    * <ul>
143    * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
144    *     through "9" remain the same.
145    * <li>The unreserved characters ".", "-", "~", and "_" remain the same.
146    * <li>The general delimiters "@" and ":" remain the same.
147    * <li>The subdelimiters "!", "$", "&amp;", "'", "(", ")", "*", "+", ",", ";",
148    *     and "=" remain the same.
149    * <li>The space character " " is converted into %20.
150    * <li>Fragments allow unescaped "/" and "?", so they remain the same.
151    * <li>All other characters are converted into one or more bytes using UTF-8
152    *     encoding and each byte is then represented by the 3-character string
153    *     "%XY", where "XY" is the two-digit, uppercase, hexadecimal
154    *     representation of the byte value.
155    * </ul>
156    *
157    * <p><b>Note:</b> Unlike other escapers, URL escapers produce uppercase
158    * hexadecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt">
159    * RFC 3986</a>:<br>
160    * <i>"URI producers and normalizers should use uppercase hexadecimal digits
161    * for all percent-encodings."</i>
162    */
163   public static Escaper urlFragmentEscaper() {
164     return URL_FRAGMENT_ESCAPER;
165   }
166 
167   private static final Escaper URL_FRAGMENT_ESCAPER =
168       new PercentEscaper(URL_PATH_OTHER_SAFE_CHARS_LACKING_PLUS + "+/?", false);
169 }