# -*- coding: utf-8 -*-
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module for the regular expressions crafted from ABNF."""
|
---|
| 15 |
|
---|
| 16 | import sys
|
---|
| 17 |
|
---|
# Generic delimiters ("gen-delims"), see:
# https://tools.ietf.org/html/rfc3986#page-13
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
# Sub-delimiters ("sub-delims"), see:
# https://tools.ietf.org/html/rfc3986#page-13
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# Escape the '*' for use in regular expressions. This string is meant to be
# spliced into the inside of a character class ("[...]").
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
# reserved = gen-delims / sub-delims
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
DIGIT = "0123456789"
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~", see:
# https://tools.ietf.org/html/rfc3986#section-2.3
# NOTE: "!" is deliberately absent. It is a sub-delimiter (see SUB_DELIMS
# above), not an unreserved character; listing it here made the unreserved
# set overlap the reserved set and disagree with UNRESERVED_RE below.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + "-._~"
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Every character that may appear literally (reserved or unreserved).
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# Character-class body matching the unreserved characters.
# We need to escape the '-' in this case:
UNRESERVED_RE = r"A-Za-z0-9._~\-"

# Percent encoded character values
PERCENT_ENCODED = PCT_ENCODED = "%[A-Fa-f0-9]{2}"
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
# NOTE: "%" binds tighter than "+", so only the last literal is formatted.
PCHAR = "([" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":@]|%s)" % PCT_ENCODED
|
---|
| 39 |
|
---|
# NOTE(sigmavirus24): The scheme pattern is deliberately stricter than the
# one given in Appendix B, so that text which is not really a scheme is not
# consumed over-eagerly.
SCHEME_RE = "[a-zA-Z][a-zA-Z0-9+.-]*"
# The remaining components are matched loosely: each pattern simply runs up
# to the delimiter that starts the next component. The authority pattern
# additionally stops at a backslash.
_AUTHORITY_RE = r"[^\\/?#]*"
_PATH_RE = "[^?#]*"
_QUERY_RE = "[^#]*"
_FRAGMENT_RE = ".*"

# Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Every component of interest is a *named* group, so the values can be read
# with SRE_Match#groupdict. All other groups are non-capturing, which means
# SRE_Match#groups yields exactly (scheme, authority, path, query, fragment).
URL_PARSING_RE = "".join(
    (
        r"(?:(?P<scheme>{scheme}):)?",
        r"(?://(?P<authority>{authority}))?",
        r"(?P<path>{path})",
        r"(?:\?(?P<query>{query}))?",
        r"(?:#(?P<fragment>{fragment}))?",
    )
).format(**COMPONENT_PATTERN_DICT)
|
---|
| 68 |
|
---|
| 69 |
|
---|
# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# A regular name (e.g., www.google.com, api.github.com) is any run of
# percent-encoded octets, sub-delimiters and unreserved characters.
REGULAR_NAME_RE = REG_NAME = (
    "((?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + UNRESERVED_RE + "])*)"
)
# An IPv4 address, e.g., 192.168.255.255, 127.0.0.1. The pattern checks only
# the shape (four dot-separated groups of 1-3 digits), not the octet range.
IPv4_RE = r"([0-9]{1,3}\.){3}[0-9]{1,3}"
# One piece ("h16") of an IPv6 address: 1 to 4 hexadecimal digits.
HEXDIG_RE = "[0-9A-Fa-f]{1,4}"
# Least-significant 32 bits of an IPv6 address: two h16 pieces or an IPv4
# address.
LS32_RE = "(" + HEXDIG_RE + ":" + HEXDIG_RE + "|" + IPv4_RE + ")"
# Substitutions for the IPv6 templates below, which follow the ABNF at
# http://tools.ietf.org/html/rfc3986#page-20
_subs = dict(hex=HEXDIG_RE, ls32=LS32_RE)

# Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
# about ABNF (Augmented Backus-Naur Form) use in the comments.
# Each template is %-formatted with ``_subs``; the "{n}" / "{m,n}" parts are
# regex quantifiers and are left untouched by %-formatting.
variations = [
    template % _subs
    for template in (
        #                            6( h16 ":" ) ls32
        "(%(hex)s:){6}%(ls32)s",
        #                       "::" 5( h16 ":" ) ls32
        "::(%(hex)s:){5}%(ls32)s",
        # [               h16 ] "::" 4( h16 ":" ) ls32
        "(%(hex)s)?::(%(hex)s:){4}%(ls32)s",
        # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
        "((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s",
        # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
        "((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s",
        # [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
        "((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
        # [ *4( h16 ":" ) h16 ] "::"              ls32
        "((%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
        # [ *5( h16 ":" ) h16 ] "::"              h16
        "((%(hex)s:){0,5}%(hex)s)?::%(hex)s",
        # [ *6( h16 ":" ) h16 ] "::"
        "((%(hex)s:){0,6}%(hex)s)?::",
    )
]

# Alternation over all of the variations, each wrapped in its own group.
IPv6_RE = "(" + "|".join("(" + variation + ")" for variation in variations) + ")"

# IPvFuture: "v" hex-digits "." then unreserved / sub-delims / ":" characters.
IPv_FUTURE_RE = r"v[0-9A-Fa-f]+\.[" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":]+"

# RFC 6874 Zone ID ABNF
ZONE_ID = "(?:[{0}]|{1})+".format(UNRESERVED_RE, PCT_ENCODED)

# IPv6 address with an optional zone ID. The RFC 4007 flavour accepts either
# "%25" or a bare "%" before the zone ID; the other accepts "%25" only.
IPv6_ADDRZ_RFC4007_RE = "{0}(?:(?:%25|%){1})?".format(IPv6_RE, ZONE_ID)
IPv6_ADDRZ_RE = "{0}(?:%25{1})?".format(IPv6_RE, ZONE_ID)

# Bracketed IP literal: an IPv6 address (optionally zoned) or an IPvFuture.
IP_LITERAL_RE = (
    r"\[(" + IPv6_ADDRZ_RFC4007_RE + "|" + IPv_FUTURE_RE + r")\]"
)

# Pattern for matching the host piece of the authority
HOST_RE = HOST_PATTERN = (
    "(" + "|".join((REG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
)
# Userinfo: one or more unreserved / sub-delims / ":" / pct-encoded, anchored
# at the start only.
USERINFO_RE = "^([{0}{1}:]|{2})+".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
PORT_RE = "[0-9]{1,5}"
|
---|
| 141 |
|
---|
# ####################
# Path Matcher Section
# ####################

# See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
# about the path patterns defined below.
segments = {
    name: pchar + repeat
    for name, pchar, repeat in (
        # Possibly empty segment
        ("segment", PCHAR, "*"),
        # Non-zero length segment
        ("segment-nz", PCHAR, "+"),
        # Non-zero length segment without ":" (the colon is stripped from
        # the PCHAR character class)
        ("segment-nz-nc", PCHAR.replace(":", ""), "+"),
    )
}

# Path types taken from Section 3.3 (linked above)
PATH_EMPTY = "^$"
PATH_ROOTLESS = "{0}(/{1})*".format(
    segments["segment-nz"], segments["segment"]
)
PATH_NOSCHEME = "{0}(/{1})*".format(
    segments["segment-nz-nc"], segments["segment"]
)
PATH_ABSOLUTE = "/(" + PATH_ROOTLESS + ")?"
PATH_ABEMPTY = "(/" + segments["segment"] + ")*"
# A full path is any one of the five path types, anchored at both ends.
PATH_RE = (
    "^("
    + "|".join(
        (
            PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")$"
)

# Query and fragment share one pattern: any run of "/", "?", ":", "@",
# unreserved, sub-delims or pct-encoded characters, anchored at both ends.
FRAGMENT_RE = QUERY_RE = "^([/?:@{0}{1}]|{2})*$".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
|
---|
| 173 |
|
---|
# ##########################
# Relative reference matcher
# ##########################

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# relative-part: "//" authority path-abempty, or an absolute, noscheme or
# empty path.
RELATIVE_PART_RE = "(//{authority}{abempty}|{absolute}|{noscheme}|{empty})".format(
    authority=COMPONENT_PATTERN_DICT["authority"],
    abempty=PATH_ABEMPTY,
    absolute=PATH_ABSOLUTE,
    noscheme=PATH_NOSCHEME,
    empty=PATH_EMPTY,
)

# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# hier-part: same shape as relative-part, but with a rootless path in place
# of the noscheme path.
HIER_PART_RE = "(//{authority}{abempty}|{absolute}|{rootless}|{empty})".format(
    authority=COMPONENT_PATTERN_DICT["authority"],
    abempty=PATH_ABEMPTY,
    absolute=PATH_ABSOLUTE,
    rootless=PATH_ROOTLESS,
    empty=PATH_EMPTY,
)
|
---|
| 195 |
|
---|
# ###############
# IRIs / RFC 3987
# ###############

# Character-class bodies for the RFC 3987 "iprivate" and "ucschar" ranges.
# Start from the ranges that fit in the Basic Multilingual Plane ...
IPRIVATE = u"\uE000-\uF8FF"
UCSCHAR_RE = u"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"

# ... and only wide-unicode builds get the high ranges appended.
if sys.maxunicode > 0xFFFF:  # pragma: no cover
    IPRIVATE += u"\U000F0000-\U000FFFFD\U00100000-\U0010FFFD"
    UCSCHAR_RE += (
        u"\U00010000-\U0001FFFD\U00020000-\U0002FFFD"
        u"\U00030000-\U0003FFFD\U00040000-\U0004FFFD"
        u"\U00050000-\U0005FFFD\U00060000-\U0006FFFD"
        u"\U00070000-\U0007FFFD\U00080000-\U0008FFFD"
        u"\U00090000-\U0009FFFD\U000A0000-\U000AFFFD"
        u"\U000B0000-\U000BFFFD\U000C0000-\U000CFFFD"
        u"\U000D0000-\U000DFFFD\U000E1000-\U000EFFFD"
    )
|
---|
| 216 |
|
---|
# iunreserved: the ASCII unreserved characters followed by the UCSCHAR
# ranges, written as the inside of a character class.
IUNRESERVED_RE = u"A-Za-z0-9\\._~\\-{0}".format(UCSCHAR_RE)
# ipchar = iunreserved / pct-encoded / sub-delims / ":" / "@"
IPCHAR = u"([{0}{1}:@]|{2})".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

isegments = {
    name: ipchar + repeat
    for name, ipchar, repeat in (
        # Possibly empty segment
        ("isegment", IPCHAR, u"*"),
        # Non-zero length segment
        ("isegment-nz", IPCHAR, u"+"),
        # Non-zero length segment without ":" (the colon is stripped from
        # the IPCHAR character class)
        ("isegment-nz-nc", IPCHAR.replace(":", ""), u"+"),
    )
}

# IRI counterparts of the RFC 3986 path types.
IPATH_ROOTLESS = u"{0}(/{1})*".format(
    isegments["isegment-nz"], isegments["isegment"]
)
IPATH_NOSCHEME = u"{0}(/{1})*".format(
    isegments["isegment-nz-nc"], isegments["isegment"]
)
IPATH_ABSOLUTE = u"/(?:" + IPATH_ROOTLESS + u")?"
IPATH_ABEMPTY = u"(?:/" + isegments["isegment"] + u")*"
# A full IRI path is any one of the path types, anchored at both ends.
IPATH_RE = (
    u"^(?:"
    + u"|".join(
        (
            IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + u")$"
)

# ireg-name: any run of pct-encoded octets, sub-delims and iunreserved.
IREGULAR_NAME_RE = IREG_NAME = (
    u"(?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + IUNRESERVED_RE + u"])*"
)

# ihost: an ireg-name, an IPv4 address or a bracketed IP literal.
IHOST_RE = IHOST_PATTERN = (
    u"(" + u"|".join((IREG_NAME, IPv4_RE, IP_LITERAL_RE)) + u")"
)

# iuserinfo: one or more iunreserved / sub-delims / ":" / pct-encoded,
# anchored at the start only.
IUSERINFO_RE = u"^(?:[{0}{1}:]|{2})+".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

# ifragment and iquery are both anchored at both ends and accept "/", "?",
# ":", "@", iunreserved, sub-delims and pct-encoded; iquery additionally
# accepts the IPRIVATE ranges.
IFRAGMENT_RE = u"^(?:[/?:@{0}{1}]|{2})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
IQUERY_RE = u"^(?:[/?:@{0}{1}{2}]|{3})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, IPRIVATE, PCT_ENCODED
)

# irelative-part: "//" authority ipath-abempty, or an absolute, noscheme or
# empty path.
IRELATIVE_PART_RE = u"(//{authority}{abempty}|{absolute}|{noscheme}|{empty})".format(
    authority=COMPONENT_PATTERN_DICT["authority"],
    abempty=IPATH_ABEMPTY,
    absolute=IPATH_ABSOLUTE,
    noscheme=IPATH_NOSCHEME,
    empty=PATH_EMPTY,
)

# ihier-part: same shape as irelative-part, but with a rootless path in
# place of the noscheme path.
IHIER_PART_RE = u"(//{authority}{abempty}|{absolute}|{rootless}|{empty})".format(
    authority=COMPONENT_PATTERN_DICT["authority"],
    abempty=IPATH_ABEMPTY,
    absolute=IPATH_ABSOLUTE,
    rootless=IPATH_ROOTLESS,
    empty=PATH_EMPTY,
)
|
---|