source: uri/rfc3986/abnf_regexp.py@ 1478

Last change on this file since 1478 was 230, checked in by wouter, 4 years ago

#91 clone https://pypi.org/project/rfc3986/

File size: 8.9 KB
Line 
1# -*- coding: utf-8 -*-
2# Licensed under the Apache License, Version 2.0 (the "License");
3# you may not use this file except in compliance with the License.
4# You may obtain a copy of the License at
5#
6# http://www.apache.org/licenses/LICENSE-2.0
7#
8# Unless required by applicable law or agreed to in writing, software
9# distributed under the License is distributed on an "AS IS" BASIS,
10# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
11# implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Module for the regular expressions crafted from ABNF."""
15
16import sys
17
# Reserved characters, RFC 3986 section 2.2:
# https://tools.ietf.org/html/rfc3986#section-2.2
#   gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
GEN_DELIMS = GENERIC_DELIMITERS = ":/?#[]@"
GENERIC_DELIMITERS_SET = set(GENERIC_DELIMITERS)
#   sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
#              / "*" / "+" / "," / ";" / "="
SUB_DELIMS = SUB_DELIMITERS = "!$&'()*+,;="
SUB_DELIMITERS_SET = set(SUB_DELIMITERS)
# Escape the '*' for use in regular expressions
SUB_DELIMITERS_RE = r"!$&'()\*+,;="
RESERVED_CHARS_SET = GENERIC_DELIMITERS_SET.union(SUB_DELIMITERS_SET)
ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
DIGIT = "0123456789"
# Unreserved characters, RFC 3986 section 2.3:
# https://tools.ietf.org/html/rfc3986#section-2.3
#   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
# "!" is a sub-delimiter, not an unreserved character, so it is deliberately
# absent here; this keeps the string/set in agreement with UNRESERVED_RE.
UNRESERVED = UNRESERVED_CHARS = ALPHA + DIGIT + "._-~"
UNRESERVED_CHARS_SET = set(UNRESERVED_CHARS)
# Every character that may appear in a URI without being percent-encoded.
NON_PCT_ENCODED_SET = RESERVED_CHARS_SET.union(UNRESERVED_CHARS_SET)
# Character-class body for the unreserved set; the '-' is escaped so it is
# not interpreted as a range operator inside "[...]".
UNRESERVED_RE = r"A-Za-z0-9._~\-"
35
# A percent-encoded octet: "%" followed by exactly two hex digits
PCT_ENCODED = "%[A-Fa-f0-9]{2}"
PERCENT_ENCODED = PCT_ENCODED
# pchar: one unreserved / sub-delim / ":" / "@" character, or one
# percent-encoded octet
PCHAR = "([{0}{1}:@]|{2})".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
39
# NOTE(sigmavirus24): The scheme expression is deliberately stricter than
# the one in Appendix B, so that text which is not a scheme is not
# over-eagerly consumed as one.
SCHEME_RE = "[a-zA-Z][a-zA-Z0-9+.-]*"
# The authority stops at "/", "?", "#" and also at a backslash.
_AUTHORITY_RE = r"[^\\/?#]*"
_PATH_RE = "[^?#]*"
_QUERY_RE = "[^#]*"
_FRAGMENT_RE = ".*"

# Per-component expressions, adapted from
# http://tools.ietf.org/html/rfc3986#appendix-B
COMPONENT_PATTERN_DICT = dict(
    scheme=SCHEME_RE,
    authority=_AUTHORITY_RE,
    path=_PATH_RE,
    query=_QUERY_RE,
    fragment=_FRAGMENT_RE,
)

# See http://tools.ietf.org/html/rfc3986#appendix-B
# Each component of interest is a named group, so SRE_Match#groupdict can be
# used to pull the values out. All other groups are non-capturing, so
# SRE_Match#groups yields exactly the five components, in order.
URL_PARSING_RE = "".join(
    (
        r"(?:(?P<scheme>{scheme}):)?",
        r"(?://(?P<authority>{authority}))?",
        r"(?P<path>{path})",
        r"(?:\?(?P<query>{query}))?",
        r"(?:#(?P<fragment>{fragment}))?",
    )
).format(**COMPONENT_PATTERN_DICT)
68
69
# #########################
# Authority Matcher Section
# #########################

# Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
# A registered name (e.g., www.google.com, api.github.com): any run,
# possibly empty, of percent-encoded octets, sub-delims and unreserved
# characters.
REG_NAME = (
    "((?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + UNRESERVED_RE + "])*)"
)
REGULAR_NAME_RE = REG_NAME
# Dotted-quad IPv4 address, e.g., 192.168.255.255 or 127.0.0.1. Only the
# shape is checked here (four groups of one to three digits).
IPv4_RE = r"([0-9]{1,3}\.){3}[0-9]{1,3}"
# One 16-bit piece of an IPv6 address: one to four hexadecimal digits
HEXDIG_RE = "[0-9A-Fa-f]{1,4}"
# Least-significant 32 bits of an IPv6 address: either two hex pieces
# separated by ":" or an embedded IPv4 address
LS32_RE = "(%s:%s|%s)" % (HEXDIG_RE, HEXDIG_RE, IPv4_RE)
# Values substituted into the IPv6 templates below; the grammar is at
# http://tools.ietf.org/html/rfc3986#page-20
_subs = {"hex": HEXDIG_RE, "ls32": LS32_RE}

# In the ABNF quoted below, h16 is the hex piece above. See
# https://tools.ietf.org/html/rfc5234 for details about ABNF (Augmented
# Backus-Naur Form).
variations = [
    template % _subs
    for template in (
        # 6( h16 ":" ) ls32
        "(%(hex)s:){6}%(ls32)s",
        # "::" 5( h16 ":" ) ls32
        "::(%(hex)s:){5}%(ls32)s",
        # [ h16 ] "::" 4( h16 ":" ) ls32
        "(%(hex)s)?::(%(hex)s:){4}%(ls32)s",
        # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
        "((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s",
        # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
        "((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s",
        # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
        "((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s",
        # [ *4( h16 ":" ) h16 ] "::" ls32
        "((%(hex)s:){0,4}%(hex)s)?::%(ls32)s",
        # [ *5( h16 ":" ) h16 ] "::" h16
        "((%(hex)s:){0,5}%(hex)s)?::%(hex)s",
        # [ *6( h16 ":" ) h16 ] "::"
        "((%(hex)s:){0,6}%(hex)s)?::",
    )
]

# Any one of the nine forms above, each wrapped in its own group
IPv6_RE = "(" + "|".join("(" + form + ")" for form in variations) + ")"
115
# IPvFuture: "v", one or more hex digits, ".", then one or more unreserved,
# sub-delim or ":" characters
IPv_FUTURE_RE = (
    r"v[0-9A-Fa-f]+\.[" + UNRESERVED_RE + SUB_DELIMITERS_RE + ":]+"
)

# RFC 6874 Zone ID ABNF
ZONE_ID = "(?:[{0}]|{1})+".format(UNRESERVED_RE, PCT_ENCODED)

# IPv6 address with an optional zone ID. The RFC 4007 flavour accepts the
# zone separator either as "%25" or as a bare "%"; the other flavour accepts
# only "%25".
IPv6_ADDRZ_RFC4007_RE = "{0}(?:(?:%25|%){1})?".format(IPv6_RE, ZONE_ID)
IPv6_ADDRZ_RE = "{0}(?:%25{1})?".format(IPv6_RE, ZONE_ID)

# Bracketed IP literal: an IPv6 address (optionally zoned) or an IPvFuture
IP_LITERAL_RE = (
    r"\[(" + IPv6_ADDRZ_RFC4007_RE + "|" + IPv_FUTURE_RE + r")\]"
)
130
# The host piece of the authority: registered name, IPv4 address or
# bracketed IP literal, tried in that order
HOST_PATTERN = "(" + "|".join((REG_NAME, IPv4_RE, IP_LITERAL_RE)) + ")"
HOST_RE = HOST_PATTERN
# One or more unreserved / sub-delim / ":" characters or percent-encoded
# octets; anchored at the start only
USERINFO_RE = "^([{0}{1}:]|{2})+".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# One to five decimal digits
PORT_RE = "[0-9]{1,5}"
141
# ####################
# Path Matcher Section
# ####################

# The path grammar implemented below is described in
# http://tools.ietf.org/html/rfc3986#section-3.3
segments = {
    # Segment that may be empty
    "segment": "{0}*".format(PCHAR),
    # Segment with at least one character
    "segment-nz": "{0}+".format(PCHAR),
    # Non-empty segment in which ":" is not allowed
    "segment-nz-nc": "{0}+".format(PCHAR.replace(":", "")),
}

# The five path flavours from Section 3.3 (linked above)
PATH_EMPTY = "^$"
PATH_ROOTLESS = "{first}(/{rest})*".format(
    first=segments["segment-nz"], rest=segments["segment"]
)
PATH_NOSCHEME = "{first}(/{rest})*".format(
    first=segments["segment-nz-nc"], rest=segments["segment"]
)
PATH_ABSOLUTE = "/({0})?".format(PATH_ROOTLESS)
PATH_ABEMPTY = "(/{0})*".format(segments["segment"])
# A complete path: any one of the flavours, anchored at both ends
PATH_RE = (
    "^("
    + "|".join(
        (
            PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")$"
)
169
# Query and fragment share one expression: any number of "/", "?", ":", "@",
# unreserved or sub-delim characters, or percent-encoded octets, anchored at
# both ends
QUERY_RE = "^([/?:@{0}{1}]|{2})*$".format(
    UNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
FRAGMENT_RE = QUERY_RE
173
# ##########################
# Relative reference matcher
# ##########################

# See http://tools.ietf.org/html/rfc3986#section-4.2 for details
# Alternatives: "//" authority path-abempty, path-absolute, path-noscheme,
# or the empty path
RELATIVE_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_NOSCHEME,
            PATH_EMPTY,
        )
    )
    + ")"
)
186
# See http://tools.ietf.org/html/rfc3986#section-3 for definition
# Alternatives: "//" authority path-abempty, path-absolute, path-rootless,
# or the empty path
HIER_PART_RE = (
    "("
    + "|".join(
        (
            "//" + COMPONENT_PATTERN_DICT["authority"] + PATH_ABEMPTY,
            PATH_ABSOLUTE,
            PATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + ")"
)
195
# ###############
# IRIs / RFC 3987
# ###############

# Character-class bodies for the iprivate and ucschar ranges. The ranges
# inside the Basic Multilingual Plane are always present.
IPRIVATE = u"\uE000-\uF8FF"
UCSCHAR_RE = u"\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF"

# Only wide-unicode gets the high-ranges of UCSCHAR
if sys.maxunicode > 0xFFFF:  # pragma: no cover
    IPRIVATE = IPRIVATE + u"\U000F0000-\U000FFFFD\U00100000-\U0010FFFD"
    UCSCHAR_RE = UCSCHAR_RE + u"".join(
        (
            u"\U00010000-\U0001FFFD",
            u"\U00020000-\U0002FFFD",
            u"\U00030000-\U0003FFFD",
            u"\U00040000-\U0004FFFD",
            u"\U00050000-\U0005FFFD",
            u"\U00060000-\U0006FFFD",
            u"\U00070000-\U0007FFFD",
            u"\U00080000-\U0008FFFD",
            u"\U00090000-\U0009FFFD",
            u"\U000A0000-\U000AFFFD",
            u"\U000B0000-\U000BFFFD",
            u"\U000C0000-\U000CFFFD",
            u"\U000D0000-\U000DFFFD",
            u"\U000E1000-\U000EFFFD",
        )
    )
216
# iunreserved: the ASCII unreserved characters plus the ucschar ranges
IUNRESERVED_RE = u"".join((u"A-Za-z0-9\\._~\\-", UCSCHAR_RE))
# ipchar: one iunreserved / sub-delim / ":" / "@" character, or one
# percent-encoded octet
IPCHAR = u"([{0}{1}:@]|{2})".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)

isegments = {
    # Segment that may be empty
    "isegment": u"{0}*".format(IPCHAR),
    # Segment with at least one character
    "isegment-nz": u"{0}+".format(IPCHAR),
    # Non-empty segment in which ":" is not allowed
    "isegment-nz-nc": u"{0}+".format(IPCHAR.replace(":", "")),
}
227
# IRI counterparts of the path flavours, built from the isegment patterns
IPATH_ROOTLESS = u"{first}(/{rest})*".format(
    first=isegments["isegment-nz"], rest=isegments["isegment"]
)
IPATH_NOSCHEME = u"{first}(/{rest})*".format(
    first=isegments["isegment-nz-nc"], rest=isegments["isegment"]
)
IPATH_ABSOLUTE = u"/(?:{0})?".format(IPATH_ROOTLESS)
IPATH_ABEMPTY = u"(?:/{0})*".format(isegments["isegment"])
# A complete IRI path: any one of the flavours, anchored at both ends
IPATH_RE = (
    u"^(?:"
    + u"|".join(
        (
            IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + u")$"
)
239
# ireg-name: any run, possibly empty, of percent-encoded octets, sub-delims
# and iunreserved characters
IREG_NAME = (
    u"(?:%[0-9A-Fa-f]{2}|[" + SUB_DELIMITERS_RE + IUNRESERVED_RE + u"])*"
)
IREGULAR_NAME_RE = IREG_NAME

# ihost: ireg-name, IPv4 address or bracketed IP literal, tried in that
# order
IHOST_PATTERN = u"(" + u"|".join((IREG_NAME, IPv4_RE, IP_LITERAL_RE)) + u")"
IHOST_RE = IHOST_PATTERN

# One or more iunreserved / sub-delim / ":" characters or percent-encoded
# octets; anchored at the start only
IUSERINFO_RE = u"^(?:[{0}{1}:]|{2})+".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
253
# ifragment: any number of "/", "?", ":", "@", iunreserved or sub-delim
# characters, or percent-encoded octets, anchored at both ends
IFRAGMENT_RE = u"^(?:[/?:@{0}{1}]|{2})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, PCT_ENCODED
)
# iquery: same as ifragment, with the iprivate ranges also permitted
IQUERY_RE = u"^(?:[/?:@{0}{1}{2}]|{3})*$".format(
    IUNRESERVED_RE, SUB_DELIMITERS_RE, IPRIVATE, PCT_ENCODED
)
267
# IRI relative part. Alternatives: "//" authority ipath-abempty,
# ipath-absolute, ipath-noscheme, or the empty path
IRELATIVE_PART_RE = (
    u"("
    + u"|".join(
        (
            u"//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_NOSCHEME,
            PATH_EMPTY,
        )
    )
    + u")"
)
275
# IRI hierarchical part. Alternatives: "//" authority ipath-abempty,
# ipath-absolute, ipath-rootless, or the empty path
IHIER_PART_RE = (
    u"("
    + u"|".join(
        (
            u"//" + COMPONENT_PATTERN_DICT["authority"] + IPATH_ABEMPTY,
            IPATH_ABSOLUTE,
            IPATH_ROOTLESS,
            PATH_EMPTY,
        )
    )
    + u")"
)
Note: See TracBrowser for help on using the repository browser.