source: uri/rfc3986/normalizers.py@ 742

Last change on this file since 742 was 230, checked in by wouter, 4 years ago

#91 clone https://pypi.org/project/rfc3986/

File size: 5.2 KB
Line 
1# -*- coding: utf-8 -*-
2# Copyright (c) 2014 Rackspace
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12# implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15"""Module with functions to normalize components."""
16import re
17
18from . import compat
19from . import misc
20
21
def normalize_scheme(scheme):
    """Return the scheme component converted to lower case.

    :param scheme: The scheme string to normalize (e.g. ``"HTTP"``).
    :returns: ``scheme`` with every cased character lower-cased.
    """
    lowered = scheme.lower()
    return lowered
25
26
def normalize_authority(authority):
    """Build a normalized authority string from its three parts.

    :param authority: A ``(userinfo, host, port)`` triple. Any falsy
        element is left out of the result.
    :returns: The concatenation ``[userinfo@][host][:port]`` where the
        userinfo has been passed through ``normalize_percent_characters``
        and the host through ``normalize_host``.
    """
    userinfo, host, port = authority
    pieces = []

    if userinfo:
        # The '@' delimiter is only emitted together with userinfo.
        pieces.append(normalize_percent_characters(userinfo))
        pieces.append("@")

    if host:
        pieces.append(normalize_host(host))

    if port:
        # The port is appended verbatim, preceded by its ':' delimiter.
        pieces.append(":")
        pieces.append(port)

    return "".join(pieces)
38
39
def normalize_username(username):
    """Quote a username so it can be embedded in the userinfo component.

    :param username: The raw username.
    :returns: Whatever ``compat.urlquote`` produces for ``username``.
    """
    quoted_username = compat.urlquote(username)
    return quoted_username
43
44
def normalize_password(password):
    """Quote a password so it can be embedded in the userinfo component.

    :param password: The raw password.
    :returns: Whatever ``compat.urlquote`` produces for ``password``.
    """
    quoted_password = compat.urlquote(password)
    return quoted_password
48
49
def normalize_host(host):
    """Normalize a host string.

    Hosts that do not match ``misc.IPv6_MATCHER``, or that match but
    contain no ``%``, are simply lower-cased. For a matching host that
    contains a ``%``, everything before the first ``%`` is lower-cased and
    everything from it onward (the Zone ID) keeps its casing.

    :param host: The host component to normalize.
    :returns: The normalized host string.
    """
    if not misc.IPv6_MATCHER.match(host):
        return host.lower()

    zone_start = host.find("%")
    if zone_start == -1:
        # IPv6 literal without a Zone ID: plain case folding is enough.
        return host.lower()

    encoded_start = host.find("%25")

    # Rewrite the RFC 4007 Zone ID delimiter '%' as the RFC 6874 form
    # '%25'. A host of the shape '[<IPv6 addr>%25]' is taken to be RFC 4007
    # with the Zone ID "25", so it becomes '[<IPv6 addr>%2525]'.
    delimiter_is_bare = (
        encoded_start == -1
        or zone_start < encoded_start
        or (zone_start == encoded_start and encoded_start == len(host) - 4)
    )
    if delimiter_is_bare:
        host = host.replace("%", "%25", 1)

    # The address part is case-insensitive; the Zone ID is left untouched.
    address_part = host[:zone_start].lower()
    zone_part = host[zone_start:]
    return address_part + zone_part
71
72
def normalize_path(path):
    """Normalize the path component.

    :param path: The path string; a falsy value (``None`` or ``""``) is
        returned unchanged.
    :returns: The path with percent-escapes upper-cased and dot segments
        removed, in that order.
    """
    if path:
        return remove_dot_segments(normalize_percent_characters(path))
    return path
80
81
def normalize_query(query):
    """Normalize the query component.

    :param query: The query string; a falsy value (``None`` or ``""``) is
        returned unchanged.
    :returns: The query with its percent-escapes upper-cased.
    """
    return normalize_percent_characters(query) if query else query
87
88
def normalize_fragment(fragment):
    """Normalize the fragment component.

    :param fragment: The fragment string; a falsy value (``None`` or
        ``""``) is returned unchanged.
    :returns: The fragment with its percent-escapes upper-cased.
    """
    return normalize_percent_characters(fragment) if fragment else fragment
94
95
# Matches one percent-escape: '%' followed by exactly two hex digits.
PERCENT_MATCHER = re.compile("%[A-Fa-f0-9]{2}")


def normalize_percent_characters(s):
    """Upper-case the hex digits of every percent-escape in ``s``.

    For example, ``"%3afoo%DF%ab"`` becomes ``"%3Afoo%DF%AB"``. Text that
    is not part of a ``%XX`` escape is left exactly as it was.

    :param s: The string to normalize.
    :returns: ``s`` with each percent-escape upper-cased.
    """

    def _upper_escape(match):
        # '%' has no case, so upper-casing the whole escape is safe.
        return match.group(0).upper()

    return PERCENT_MATCHER.sub(_upper_escape, s)
109
110
def remove_dot_segments(s):
    """Remove ``.`` and ``..`` segments from a path.

    See also Section 5.2.4 of :rfc:`3986`.

    :param s: The path string to process.
    :returns: The path with ``.`` segments dropped and each ``..`` segment
        cancelling the segment collected before it (if any).
    """
    # See http://tools.ietf.org/html/rfc3986#section-5.2.4 for pseudo-code
    kept = []

    for part in s.split("/"):
        if part == "..":
            # Step back one level when there is something to step back from.
            if kept:
                kept.pop()
        elif part != ".":
            # '.' is the current directory and is dropped; every other
            # segment (including empty ones) is kept.
            kept.append(part)

    # A path that begins with '/' must still begin with '/': put the leading
    # empty segment back if processing consumed it.
    if s.startswith("/") and (not kept or kept[0] != ""):
        kept.insert(0, "")

    # A path that ends with '/.' or '/..' gets one more empty segment so
    # that the result ends with '/'.
    if s.endswith(("/.", "/..")):
        kept.append("")

    return "/".join(kept)
143
144
def encode_component(uri_component, encoding):
    """Percent-encode a URI component using the given encoding.

    :param uri_component: The component to encode, or ``None``.
    :param encoding: The codec name handed to ``compat.to_str``,
        ``compat.to_bytes`` and the final ``decode``.
    :returns: ``None`` when ``uri_component`` is ``None``; otherwise the
        component with every byte percent-encoded as upper-case ``%XX``
        unless it is in ``misc.NON_PCT_ENCODED`` or is a ``%`` of a
        component judged to be percent-encoded already.
    """
    if uri_component is None:
        return uri_component

    # The component counts as already percent-encoded only when every '%'
    # byte starts a valid escape. In that case '%' is passed through
    # untouched while all other bytes are still encoded as needed.
    as_text = compat.to_str(uri_component, encoding)
    escape_count = len(PERCENT_MATCHER.findall(as_text))

    raw = compat.to_bytes(uri_component, encoding)
    already_encoded = escape_count == raw.count(b"%")

    percent_ord = ord("%")
    result = bytearray()

    # Iterating a bytearray yields integer byte values.
    for code in bytearray(raw):
        passthrough = (already_encoded and code == percent_ord) or (
            code < 128 and chr(code) in misc.NON_PCT_ENCODED
        )
        if passthrough:
            result.append(code)
        else:
            result.extend("%{0:02X}".format(code).encode())

    return result.decode(encoding)
Note: See TracBrowser for help on using the repository browser.