1 | """ |
---|
2 | Functions used to convert inputs from whatever encoding used in the system to |
---|
3 | unicode and back. |
---|
4 | |
---|
5 | Ported to Python 3. |
---|
6 | |
---|
7 | Once Python 2 support is dropped, most of this module will be obsolete, since |
---|
8 | Unicode is the default everywhere in Python 3. |
---|
9 | """ |
---|
10 | |
---|
11 | from six import ensure_str |
---|
12 | |
---|
13 | import sys, os, re |
---|
14 | import unicodedata |
---|
15 | import warnings |
---|
16 | |
---|
17 | from allmydata.util.assertutil import precondition, _assert |
---|
18 | from twisted.python import usage |
---|
19 | from twisted.python.filepath import FilePath |
---|
20 | from allmydata.util import log |
---|
21 | from allmydata.util.fileutil import abspath_expanduser_unicode |
---|
22 | |
---|
# The type of ``None``, obtained via type(None).
NoneType = type(None)
---|
24 | |
---|
25 | |
---|
def canonical_encoding(encoding):
    """
    Return the lower-cased name of ``encoding`` with known aliases folded.

    ``None`` is logged at the WEIRD level and treated as UTF-8. ``cp65001``
    maps to ``utf-8``; ``us-ascii``, ``646`` and ``ansi_x3.4-1968`` map to
    ``ascii``. Any other name is returned lower-cased and otherwise untouched.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        return 'utf-8'

    lowered = encoding.lower()
    if lowered == "cp65001":
        return 'utf-8'
    if lowered in ("us-ascii", "646", "ansi_x3.4-1968"):
        return 'ascii'
    return lowered
---|
37 | |
---|
def check_encoding(encoding):
    """
    Fail early if ``encoding`` cannot actually be used for conversion.

    Python sometimes reports an encoding name that it does not support for
    conversion; a trial encode of a short string detects that case.

    Raises AssertionError when the trial encode raises LookupError or
    AttributeError.
    """
    try:
        u"test".encode(encoding)
    except (LookupError, AttributeError):
        message = "The character encoding '%s' is not supported for conversion." % (encoding,)
        raise AssertionError(message)
---|
47 | |
---|
# On Windows we install UTF-8 stream wrappers for sys.stdout and
# sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py).
#
# On POSIX, we are moving towards UTF-8 everywhere and ignore the locale.
io_encoding = "utf-8"

# Canonical name of the filesystem encoding. Starts as None and is assigned
# by _reload(), which this module calls at import time.
filesystem_encoding = None
---|
55 | |
---|
def _reload():
    """
    Recompute the module-level ``filesystem_encoding``.

    The name reported by ``sys.getfilesystemencoding()`` is canonicalized,
    stored in the global, and then verified to be usable for conversion
    (``check_encoding`` raises AssertionError if it is not).
    """
    global filesystem_encoding
    detected = canonical_encoding(sys.getfilesystemencoding())
    filesystem_encoding = detected
    check_encoding(detected)


# Populate filesystem_encoding as soon as the module is imported.
_reload()
---|
62 | |
---|
63 | |
---|
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames.

    This is the current value of the module-level ``filesystem_encoding``.
    """
    return filesystem_encoding
---|
69 | |
---|
def get_io_encoding():
    """
    Return the encoding expected for text written to stdout or stderr, and
    for the arguments in sys.argv.

    This is the current value of the module-level ``io_encoding``.
    """
    return io_encoding
---|
75 | |
---|
def argv_to_unicode(s):
    """
    Return the argv element ``s`` as text.

    Text is returned as-is; bytes are decoded using ``io_encoding``. Any
    other type violates the precondition.

    This is the inverse of ``unicode_to_argv``.

    Raises ``usage.UsageError`` if the bytes cannot be decoded.
    """
    if not isinstance(s, str):
        precondition(isinstance(s, bytes), s)
        try:
            return str(s, io_encoding)
        except UnicodeDecodeError:
            raise usage.UsageError("Argument %s cannot be decoded as %s." %
                                   (quote_output(s), io_encoding))
    return s
---|
92 | |
---|
def argv_to_abspath(s, **kwargs):
    """
    Convenience function to decode an argv element to an absolute path, with
    ``~`` expanded.

    ``s`` may be text or bytes; it is decoded with ``argv_to_unicode``. Extra
    keyword arguments are forwarded to ``abspath_expanduser_unicode``.

    Raises ``usage.UsageError`` if the argument cannot be decoded, or if the
    decoded argument starts with ``-`` (which suggests a mistyped option
    rather than a path).
    """
    decoded = argv_to_unicode(s)
    if decoded.startswith(u'-'):
        # Build the "./<name>" suggestion from the decoded text. Joining the
        # str '.' with a bytes argument would raise TypeError and mask the
        # UsageError we actually want to report.
        suggestion = os.path.join(u'.', decoded)
        raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
                               % (quote_output(s), quote_output(suggestion)))
    return abspath_expanduser_unicode(decoded, **kwargs)
---|
103 | |
---|
104 | |
---|
def unicode_to_argv(s):
    """
    Make the given unicode string suitable for use in an argv list.

    ``s`` must be text. A DeprecationWarning is always emitted, because this
    conversion is no longer needed. On Windows the input is returned
    unmodified; elsewhere it is passed through ``six.ensure_str``.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s if sys.platform == "win32" else ensure_str(s)
---|
117 | |
---|
118 | |
---|
# According to unicode_to_argv above, the expected type for
# cli args depends on the platform, so capture that expectation.
# It is a tuple of types; currently it contains only ``str``.
argv_type = (str,)
"""
The expected type for args to a subprocess
"""
---|
125 | |
---|
126 | |
---|
def unicode_to_url(s):
    """
    Encode a unicode object used in a URL to bytes.

    The conversion is delegated to ``to_bytes``.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: an earlier version required text input (via a precondition) and
    # encoded it as UTF-8 directly; for now the work is done by to_bytes.
    encoded = to_bytes(s)
    return encoded
---|
137 | |
---|
def to_bytes(s):
    """Convert unicode to UTF-8 encoded bytes.

    ``None`` and bytes are passed through unchanged.
    """
    passthrough = s is None or isinstance(s, bytes)
    return s if passthrough else s.encode('utf-8')
---|
146 | |
---|
def from_utf8_or_none(s):
    """
    Decode UTF-8 bytes to text, passing ``None`` through unchanged.

    ``s`` must be bytes or ``None``; anything else violates the precondition.
    """
    precondition(isinstance(s, bytes) or s is None, s)
    return None if s is None else s.decode('utf-8')
---|
152 | |
---|
# Bytes patterns matching a whole string made up only of LF, CR and the
# printable ASCII range 0x20-0x7E (the 8-bit variant also allows 0x80-0xFF).
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL)
PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)


def is_printable_ascii(s):
    """
    Return True if the bytestring ``s`` contains only LF, CR and printable
    ASCII bytes (the empty bytestring counts as printable).
    """
    # The pattern is anchored at both ends, so matching from the start is
    # equivalent to searching.
    return PRINTABLE_ASCII.match(s) is not None
---|
158 | |
---|
def unicode_to_output(s):
    """
    Prepare a unicode object for representation on stdout or stderr.

    ``s`` must be text and is returned unchanged: on Python 3, stdout and
    stderr expect Unicode and take care of encoding themselves. A
    DeprecationWarning is always emitted because the call is unnecessary.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s
---|
169 | |
---|
170 | def _unicode_escape(m, quote_newlines): |
---|
171 | u = m.group(0) |
---|
172 | if u == u'"' or u == u'$' or u == u'`' or u == u'\\': |
---|
173 | return u'\\' + u |
---|
174 | elif u == u'\n' and not quote_newlines: |
---|
175 | return u |
---|
176 | if len(u) == 2: |
---|
177 | codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 |
---|
178 | else: |
---|
179 | codepoint = ord(u) |
---|
180 | if codepoint > 0xFFFF: |
---|
181 | return u'\\U%08x' % (codepoint,) |
---|
182 | elif codepoint > 0xFF: |
---|
183 | return u'\\u%04x' % (codepoint,) |
---|
184 | else: |
---|
185 | return u'\\x%02x' % (codepoint,) |
---|
186 | |
---|
187 | def _bytes_escape(m, quote_newlines): |
---|
188 | """ |
---|
189 | Takes a re match on bytes, the result is escaped bytes of group(0). |
---|
190 | """ |
---|
191 | c = m.group(0) |
---|
192 | if c == b'"' or c == b'$' or c == b'`' or c == b'\\': |
---|
193 | return b'\\' + c |
---|
194 | elif c == b'\n' and not quote_newlines: |
---|
195 | return c |
---|
196 | else: |
---|
197 | return b'\\x%02x' % (ord(c),) |
---|
198 | |
---|
# Patterns matching any character that rules out single-quoted output. The
# allowed set is printable ASCII other than the single quote (0x27), plus the
# listed non-ASCII BMP ranges. The _NL variant also treats a newline as
# disallowed; the plain variant permits it.
MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
# The first alternative matches a surrogate pair as one unit, so that a pair
# is handed to the replacement callback as a single two-character match.
ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs
                               u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Bytes counterpart of ESCAPABLE_UNICODE: every byte outside the safe ASCII
# set (which excludes ", $, `, and backslash) is matched for escaping.
ESCAPABLE_8BIT = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
---|
208 | |
---|
def quote_output_u(*args, **kwargs):
    """
    Like ``quote_output`` but always return ``unicode``.

    All arguments are forwarded to ``quote_output``. A text result is
    returned as-is; a bytes result is decoded using the ``encoding`` keyword
    argument if given, else the encoding of ``sys.stdout``, else
    ``io_encoding``.
    """
    result = quote_output(*args, **kwargs)
    if isinstance(result, str):
        return result
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr needs an explicit default: sys.stdout may be None or a
    # replacement object without an ``encoding`` attribute, and without the
    # default that raises AttributeError instead of falling back.
    fallback_encoding = (kwargs.get("encoding", None) or
                         getattr(sys.stdout, "encoding", None) or
                         io_encoding)
    return result.decode(fallback_encoding)
---|
221 | |
---|
222 | |
---|
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
    control bytes in the output. (Newlines are counted as control bytes iff
    quote_newlines is True.)

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double quotes,
    Python-compatible backslash escaping is used.

    If not explicitly given, quote_newlines is True when quotemarks is True.

    If 'encoding' is not given, the encoding of sys.stdout is used, falling back
    to io_encoding when sys.stdout is None, has no 'encoding' attribute, or
    reports an empty one.

    Bytes that are not valid UTF-8 are rendered as b"..." with every unsafe byte
    backslash-escaped.

    On Python 3, returns Unicode strings.
    """
    precondition(isinstance(s, (bytes, str)), s)
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr needs an explicit default: without it a missing attribute (or
    # sys.stdout being None) raises AttributeError and the io_encoding
    # fallback is never reached.
    encoding = encoding or getattr(sys.stdout, "encoding", None) or io_encoding

    if quote_newlines is None:
        quote_newlines = quotemarks

    def _encode(s):
        if isinstance(s, bytes):
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                # Not UTF-8: show it as an escaped bytes literal instead.
                return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _bytes_escape(m, quote_newlines), s),)

        must_double_quote = MUST_DOUBLE_QUOTE_NL if quote_newlines else MUST_DOUBLE_QUOTE
        if must_double_quote.search(s) is None:
            # Nothing forces double quotes, so try to emit the text verbatim.
            try:
                out = s.encode(encoding)
            except (UnicodeDecodeError, UnicodeEncodeError):
                # Not representable in the target encoding; fall through to
                # the escaped, double-quoted form below.
                pass
            else:
                if quotemarks or out.startswith(b'"'):
                    return b"'%s'" % (out,)
                return out

        escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
        return b'"%s"' % (escaped.encode(encoding, 'backslashreplace'),)

    return _encode(s).decode(encoding)
---|
272 | |
---|
273 | |
---|
def quote_path(path, quotemarks=True):
    """
    Quote a path given as an iterable of segments for display.

    Each segment is converted with ``to_bytes``, the segments are joined with
    ``/``, and the result is passed to ``quote_output`` with newlines quoted.
    """
    joined = b"/".join(to_bytes(segment) for segment in path)
    return quote_output(joined, quotemarks=quotemarks, quote_newlines=True)
---|
276 | |
---|
def quote_local_unicode_path(path, quotemarks=True):
    """
    Quote a local path, given as text, for display.

    On Windows a leading ``\\\\?\\`` prefix is stripped first, and a
    remaining ``UNC\\`` prefix is turned back into a leading ``\\\\``. The
    result is passed to ``quote_output`` with newlines quoted.
    """
    precondition(isinstance(path, str), path)

    on_windows = sys.platform == "win32"
    if on_windows and path.startswith(u"\\\\?\\"):
        path = path[4:]
        if path.startswith(u"UNC\\"):
            path = u"\\\\" + path[4:]

    return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
---|
286 | |
---|
def quote_filepath(path, quotemarks=True):
    """
    Quote a ``FilePath`` for display: convert it to text with
    ``unicode_from_filepath`` and quote that with ``quote_local_unicode_path``.
    """
    as_text = unicode_from_filepath(path)
    return quote_local_unicode_path(as_text, quotemarks=quotemarks)
---|
289 | |
---|
def extend_filepath(fp, segments):
    """
    Return ``fp`` extended by each of ``segments`` in turn, using ``child``.

    With no segments, ``fp`` itself is returned.
    """
    # We cannot use FilePath.preauthChild, because
    # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
    # * it may return a FilePath in the wrong mode.
    extended = fp
    for name in segments:
        extended = extended.child(name)
    return extended
---|
299 | |
---|
def to_filepath(path):
    """
    Build a ``FilePath`` from a text path.

    On Windows, trailing backslashes are stripped from a ``\\\\?\\``-prefixed
    path that is longer than the bare prefix.
    """
    precondition(isinstance(path, str), path=path)

    if sys.platform == "win32":
        _assert(isinstance(path, str), path=path)
        is_extended = path.startswith(u"\\\\?\\") and len(path) > 4
        if is_extended:
            # FilePath normally strips trailing path separators, but not in this case.
            path = path.rstrip(u"\\")

    return FilePath(path)
---|
310 | |
---|
def _decode(s):
    """
    Return ``s`` as text: bytes are decoded with ``filesystem_encoding``,
    while text is returned unchanged. Other types violate the precondition.
    """
    precondition(isinstance(s, (bytes, str)), s=s)

    if not isinstance(s, bytes):
        return s
    return s.decode(filesystem_encoding)
---|
318 | |
---|
def unicode_from_filepath(fp):
    """
    Return the path of the ``FilePath`` ``fp`` as text, decoding it with the
    filesystem encoding if it is held as bytes.
    """
    precondition(isinstance(fp, FilePath), fp=fp)
    raw_path = fp.path
    return _decode(raw_path)
---|
322 | |
---|
def unicode_segments_from(base_fp, ancestor_fp):
    """
    Return the result of ``segmentsFrom`` for ``base_fp`` relative to
    ``ancestor_fp``, computed on the text-mode versions of both paths.

    Both arguments must be ``FilePath`` instances.
    """
    precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
    precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)

    text_base = base_fp.asTextMode()
    text_ancestor = ancestor_fp.asTextMode()
    return text_base.segmentsFrom(text_ancestor)
---|
328 | |
---|
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively.

    This always returns True.
    """
    return True
---|
334 | |
---|
# NOTE(review): the docstring below contains a '%s' placeholder, so it is
# presumably used as a message template by callers -- confirm before
# rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
    pass
---|
341 | |
---|
def listdir_unicode(path):
    """
    Wrapper around ``os.listdir()`` that only accepts a text path.

    ``path`` must be text; the entries are returned exactly as
    ``os.listdir`` reports them.
    """
    precondition(isinstance(path, str), path)
    entries = os.listdir(path)
    return entries
---|
349 | |
---|
def listdir_filepath(fp):
    """
    List the directory named by the ``FilePath`` ``fp``: convert it to text
    with ``unicode_from_filepath`` and delegate to ``listdir_unicode``.
    """
    directory = unicode_from_filepath(fp)
    return listdir_unicode(directory)
---|
352 | |
---|
353 | |
---|
354 | # 'x' at the end of a variable name indicates that it holds a Unicode string that may not |
---|
355 | # be NFC-normalized. |
---|
def normalize(namex):
    """Return ``namex`` converted to Unicode Normalization Form C (NFC)."""
    nfc_name = unicodedata.normalize('NFC', namex)
    return nfc_name
---|