source: trunk/src/allmydata/util/encodingutil.py

Last change on this file was 1504bec, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-03-11T20:57:36Z

drop dead code

  • Property mode set to 100644
File size: 11.4 KB
Line 
1"""
2Functions used to convert inputs from whatever encoding used in the system to
3unicode and back.
4
5Ported to Python 3.
6
7Once Python 2 support is dropped, most of this module will be obsolete, since
8Unicode is the default everywhere in Python 3.
9"""
10
11from six import ensure_str
12
13import sys, os, re
14import unicodedata
15import warnings
16
17from allmydata.util.assertutil import precondition, _assert
18from twisted.python import usage
19from twisted.python.filepath import FilePath
20from allmydata.util import log
21from allmydata.util.fileutil import abspath_expanduser_unicode
22
# The class of the ``None`` singleton, exposed under a module-level name.
NoneType = type(None)
24
25
def canonical_encoding(encoding):
    """
    Return a normalized, lower-case name for ``encoding``.

    ``None`` is treated as UTF-8 (and a warning is logged).  The alias
    ``cp65001`` maps to ``utf-8``, and the aliases ``us-ascii``, ``646`` and
    ``ansi_x3.4-1968`` map to ``ascii``.  Any other name is returned
    lower-cased but otherwise untouched.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        return 'utf-8'

    name = encoding.lower()
    if name == "cp65001":
        return 'utf-8'
    if name in ("us-ascii", "646", "ansi_x3.4-1968"):
        return 'ascii'
    return name
37
def check_encoding(encoding):
    """
    Fail early if ``encoding`` cannot be used to encode text.

    Sometimes Python reports an encoding name that it does not actually
    support for conversion.  A short test string is encoded with the given
    name; if that raises ``LookupError`` or ``AttributeError`` an
    ``AssertionError`` naming the encoding is raised instead.
    """
    try:
        u"test".encode(encoding)
    except (LookupError, AttributeError):
        message = "The character encoding '%s' is not supported for conversion." % (encoding,)
        raise AssertionError(message)
47
# On Windows we install UTF-8 stream wrappers for sys.stdout and
# sys.stderr, and re-encode the arguments as UTF-8 (see scripts/runner.py).
#
# On POSIX, we are moving towards using UTF-8 everywhere and ignore the locale.
io_encoding = "utf-8"

# Canonical name of the filesystem encoding.  It starts out as None and is
# assigned by _reload(), which is called once at import time.
filesystem_encoding = None
55
def _reload():
    """
    Recompute the module-level ``filesystem_encoding``.

    The name reported by ``sys.getfilesystemencoding()`` is canonicalized and
    stored in the global, and then checked with ``check_encoding``, which
    raises ``AssertionError`` if the encoding is not usable for conversion.
    """
    global filesystem_encoding
    reported = sys.getfilesystemencoding()
    filesystem_encoding = canonical_encoding(reported)
    check_encoding(filesystem_encoding)

# Initialize filesystem_encoding when the module is imported.
_reload()
62
63
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames.

    This is the current value of the module-level ``filesystem_encoding``.
    """
    return filesystem_encoding
69
def get_io_encoding():
    """
    Return the encoding expected when writing to stdout or stderr, and for
    the arguments in sys.argv.

    This is the current value of the module-level ``io_encoding``.
    """
    return io_encoding
75
def argv_to_unicode(s):
    """
    Return the argv element ``s`` as a unicode string.

    Text is returned as-is; bytes are decoded using ``io_encoding``.  If the
    bytes cannot be decoded, a UsageError is raised.

    This is the inverse of ``unicode_to_argv``.
    """
    if not isinstance(s, str):
        precondition(isinstance(s, bytes), s)
        try:
            return str(s, io_encoding)
        except UnicodeDecodeError:
            raise usage.UsageError("Argument %s cannot be decoded as %s." %
                                   (quote_output(s), io_encoding))
    return s
92
def argv_to_abspath(s, **kwargs):
    """
    Convenience function to decode an argv element to an absolute path, with ~ expanded.

    ``s`` may be text or bytes; it is decoded with ``argv_to_unicode``.  Any
    keyword arguments are passed through to ``abspath_expanduser_unicode``.

    If decoding fails, or if the decoded argument starts with '-' (which
    looks like an option rather than a path), raise a UsageError.
    """
    decoded = argv_to_unicode(s)
    if decoded.startswith(u'-'):
        # Build the "./-name" suggestion from the decoded text.  os.path.join
        # refuses to mix the str '.' with a bytes argument, which would turn
        # this usage error into a TypeError for bytes input.
        suggestion = os.path.join('.', decoded)
        raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
                               % (quote_output(s), quote_output(suggestion)))
    return abspath_expanduser_unicode(decoded, **kwargs)
103
104
def unicode_to_argv(s):
    """
    Return ``s`` in the form used for an element of an argv list.

    ``s`` must be a unicode string.  On Windows it is returned unmodified;
    on other platforms it is passed through ``ensure_str``.

    Every call emits a ``DeprecationWarning``.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    on_windows = sys.platform == "win32"
    return s if on_windows else ensure_str(s)
117
118
# The type expected for command-line arguments, captured as a one-element
# tuple of types.  unicode_to_argv above takes and returns str.
argv_type = (str,)
"""
The expected type for args to a subprocess
"""
125
126
def unicode_to_url(s):
    """
    Encode a unicode object that is used in a URL to bytes.

    The conversion is delegated to ``to_bytes``.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the stricter, unicode-only implementation below is disabled.
    #precondition(isinstance(s, unicode), s)
    #return s.encode('utf-8')
    encoded = to_bytes(s)
    return encoded
137
def to_bytes(s):
    """Encode a unicode string as UTF-8 bytes.

    ``None`` and values that are already bytes are returned unchanged.
    """
    if s is not None and not isinstance(s, bytes):
        return s.encode('utf-8')
    return s
146
def from_utf8_or_none(s):
    """
    Decode UTF-8 bytes to a unicode string, passing ``None`` through.

    ``s`` must be either bytes or ``None``.
    """
    precondition(isinstance(s, bytes) or s is None, s)
    return None if s is None else s.decode('utf-8')
152
# Byte strings consisting only of newline, carriage return and the ASCII
# bytes 0x20-0x7E.
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$',          re.DOTALL)
# As above, but bytes 0x80-0xFF are allowed as well.
PRINTABLE_8BIT  = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)

def is_printable_ascii(s):
    """
    Return True if the byte string ``s`` matches ``PRINTABLE_ASCII``, i.e. it
    contains nothing but newlines, carriage returns and bytes 0x20-0x7E.
    """
    # The pattern is anchored with '^', so match() finds exactly what
    # search() would.
    return bool(PRINTABLE_ASCII.match(s))
158
def unicode_to_output(s):
    """
    Prepare a unicode object for representation on stdout or stderr.

    The unicode string is returned unchanged: on Python 3, stdout and stderr
    expect Unicode by default and take care of the encoding themselves.

    ``s`` must be a unicode string.  Every call emits a
    ``DeprecationWarning``.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s
169
def _unicode_escape(m, quote_newlines):
    """
    Takes a re match on a unicode string, the result is the escaped text of
    group(0).

    The characters ", $, ` and backslash get a backslash prefix.  A newline
    is returned as-is unless ``quote_newlines`` is true.  Everything else
    becomes a \\xNN, \\uNNNN or \\UNNNNNNNN escape, depending on the size of
    the code point.
    """
    matched = m.group(0)
    if matched in (u'"', u'$', u'`', u'\\'):
        return u'\\' + matched
    if matched == u'\n' and not quote_newlines:
        return matched

    if len(matched) == 2:
        # A surrogate pair: combine the two halves into a single code point.
        high, low = matched
        codepoint = (ord(high) - 0xD800) * 0x400 + ord(low) - 0xDC00 + 0x10000
    else:
        codepoint = ord(matched)

    # Pick the shortest escape form that can hold the code point.
    if codepoint > 0xFFFF:
        template = u'\\U%08x'
    elif codepoint > 0xFF:
        template = u'\\u%04x'
    else:
        template = u'\\x%02x'
    return template % (codepoint,)
186
def _bytes_escape(m, quote_newlines):
    """
    Takes a re match on bytes, the result is escaped bytes of group(0).

    The bytes ", $, ` and backslash get a backslash prefix.  A newline is
    returned as-is unless ``quote_newlines`` is true.  Any other byte becomes
    a \\xNN escape.
    """
    matched = m.group(0)
    if matched in (b'"', b'$', b'`', b'\\'):
        return b'\\' + matched
    if matched == b'\n' and not quote_newlines:
        return matched
    return b'\\x%02x' % (ord(matched),)
198
# Each of these two patterns matches a single character that is NOT in the
# set U+0020-U+0026, U+0028-U+007E (i.e. 0x20-0x7E without the single quote,
# U+0027), U+00A0-U+D7FF, U+E000-U+FDCF, U+FDF0-U+FFFC.  quote_output only
# considers single-quoting (or not quoting) a string when the pattern it
# selects finds no match.  The _NL variant is selected when newlines are to
# be quoted; the other variant additionally lets a newline through.
MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE    = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
#
# Matches either a high surrogate followed by a low surrogate, or any single
# character outside the listed set.  The set leaves out ", $, backslash and `
# (0x22, 0x24, 0x5C, 0x60) as well as newline, so all of those are matched
# and handed to the escaping callback.
ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Bytes counterpart of the ASCII part of the set above: matches any byte that
# is not in it, which includes every byte from 0x80 upwards.
ESCAPABLE_8BIT    = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
208
def quote_output_u(*args, **kwargs):
    """
    Like ``quote_output`` but always return ``unicode``.

    All arguments are passed straight through to ``quote_output``.  If its
    result is bytes, the result is decoded using the ``encoding`` keyword
    argument if one was given, otherwise stdout's encoding, otherwise
    ``io_encoding``.
    """
    result = quote_output(*args, **kwargs)
    if isinstance(result, str):
        return result
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # sys.stdout may be None, or may have been replaced by an object without
    # an ``encoding`` attribute, so look it up with a default rather than
    # letting an AttributeError escape.
    encoding = (kwargs.get("encoding", None) or
                getattr(sys.stdout, "encoding", None) or
                io_encoding)
    return result.decode(encoding)
221
222
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
    control bytes in the output. (Newlines are counted as control bytes iff
    quote_newlines is True.)

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double quotes,
    Python-compatible backslash escaping is used.

    A bytestring that is not valid UTF-8 is rendered as b"..." with
    backslash escapes.

    If not explicitly given, quote_newlines is True when quotemarks is True.

    If 'encoding' is not given, stdout's encoding is used when it has one,
    and io_encoding otherwise.

    On Python 3, returns Unicode strings.
    """
    precondition(isinstance(s, (bytes, str)), s)
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # sys.stdout may be None, or may have been replaced by an object without
    # an ``encoding`` attribute; the default keeps the fallback to
    # io_encoding working in those cases instead of raising AttributeError.
    encoding = encoding or getattr(sys.stdout, "encoding", None) or io_encoding

    if quote_newlines is None:
        quote_newlines = quotemarks

    def _encode(text):
        # Returns the quoted form as bytes in ``encoding``.
        if isinstance(text, bytes):
            try:
                text = text.decode("utf-8")
            except UnicodeDecodeError:
                # Not UTF-8: show it as an escaped bytes literal.
                return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _bytes_escape(m, quote_newlines), text),)

        must_double_quote = MUST_DOUBLE_QUOTE_NL if quote_newlines else MUST_DOUBLE_QUOTE
        if must_double_quote.search(text) is None:
            try:
                out = text.encode(encoding)
            except (UnicodeDecodeError, UnicodeEncodeError):
                # Not representable in the output encoding: deliberately
                # fall through to the escaped, double-quoted form below.
                pass
            else:
                # An unquoted result that begins with a double quote would
                # look like the double-quoted form, so single-quote it.
                if quotemarks or out.startswith(b'"'):
                    return b"'%s'" % (out,)
                return out

        escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), text)
        return b'"%s"' % (escaped.encode(encoding, 'backslashreplace'),)

    result = _encode(s)
    return result.decode(encoding)
272
273
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is converted with ``to_bytes``, the segments are joined with
    "/", and the result is passed to ``quote_output`` with newline quoting
    enabled.
    """
    joined = b"/".join(to_bytes(segment) for segment in path)
    return quote_output(joined, quotemarks=quotemarks, quote_newlines=True)
276
def quote_local_unicode_path(path, quotemarks=True):
    """
    Quote a local path, given as a unicode string, for output.

    On Windows a leading ``\\\\?\\`` prefix is removed first, and a
    following ``UNC\\`` is turned back into a leading double backslash.  The
    path is then passed to ``quote_output`` with newline quoting enabled.
    """
    precondition(isinstance(path, str), path)

    long_prefix = u"\\\\?\\"
    if sys.platform == "win32" and path.startswith(long_prefix):
        path = path[len(long_prefix):]
        unc_marker = u"UNC\\"
        if path.startswith(unc_marker):
            path = u"\\\\" + path[len(unc_marker):]

    return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
286
def quote_filepath(path, quotemarks=True):
    """
    Quote a ``FilePath`` for output: convert it to a unicode path with
    ``unicode_from_filepath`` and quote that with
    ``quote_local_unicode_path``.
    """
    unicode_path = unicode_from_filepath(path)
    return quote_local_unicode_path(unicode_path, quotemarks=quotemarks)
289
def extend_filepath(fp, segments):
    """
    Return the result of descending from ``fp`` through each of ``segments``
    in turn, using ``child()`` for every step.  With no segments, ``fp``
    itself is returned.
    """
    # We cannot use FilePath.preauthChild, because
    # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
    # * it may return a FilePath in the wrong mode.

    current = fp
    for name in segments:
        current = current.child(name)
    return current
299
def to_filepath(path):
    """
    Return a ``FilePath`` for the unicode string ``path``.

    On Windows, trailing backslashes are stripped from a path that starts
    with ``\\\\?\\`` and is longer than that prefix.
    """
    precondition(isinstance(path, str), path=path)

    if sys.platform == "win32":
        _assert(isinstance(path, str), path=path)
        has_long_prefix = path.startswith(u"\\\\?\\") and len(path) > 4
        if has_long_prefix:
            # FilePath normally strips trailing path separators, but not in this case.
            path = path.rstrip(u"\\")

    return FilePath(path)
310
def _decode(s):
    """
    Return ``s`` as a unicode string.

    Bytes are decoded using ``filesystem_encoding``; a unicode string is
    returned unchanged.
    """
    precondition(isinstance(s, (bytes, str)), s=s)

    if not isinstance(s, bytes):
        return s
    return s.decode(filesystem_encoding)
318
def unicode_from_filepath(fp):
    """
    Return the path of the ``FilePath`` ``fp`` as a unicode string.
    """
    precondition(isinstance(fp, FilePath), fp=fp)
    raw_path = fp.path
    return _decode(raw_path)
322
def unicode_segments_from(base_fp, ancestor_fp):
    """
    Return ``segmentsFrom()`` of ``base_fp`` relative to ``ancestor_fp``,
    after converting both ``FilePath`` objects with ``asTextMode()``.
    """
    precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
    precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)

    text_base = base_fp.asTextMode()
    return text_base.segmentsFrom(ancestor_fp.asTextMode())
328
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively.

    This always returns True.
    """
    return True
334
# NOTE(review): the docstring below contains a %s placeholder; presumably it
# is used as a message template filled in with the encoding name -- confirm
# against callers before rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
341
def listdir_unicode(path):
    """
    Wrapper around ``os.listdir()``: return its result for the directory
    named by the unicode string ``path``.
    """
    precondition(isinstance(path, str), path)
    entries = os.listdir(path)
    return entries
349
def listdir_filepath(fp):
    """
    Return ``listdir_unicode`` of the unicode path of the ``FilePath`` ``fp``.
    """
    directory = unicode_from_filepath(fp)
    return listdir_unicode(directory)
352
353
# A trailing 'x' in a variable name marks a Unicode string that is not
# guaranteed to be NFC-normalized.
def normalize(namex):
    """Return ``namex`` converted to Unicode normalization form NFC."""
    nfc_name = unicodedata.normalize('NFC', namex)
    return nfc_name
Note: See TracBrowser for help on using the repository browser.