1 | Fri Jul 23 08:53:14 GMT Daylight Time 2010 david-sarah@jacaranda.org |
---|
2 | * util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 |
---|
3 | |
---|
4 | New patches: |
---|
5 | |
---|
6 | [util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 |
---|
7 | david-sarah@jacaranda.org**20100723075314 |
---|
8 | Ignore-this: b82205834d17db61612dd16436b7c5a2 |
---|
9 | ] { |
---|
10 | hunk ./src/allmydata/test/test_encodingutil.py 60 |
---|
11 | |
---|
12 | from allmydata.test.common_util import ReallyEqualMixin |
---|
13 | from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ |
---|
14 | - unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \ |
---|
15 | - get_output_encoding, get_filesystem_encoding, _reload |
---|
16 | + unicode_to_output, quote_output, unicode_platform, listdir_unicode, \ |
---|
17 | + FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload |
---|
18 | from allmydata.dirnode import normalize |
---|
19 | |
---|
20 | from twisted.python import usage |
---|
21 | hunk ./src/allmydata/test/test_encodingutil.py 289 |
---|
22 | self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb') |
---|
23 | |
---|
24 | |
---|
25 | +class QuoteOutput(ReallyEqualMixin, unittest.TestCase): |
---|
26 | + def _check(self, inp, out, enc, optional_quotes): |
---|
27 | + out2 = out |
---|
28 | + if optional_quotes: |
---|
29 | + out2 = out2[1:-1] |
---|
30 | + self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out) |
---|
31 | + self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2) |
---|
32 | + if out[0:2] != 'b"': |
---|
33 | + if isinstance(inp, str): |
---|
34 | + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out) |
---|
35 | + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2) |
---|
36 | + else: |
---|
37 | + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out) |
---|
38 | + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2) |
---|
39 | + |
---|
40 | + def _test_quote_output_all(self, enc): |
---|
41 | + def check(inp, out, optional_quotes=False): |
---|
42 | + self._check(inp, out, enc, optional_quotes) |
---|
43 | + |
---|
44 | + # optional single quotes |
---|
45 | + check("foo", "'foo'", True) |
---|
46 | + check("\\", "'\\'", True) |
---|
47 | + check("$\"`", "'$\"`'", True) |
---|
48 | + |
---|
49 | + # mandatory single quotes |
---|
50 | + check("\"", "'\"'") |
---|
51 | + |
---|
52 | + # double quotes |
---|
53 | + check("'", "\"'\"") |
---|
54 | + check("\n", "\"\\x0a\"") |
---|
55 | + check("\x00", "\"\\x00\"") |
---|
56 | + |
---|
57 | + # invalid Unicode and astral planes |
---|
58 | + check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") |
---|
59 | + check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") |
---|
60 | + check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") |
---|
61 | + check(u"\uD800\uDC00", "\"\\U00010000\"") |
---|
62 | + check(u"\uD800\uDC01", "\"\\U00010001\"") |
---|
63 | + check(u"\uD801\uDC00", "\"\\U00010400\"") |
---|
64 | + check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") |
---|
65 | + check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") |
---|
66 | + check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") |
---|
67 | + |
---|
68 | + # invalid UTF-8 |
---|
69 | + check("\xFF", "b\"\\xff\"") |
---|
70 | + check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") |
---|
71 | + |
---|
72 | + def test_quote_output_ascii(self, enc='ascii'): |
---|
73 | + def check(inp, out, optional_quotes=False): |
---|
74 | + self._check(inp, out, enc, optional_quotes) |
---|
75 | + |
---|
76 | + self._test_quote_output_all(enc) |
---|
77 | + check(u"\u00D7", "\"\\xd7\"") |
---|
78 | + check(u"'\u00D7", "\"'\\xd7\"") |
---|
79 | + check(u"\"\u00D7", "\"\\\"\\xd7\"") |
---|
80 | + check(u"\u2621", "\"\\u2621\"") |
---|
81 | + check(u"'\u2621", "\"'\\u2621\"") |
---|
82 | + check(u"\"\u2621", "\"\\\"\\u2621\"") |
---|
83 | + |
---|
84 | + def test_quote_output_latin1(self, enc='latin1'): |
---|
85 | + def check(inp, out, optional_quotes=False): |
---|
86 | + self._check(inp, out.encode('latin1'), enc, optional_quotes) |
---|
87 | + |
---|
88 | + self._test_quote_output_all(enc) |
---|
89 | + check(u"\u00D7", u"'\u00D7'", True) |
---|
90 | + check(u"'\u00D7", u"\"'\u00D7\"") |
---|
91 | + check(u"\"\u00D7", u"'\"\u00D7'") |
---|
92 | + check(u"\u00D7\"", u"'\u00D7\"'", True) |
---|
93 | + check(u"\u2621", u"\"\\u2621\"") |
---|
94 | + check(u"'\u2621", u"\"'\\u2621\"") |
---|
95 | + check(u"\"\u2621", u"\"\\\"\\u2621\"") |
---|
96 | + |
---|
97 | + def test_quote_output_utf8(self, enc='utf-8'): |
---|
98 | + def check(inp, out, optional_quotes=False): |
---|
99 | + self._check(inp, out.encode('utf-8'), enc, optional_quotes) |
---|
100 | + |
---|
101 | + self._test_quote_output_all(enc) |
---|
102 | + check(u"\u2621", u"'\u2621'", True) |
---|
103 | + check(u"'\u2621", u"\"'\u2621\"") |
---|
104 | + check(u"\"\u2621", u"'\"\u2621'") |
---|
105 | + check(u"\u2621\"", u"'\u2621\"'", True) |
---|
106 | + |
---|
107 | + @patch('sys.stdout') |
---|
108 | + def test_quote_output_mock(self, mock_stdout): |
---|
109 | + mock_stdout.encoding = 'ascii' |
---|
110 | + _reload() |
---|
111 | + self.test_quote_output_ascii(None) |
---|
112 | + |
---|
113 | + mock_stdout.encoding = 'latin1' |
---|
114 | + _reload() |
---|
115 | + self.test_quote_output_latin1(None) |
---|
116 | + |
---|
117 | + mock_stdout.encoding = 'utf-8' |
---|
118 | + _reload() |
---|
119 | + self.test_quote_output_utf8(None) |
---|
120 | + |
---|
121 | + |
---|
122 | class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): |
---|
123 | uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' |
---|
124 | output = 'lumi\xc3\xa8re' |
---|
125 | hunk ./src/allmydata/util/encodingutil.py 115 |
---|
126 | return s |
---|
127 | return s.encode(argv_encoding) |
---|
128 | |
---|
129 | -PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL) |
---|
130 | -PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL) |
---|
131 | +PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) |
---|
132 | +PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) |
---|
133 | |
---|
134 | def is_printable_ascii(s): |
---|
135 | return PRINTABLE_ASCII.search(s) is not None |
---|
136 | hunk ./src/allmydata/util/encodingutil.py 140 |
---|
137 | (output_encoding, repr(s))) |
---|
138 | return out |
---|
139 | |
---|
140 | + |
---|
141 | +def _unicode_escape(m): |
---|
142 | + u = m.group(0) |
---|
143 | + if u == '"' or u == '$' or u == '`' or u == '\\': |
---|
144 | + return u'\\' + u |
---|
145 | + if len(u) == 2: |
---|
146 | + codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 |
---|
147 | + else: |
---|
148 | + codepoint = ord(u) |
---|
149 | + if codepoint > 0xFFFF: |
---|
150 | + return u'\\U%08x' % (codepoint,) |
---|
151 | + elif codepoint > 0xFF: |
---|
152 | + return u'\\u%04x' % (codepoint,) |
---|
153 | + else: |
---|
154 | + return u'\\x%02x' % (codepoint,) |
---|
155 | + |
---|
156 | +def _str_escape(m): |
---|
157 | + c = m.group(0) |
---|
158 | + if c == '"' or c == '$' or c == '`' or c == '\\': |
---|
159 | + return '\\' + c |
---|
160 | + else: |
---|
161 | + return '\\x%02x' % (ord(c),) |
---|
162 | + |
---|
163 | +MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) |
---|
164 | + |
---|
165 | +# if we must double-quote, then we have to escape ", $, ` and \, but need not escape ' |
---|
166 | +ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs |
---|
167 | + ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', |
---|
168 | + re.DOTALL) |
---|
169 | + |
---|
170 | +ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) |
---|
171 | + |
---|
172 | def quote_output(s, quotemarks=True, encoding=None): |
---|
173 | """ |
---|
174 | Encode either a Unicode string or a UTF-8-encoded bytestring for representation |
---|
175 | hunk ./src/allmydata/util/encodingutil.py 176 |
---|
176 | on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is |
---|
177 | - always surrounded by single quotes; otherwise, it is quoted only if necessary to |
---|
178 | - avoid ambiguity or control bytes in the output. |
---|
179 | + always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or |
---|
180 | + control bytes in the output. |
---|
181 | + Quoting may use either single or double quotes. Within single quotes, all |
---|
182 | + characters stand for themselves, and ' will not appear. Within double quotes, |
---|
183 | + backslash escaping is used: Python-style \xNN, \uNNNN and \UNNNNNNNN escapes, plus \", \$, \` and \\. |
---|
184 | """ |
---|
185 | precondition(isinstance(s, (str, unicode)), s) |
---|
186 | |
---|
187 | hunk ./src/allmydata/util/encodingutil.py 188 |
---|
188 | try: |
---|
189 | s = s.decode('utf-8') |
---|
190 | except UnicodeDecodeError: |
---|
191 | - return 'b' + repr(s) |
---|
192 | - |
---|
193 | - try: |
---|
194 | - out = s.encode(encoding or output_encoding) |
---|
195 | - except (UnicodeEncodeError, UnicodeDecodeError): |
---|
196 | - return repr(s) |
---|
197 | + return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),) |
---|
198 | |
---|
199 | hunk ./src/allmydata/util/encodingutil.py 190 |
---|
200 | - if PRINTABLE_8BIT.search(out) is None: |
---|
201 | - return repr(out) |
---|
202 | + if MUST_DOUBLE_QUOTE.search(s) is None: |
---|
203 | + try: |
---|
204 | + out = s.encode(encoding or output_encoding) |
---|
205 | + if quotemarks or out.startswith('"'): |
---|
206 | + return "'%s'" % (out,) |
---|
207 | + else: |
---|
208 | + return out |
---|
209 | + except (UnicodeDecodeError, UnicodeEncodeError): |
---|
210 | + pass |
---|
211 | |
---|
212 | hunk ./src/allmydata/util/encodingutil.py 200 |
---|
213 | - if quotemarks: |
---|
214 | - return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'" |
---|
215 | - else: |
---|
216 | - return out |
---|
217 | + escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s) |
---|
218 | + return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),) |
---|
219 | |
---|
220 | def quote_path(path, quotemarks=True): |
---|
221 | return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks) |
---|
222 | } |
---|
223 | |
---|
224 | Context: |
---|
225 | |
---|
226 | [docs/specifications/dirnodes.txt: 'mesh'->'grid'. |
---|
227 | david-sarah@jacaranda.org**20100723061616 |
---|
228 | Ignore-this: 887bcf921ef00afba8e05e9239035bca |
---|
229 | ] |
---|
230 | [docs: use current cap to Zooko's wiki page in example text |
---|
231 | zooko@zooko.com**20100721010543 |
---|
232 | Ignore-this: 4f36f36758f9fdbaf9eb73eac23b6652 |
---|
233 | fixes #1134 |
---|
234 | ] |
---|
235 | [docs/specifications/dirnodes.txt: bring layer terminology up-to-date with architecture.txt, and a few other updates (e.g. note that the MAC is no longer verified, and that URIs can be unknown). Also 'Tahoe'->'Tahoe-LAFS'. |
---|
236 | david-sarah@jacaranda.org**20100723054703 |
---|
237 | Ignore-this: f3b98183e7d0a0f391225b8b93ac6c37 |
---|
238 | ] |
---|
239 | [__init__.py: silence DeprecationWarning about BaseException.message globally. fixes #1129 |
---|
240 | david-sarah@jacaranda.org**20100720011939 |
---|
241 | Ignore-this: 38808986ba79cb2786b010504a22f89 |
---|
242 | ] |
---|
243 | [test_runner: test that 'tahoe --version' outputs no noise (e.g. DeprecationWarnings). |
---|
244 | david-sarah@jacaranda.org**20100720011345 |
---|
245 | Ignore-this: dd358b7b2e5d57282cbe133e8069702e |
---|
246 | ] |
---|
247 | [TAG allmydata-tahoe-1.7.1 |
---|
248 | zooko@zooko.com**20100719131352 |
---|
249 | Ignore-this: 6942056548433dc653a746703819ad8c |
---|
250 | ] |
---|
251 | Patch bundle hash: |
---|
252 | d4aa6ac35c5dba44996999385ca90717c2525a3e |
---|