[python3/f13/master] - Backport two fixes from the f14 branch: - Fix for lone surrogates, utf8 and certain encode error handlers.
Toshio くらとみ
toshio at fedoraproject.org
Sun Aug 22 23:47:40 UTC 2010
commit 211f42f6b6db898f6595e477212eaf27181d2f30
Author: Toshio Kuratomi <toshio at fedoraproject.org>
Date: Fri Aug 20 15:07:00 2010 -0400
- Backport two fixes from the f14 branch:
- Fix for lone surrogates, utf8 and certain encode error handlers.
- Fix for segfault in pyexpat
...382-lone-surrogate-and-utf8-error-handler.patch | 175 ++++++++++++++++++++
python3.spec | 22 +++-
2 files changed, 196 insertions(+), 1 deletions(-)
---
diff --git a/python3-r80382-lone-surrogate-and-utf8-error-handler.patch b/python3-r80382-lone-surrogate-and-utf8-error-handler.patch
new file mode 100644
index 0000000..b4b59f2
--- /dev/null
+++ b/python3-r80382-lone-surrogate-and-utf8-error-handler.patch
@@ -0,0 +1,175 @@
+Index: Objects/unicodeobject.c
+===================================================================
+--- Objects/unicodeobject.c (revision 80382)
++++ Objects/unicodeobject.c (revision 80383)
+@@ -159,6 +159,12 @@
+ const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
+ Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
+
++static void raise_encode_exception(PyObject **exceptionObject,
++ const char *encoding,
++ const Py_UNICODE *unicode, Py_ssize_t size,
++ Py_ssize_t startpos, Py_ssize_t endpos,
++ const char *reason);
++
+ /* Same for linebreaks */
+ static unsigned char ascii_linebreak[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+@@ -2461,61 +2467,88 @@
+ /* Encode Latin-1 */
+ *p++ = (char)(0xc0 | (ch >> 6));
+ *p++ = (char)(0x80 | (ch & 0x3f));
+- }
+- else {
+- /* Encode UCS2 Unicode ordinals */
+- if (ch < 0x10000) {
++ } else if (0xD800 <= ch && ch <= 0xDFFF) {
+ #ifndef Py_UNICODE_WIDE
+- /* Special case: check for high surrogate */
+- if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
+- Py_UCS4 ch2 = s[i];
+- /* Check for low surrogate and combine the two to
+- form a UCS4 value */
+- if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+- ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
+- i++;
+- goto encodeUCS4;
+- }
+- /* Fall through: handles isolated high surrogates */
+- }
++ /* Special case: check for high and low surrogate */
++ if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
++ Py_UCS4 ch2 = s[i];
++ /* Combine the two surrogates to form a UCS4 value */
++ ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
++ i++;
++
++ /* Encode UCS4 Unicode ordinals */
++ *p++ = (char)(0xf0 | (ch >> 18));
++ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
++ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
++ *p++ = (char)(0x80 | (ch & 0x3f));
++
+ #endif
+- if (ch >= 0xd800 && ch <= 0xdfff) {
+- Py_ssize_t newpos;
+- PyObject *rep;
+- char *prep;
+- int k;
+- rep = unicode_encode_call_errorhandler
+- (errors, &errorHandler, "utf-8", "surrogates not allowed",
+- s, size, &exc, i-1, i, &newpos);
+- if (!rep)
++ } else {
++ Py_ssize_t newpos;
++ PyObject *rep;
++ Py_ssize_t repsize, k;
++ rep = unicode_encode_call_errorhandler
++ (errors, &errorHandler, "utf-8", "surrogates not allowed",
++ s, size, &exc, i-1, i, &newpos);
++ if (!rep)
++ goto error;
++
++ if (PyBytes_Check(rep))
++ repsize = PyBytes_GET_SIZE(rep);
++ else
++ repsize = PyUnicode_GET_SIZE(rep);
++
++ if (repsize > 4) {
++ Py_ssize_t offset;
++
++ if (result == NULL)
++ offset = p - stackbuf;
++ else
++ offset = p - PyBytes_AS_STRING(result);
++
++ if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
++ /* integer overflow */
++ PyErr_NoMemory();
+ goto error;
+- /* Implementation limitations: only support error handler that return
+- bytes, and only support up to four replacement bytes. */
+- if (!PyBytes_Check(rep)) {
+- PyErr_SetString(PyExc_TypeError, "error handler should have returned bytes");
+- Py_DECREF(rep);
+- goto error;
+ }
+- if (PyBytes_Size(rep) > 4) {
+- PyErr_SetString(PyExc_TypeError, "error handler returned too many bytes");
+- Py_DECREF(rep);
+- goto error;
++ nallocated += repsize - 4;
++ if (result != NULL) {
++ if (_PyBytes_Resize(&result, nallocated) < 0)
++ goto error;
++ } else {
++ result = PyBytes_FromStringAndSize(NULL, nallocated);
++ if (result == NULL)
++ goto error;
++ Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
+ }
+- prep = PyBytes_AsString(rep);
+- for(k = PyBytes_Size(rep); k > 0; k--)
++ p = PyBytes_AS_STRING(result) + offset;
++ }
++
++ if (PyBytes_Check(rep)) {
++ char *prep = PyBytes_AS_STRING(rep);
++ for(k = repsize; k > 0; k--)
+ *p++ = *prep++;
+- Py_DECREF(rep);
+- continue;
+-
++ } else /* rep is unicode */ {
++ Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
++ Py_UNICODE c;
++
++ for(k=0; k<repsize; k++) {
++ c = prep[k];
++ if (0x80 <= c) {
++ raise_encode_exception(&exc, "utf-8", s, size,
++ i-1, i, "surrogates not allowed");
++ goto error;
++ }
++ *p++ = (char)prep[k];
++ }
+ }
+- *p++ = (char)(0xe0 | (ch >> 12));
+- *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+- *p++ = (char)(0x80 | (ch & 0x3f));
+- continue;
++ Py_DECREF(rep);
+ }
+-#ifndef Py_UNICODE_WIDE
+- encodeUCS4:
+-#endif
++ } else if (ch < 0x10000) {
++ *p++ = (char)(0xe0 | (ch >> 12));
++ *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
++ *p++ = (char)(0x80 | (ch & 0x3f));
++ } else /* ch >= 0x10000 */ {
+ /* Encode UCS4 Unicode ordinals */
+ *p++ = (char)(0xf0 | (ch >> 18));
+ *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+Index: Lib/test/test_codecs.py
+===================================================================
+--- Lib/test/test_codecs.py (revision 80382)
++++ Lib/test/test_codecs.py (revision 80383)
+@@ -571,6 +571,16 @@
+ def test_lone_surrogates(self):
+ self.assertRaises(UnicodeEncodeError, "\ud800".encode, "utf-8")
+ self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "utf-8")
++ self.assertEqual("[\uDC80]".encode("utf-8", "backslashreplace"),
++ b'[\\udc80]')
++ self.assertEqual("[\uDC80]".encode("utf-8", "xmlcharrefreplace"),
+ b'[&#56448;]')
++ self.assertEqual("[\uDC80]".encode("utf-8", "surrogateescape"),
++ b'[\x80]')
++ self.assertEqual("[\uDC80]".encode("utf-8", "ignore"),
++ b'[]')
++ self.assertEqual("[\uDC80]".encode("utf-8", "replace"),
++ b'[?]')
+
+ def test_surrogatepass_handler(self):
+ self.assertEquals("abc\ud800def".encode("utf-8", "surrogatepass"),
+
diff --git a/python3.spec b/python3.spec
index 8bfe837..7120945 100644
--- a/python3.spec
+++ b/python3.spec
@@ -39,7 +39,7 @@
Summary: Version 3 of the Python programming language aka Python 3000
Name: python3
Version: %{pybasever}.2
-Release: 6%{?dist}
+Release: 7%{?dist}
License: Python
Group: Development/Languages
Source: http://python.org/ftp/python/%{version}/Python-%{version}.tar.bz2
@@ -144,6 +144,16 @@ Patch108: python-3.1.2-CVE-2010-2089.patch
# the old layout before the whitespace cleanup of release31-maint in r81033):
Patch109: python-3.1.2-CVE-2008-5983.patch
+# Fix an incompatibility between pyexpat and the system expat-2.0.1 that led to
+# a segfault running test_pyexpat.py (rhbz:610312)
+# Sent upstream as http://bugs.python.org/issue9054
+Patch110: python-3.1.2-fix-expat-issue9054.patch
+
+# Fix encoding to utf8 when lone surrogates are present and error handler is
+# set to ignore, replace, or others that return a unicode str.
+# http://bugs.python.org/issue8092
+Patch111: python3-r80382-lone-surrogate-and-utf8-error-handler.patch
+
BuildRoot: %{_tmppath}/%{name}-%{version}-root
BuildRequires: readline-devel, openssl-devel, gmp-devel
BuildRequires: ncurses-devel, gdbm-devel, zlib-devel, expat-devel
@@ -266,6 +276,10 @@ rm -r Modules/zlib || exit 1
%patch108 -p1 -b .CVE-2010-2089
%patch109 -p1 -b .CVE-2008-5983
+%patch110 -p0 -b .fix-expat-issue9054
+
+%patch111 -p0 -b .surrogate-utf8
+
# Currently (2010-01-15), http://docs.python.org/library is for 2.6, and there
# are many differences between 2.6 and the Python 3 library.
#
@@ -717,6 +731,12 @@ rm -fr %{buildroot}
%changelog
+* Sun Aug 22 2010 Toshio Kuratomi <toshio at fedoraproject.org> - 3.1.2-7
+- Backport from F14:
+ - Fix for lone surrogates, utf8 and certain encode error handlers.
+ - Fix an incompatibility between pyexpat and the system expat-2.0.1 that led to
+ a segfault running test_pyexpat.py (patch 110; upstream issue 9054; rhbz#610312)
+
* Fri Jun 4 2010 David Malcolm <dmalcolm at redhat.com> - 3.1.2-6
- ensure that the compiler is invoked with "-fwrapv" (rhbz#594819)
- reformat whitespace in audioop.c (patch 106)
More information about the scm-commits
mailing list