Patch #1455898: Incremental mode for "mbcs" codec.

d825143b · Martin v. Löwis · 6ce9fe88 · d825143b · d825143b · d825143b
Commit d825143b authored Jun 14, 2006 by Martin v. Löwis
6 changed files
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@@ -1431,6 +1431,18 @@ machine running the codec.
  raised by the codec.
 \end{cfuncdesc}

+\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
+                                               Py_ssize_t size,
+                                               const char *errors,
+                                               Py_ssize_t *consumed}
+  If \var{consumed} is \NULL{}, behave like
+  \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
+  \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead
+  byte, and the number of bytes that have been decoded will be stored in
+  \var{consumed}.
+  \versionadded{2.5}
+\end{cfuncdesc}
+
 \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
                                               Py_ssize_t size,
                                               const char *errors}

--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
    const char *errors          /* error handling */
    );

+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
+    const char *string,         /* MBCS encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
 PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
    PyObject *unicode           /* Unicode object */
    );

--- a/Lib/encodings/mbcs.py
+++ b/Lib/encodings/mbcs.py
@@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.mbcs_encode(input,self.errors)[0]

-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.mbcs_decode(input,self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.mbcs_decode(input,self.errors,final)
+
 class StreamWriter(Codec,codecs.StreamWriter):
    pass


--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -156,6 +156,9 @@ Extension Modules
 Library
 -------

+- Patch #1455898: The MBCS codec now supports incremental mode for
+  double-byte encodings.
+
 - ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
  guarantee that adjacent triples in the return list always describe
  non-adjacent blocks.  Previously, a pair of matching blocks could end

--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
 	    PyObject *args)
 {
    const char *data;
-    Py_ssize_t size;
+    Py_ssize_t size, consumed;
    const char *errors = NULL;
+    int final = 1;
+    PyObject *decoded;

-    if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
-			  &data, &size, &errors))
+    if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
+			  &data, &size, &errors, &final))
 	return NULL;

-    return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
-		       size);
+    decoded = PyUnicode_DecodeMBCSStateful(
+	data, size, errors, final ? NULL : &consumed);
+    if (!decoded)
+	return NULL;
+    return codec_tuple(decoded, final ? size : consumed);
 }

 #endif /* MS_WINDOWS */

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)

 /* --- MBCS codecs for Windows -------------------------------------------- */

-PyObject *PyUnicode_DecodeMBCS(const char *s,
-				Py_ssize_t size,
-				const char *errors)
+#if SIZEOF_INT < SIZEOF_SSIZE_T
+#define NEED_RETRY
+#endif
+
+/* XXX This code is limited to "true" double-byte encodings, as
+   a) it assumes an incomplete character consists of a single byte, and
+   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
+      encodings, see IsDBCSLeadByteEx documentation. */
+
+static int is_dbcs_lead_byte(const char *s, int offset)
+{
+    const char *curr = s + offset;
+
+    if (IsDBCSLeadByte(*curr)) {
+	const char *prev = CharPrev(s, curr);
+	return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
+    }
+    return 0;
+}
+
+/*
+ * Decode an MBCS string into a unicode object. If 'final' is set, a trailing
+ * lead byte is converted too. Returns the consumed size, or -1 on failure.
+ */
+static int decode_mbcs(PyUnicodeObject **v,
+			const char *s, /* MBCS string */
+			int size, /* sizeof MBCS string */
+			int final)
 {
-    PyUnicodeObject *v;
    Py_UNICODE *p;
-    DWORD usize;
+    Py_ssize_t n = 0;
+    int usize = 0;
+
+    assert(size >= 0);
+
+    /* Skip trailing lead-byte unless 'final' is set */
+    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
+	--size;

    /* First get the size of the result */
-    assert(size < INT_MAX);
-    usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
-    if (size > 0 && usize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
+	if (usize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }

-    v = _PyUnicode_New(usize);
-    if (v == NULL)
-        return NULL;
-    if (usize == 0)
-	return (PyObject *)v;
-    p = PyUnicode_AS_UNICODE(v);
-    if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
-        Py_DECREF(v);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (*v == NULL) {
+	/* Create unicode object */
+	*v = _PyUnicode_New(usize);
+	if (*v == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend unicode object */
+	n = PyUnicode_GET_SIZE(*v);
+	if (_PyUnicode_Resize(v, n + usize) < 0)
+	    return -1;
+    }
+
+    /* Do the conversion */
+    if (size > 0) {
+	p = PyUnicode_AS_UNICODE(*v) + n;
+	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }
+
+    return size;
+}
+
+PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
+					Py_ssize_t size,
+					const char *errors,
+					Py_ssize_t *consumed)
+{
+    PyUnicodeObject *v = NULL;
+    int done;
+
+    if (consumed)
+	*consumed = 0;
+
+#ifdef NEED_RETRY
+  retry:
+    if (size > INT_MAX)
+	done = decode_mbcs(&v, s, INT_MAX, 0);
+    else
+#endif
+	done = decode_mbcs(&v, s, (int)size, !consumed);
+
+    if (done < 0) {
+        Py_XDECREF(v);
+	return NULL;
+    }
+
+    if (consumed)
+	*consumed += done;
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	s += done;
+	size -= done;
+	goto retry;
    }
+#endif

    return (PyObject *)v;
 }

-PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+PyObject *PyUnicode_DecodeMBCS(const char *s,
 				Py_ssize_t size,
 				const char *errors)
 {
-    PyObject *repr;
-    char *s;
-    DWORD mbcssize;
+    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
+}

-    /* If there are no characters, bail now! */
-    if (size==0)
-	    return PyString_FromString("");
+/*
+ * Convert a unicode buffer into an MBCS string object.
+ * Returns 0 on success, -1 on failure.
+ */
+static int encode_mbcs(PyObject **repr,
+			const Py_UNICODE *p, /* unicode */
+			int size) /* size of unicode */
+{
+    int mbcssize = 0;
+    Py_ssize_t n = 0;
+
+    assert(size >= 0);

    /* First get the size of the result */
-    assert(size<INT_MAX);
-    mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
-    if (mbcssize==0)
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
+	if (mbcssize == 0) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
+    }

-    repr = PyString_FromStringAndSize(NULL, mbcssize);
-    if (repr == NULL)
-        return NULL;
-    if (mbcssize == 0)
-        return repr;
+    if (*repr == NULL) {
+	/* Create string object */
+	*repr = PyString_FromStringAndSize(NULL, mbcssize);
+	if (*repr == NULL)
+	    return -1;
+    }
+    else {
+	/* Extend string object */
+	n = PyString_Size(*repr);
+	if (_PyString_Resize(repr, n + mbcssize) < 0)
+	    return -1;
+    }

    /* Do the conversion */
-    s = PyString_AS_STRING(repr);
-    assert(size < INT_MAX);
-    if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
-        Py_DECREF(repr);
-        return PyErr_SetFromWindowsErrWithFilename(0, NULL);
+    if (size > 0) {
+	char *s = PyString_AS_STRING(*repr) + n;
+	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
+	    PyErr_SetFromWindowsErrWithFilename(0, NULL);
+	    return -1;
+	}
    }
+
+    return 0;
+}
+
+PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
+				Py_ssize_t size,
+				const char *errors)
+{
+    PyObject *repr = NULL;
+    int ret;
+
+#ifdef NEED_RETRY
+ retry:
+    if (size > INT_MAX)
+	ret = encode_mbcs(&repr, p, INT_MAX);
+    else
+#endif
+	ret = encode_mbcs(&repr, p, (int)size);
+
+    if (ret < 0) {
+	Py_XDECREF(repr);
+	return NULL;
+    }
+
+#ifdef NEED_RETRY
+    if (size > INT_MAX) {
+	p += INT_MAX;
+	size -= INT_MAX;
+	goto retry;
+    }
+#endif
+
    return repr;
 }

@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
 				NULL);
 }

+#undef NEED_RETRY
+
 #endif /* MS_WINDOWS */

 /* --- Character Mapping Codec -------------------------------------------- */