Commit d825143b authored by Martin v. Löwis's avatar Martin v. Löwis

Patch #1455898: Incremental mode for "mbcs" codec.

parent 6ce9fe88
......@@ -1431,6 +1431,18 @@ machine running the codec.
raised by the codec.
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed}
If \var{consumed} is \NULL{}, behave like
\cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{},
\cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead
byte, and the number of bytes that have been decoded will be stored in
\var{consumed}.
\versionadded{2.5}
\end{cfuncdesc}
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
Py_ssize_t size,
const char *errors}
......
......@@ -938,6 +938,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
const char *errors /* error handling */
);
/* Variant of PyUnicode_DecodeMBCS() for incremental decoding.
   With consumed == NULL it behaves like PyUnicode_DecodeMBCS().
   With consumed != NULL a trailing lead byte is left undecoded and the
   number of bytes that were decoded is stored in *consumed. */
PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
const char *string, /* MBCS encoded string */
Py_ssize_t length, /* size of string, in bytes */
const char *errors, /* error handling */
Py_ssize_t *consumed /* out: bytes consumed; may be NULL */
);
PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
PyObject *unicode /* Unicode object */
);
......
......@@ -22,9 +22,10 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
# One-shot encode: calls codecs.mbcs_encode with this instance's error
# policy (self.errors) and returns only element [0] of that call's result.
# The `final` flag is accepted but not consulted.
def encode(self, input, final=False):
return codecs.mbcs_encode(input,self.errors)[0]
class IncrementalDecoder(codecs.IncrementalDecoder):
def decode(self, input, final=False):
return codecs.mbcs_decode(input,self.errors)[0]
# Incremental decoder for the "mbcs" codec, derived from
# codecs.BufferedIncrementalDecoder.
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
# Passes `final` through to codecs.mbcs_decode and returns that call's
# result as-is. The error policy used is self.errors; the `errors`
# argument itself is not read.
def _buffer_decode(self, input, errors, final):
return codecs.mbcs_decode(input,self.errors,final)
class StreamWriter(Codec,codecs.StreamWriter):
pass
......
......@@ -156,6 +156,9 @@ Extension Modules
Library
-------
- Patch #1455898: The MBCS codec now supports the incremental mode for
double-byte encodings.
- ``difflib``'s ``SequenceMatcher.get_matching_blocks()`` was changed to
guarantee that adjacent triples in the return list always describe
non-adjacent blocks. Previously, a pair of matching blocks could end
......
......@@ -479,15 +479,20 @@ mbcs_decode(PyObject *self,
PyObject *args)
{
const char *data;
Py_ssize_t size;
Py_ssize_t size, consumed;
const char *errors = NULL;
int final = 1;
PyObject *decoded;
if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
&data, &size, &errors))
if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode",
&data, &size, &errors, &final))
return NULL;
return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
size);
decoded = PyUnicode_DecodeMBCSStateful(
data, size, errors, final ? NULL : &consumed);
if (!decoded)
return NULL;
return codec_tuple(decoded, final ? size : consumed);
}
#endif /* MS_WINDOWS */
......
......@@ -2820,65 +2820,199 @@ PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
/* --- MBCS codecs for Windows -------------------------------------------- */
PyObject *PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
/* NEED_RETRY is defined when a Py_ssize_t length can exceed INT_MAX, i.e.
   when the MBCS codec functions below have to hand their input to the
   int-sized conversion helpers in INT_MAX-sized chunks.

   The comparison uses SIZEOF_SIZE_T (size_t has the same width as
   Py_ssize_t). An identifier that is not defined as a macro evaluates
   to 0 inside #if, so comparing against a size macro the configuration
   header does not provide would silently leave NEED_RETRY undefined and
   compile the chunking code out. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif
/* XXX This code is limited to "true" double-byte encodings, as
a) it assumes an incomplete character consists of a single byte, and
b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
encodings, see IsDBCSLeadByteEx documentation. */
/* Report whether the byte at s[offset] should be treated as the lead byte
 * of a double-byte character.
 *
 * Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise CharPrev()
 * locates the character preceding that position and 1 is returned when
 *   - CharPrev() handed back the position itself (nothing precedes it), or
 *   - the preceding character does not start with a lead byte, or
 *   - the preceding character is exactly two bytes long.
 * In the remaining case 0 is returned (presumably the byte is then the
 * trail byte of the preceding character).
 */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (pos - before == 2);
}
/*
 * Decode an MBCS string into a unicode object. If 'final' is set, a
 * trailing lead byte is converted too. Returns the number of bytes
 * consumed on success, -1 otherwise.
 */
static int decode_mbcs(PyUnicodeObject **v,
const char *s, /* MBCS string */
int size, /* sizeof MBCS string */
int final)
{
PyUnicodeObject *v;
Py_UNICODE *p;
DWORD usize;
Py_ssize_t n = 0;
int usize = 0;
assert(size >= 0);
/* Skip trailing lead-byte unless 'final' is set */
if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
--size;
/* First get the size of the result */
assert(size < INT_MAX);
usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0);
if (size > 0 && usize==0)
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
if (size > 0) {
usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
if (usize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
}
v = _PyUnicode_New(usize);
if (v == NULL)
return NULL;
if (usize == 0)
return (PyObject *)v;
p = PyUnicode_AS_UNICODE(v);
if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) {
Py_DECREF(v);
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
if (*v == NULL) {
/* Create unicode object */
*v = _PyUnicode_New(usize);
if (*v == NULL)
return -1;
}
else {
/* Extend unicode object */
n = PyUnicode_GET_SIZE(*v);
if (_PyUnicode_Resize(v, n + usize) < 0)
return -1;
}
/* Do the conversion */
if (size > 0) {
p = PyUnicode_AS_UNICODE(*v) + n;
if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
}
return size;
}
/*
 * Decode the MBCS byte string s (size bytes) into a unicode object by
 * calling decode_mbcs() one or more times.
 *
 * consumed == NULL: the last (or only) decode_mbcs() call is made with
 *                   its 'final' argument set.
 * consumed != NULL: 'final' is cleared for every call; *consumed is reset
 *                   to 0 and then increased by the value each
 *                   decode_mbcs() call returns.
 *
 * Returns the resulting object, or NULL when decode_mbcs() returns a
 * negative value. 'errors' is not referenced in this function.
 */
PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
PyUnicodeObject *v = NULL;
int done;
if (consumed)
*consumed = 0;
/* When NEED_RETRY is defined, input longer than INT_MAX is processed in
   pieces of at most INT_MAX bytes; every piece except the last is passed
   final == 0. */
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
done = decode_mbcs(&v, s, INT_MAX, 0);
else
#endif
/* Without NEED_RETRY this is the only call; with it, this statement is
   the 'else' branch of the size check above. */
done = decode_mbcs(&v, s, (int)size, !consumed);
if (done < 0) {
/* v can still be NULL if the first call failed, hence Py_XDECREF. */
Py_XDECREF(v);
return NULL;
}
if (consumed)
*consumed += done;
#ifdef NEED_RETRY
if (size > INT_MAX) {
/* Advance by what was actually consumed: with final == 0,
   decode_mbcs() may hold back a trailing lead byte, so 'done' can be
   smaller than the chunk that was passed in. */
s += done;
size -= done;
goto retry;
}
#endif
return (PyObject *)v;
}
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
PyObject *PyUnicode_DecodeMBCS(const char *s,
Py_ssize_t size,
const char *errors)
{
PyObject *repr;
char *s;
DWORD mbcssize;
return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
/* If there are no characters, bail now! */
if (size==0)
return PyString_FromString("");
/*
 * Convert a unicode buffer into a string object (MBCS).
 * Returns 0 on success, -1 otherwise.
 */
static int encode_mbcs(PyObject **repr,
const Py_UNICODE *p, /* unicode */
int size) /* size of unicode */
{
int mbcssize = 0;
Py_ssize_t n = 0;
assert(size >= 0);
/* First get the size of the result */
assert(size<INT_MAX);
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL);
if (mbcssize==0)
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
if (size > 0) {
mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
if (mbcssize == 0) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
}
repr = PyString_FromStringAndSize(NULL, mbcssize);
if (repr == NULL)
return NULL;
if (mbcssize == 0)
return repr;
if (*repr == NULL) {
/* Create string object */
*repr = PyString_FromStringAndSize(NULL, mbcssize);
if (*repr == NULL)
return -1;
}
else {
/* Extend string object */
n = PyString_Size(*repr);
if (_PyString_Resize(repr, n + mbcssize) < 0)
return -1;
}
/* Do the conversion */
s = PyString_AS_STRING(repr);
assert(size < INT_MAX);
if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) {
Py_DECREF(repr);
return PyErr_SetFromWindowsErrWithFilename(0, NULL);
if (size > 0) {
char *s = PyString_AS_STRING(*repr) + n;
if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
PyErr_SetFromWindowsErrWithFilename(0, NULL);
return -1;
}
}
return 0;
}
/*
 * Encode the unicode buffer p (size elements) into an MBCS string object
 * by calling encode_mbcs() one or more times on the same result object.
 *
 * Returns the resulting object, or NULL when encode_mbcs() returns a
 * negative value. 'errors' is not referenced in this function.
 */
PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Py_ssize_t size,
const char *errors)
{
PyObject *repr = NULL;
int ret;
/* When NEED_RETRY is defined, input longer than INT_MAX elements is
   encoded in pieces of exactly INT_MAX elements, followed by one final
   piece holding the remainder. */
#ifdef NEED_RETRY
retry:
if (size > INT_MAX)
ret = encode_mbcs(&repr, p, INT_MAX);
else
#endif
/* Without NEED_RETRY this is the only call; with it, this statement is
   the 'else' branch of the size check above. */
ret = encode_mbcs(&repr, p, (int)size);
if (ret < 0) {
/* repr can still be NULL if the first call failed, hence Py_XDECREF. */
Py_XDECREF(repr);
return NULL;
}
#ifdef NEED_RETRY
if (size > INT_MAX) {
/* NOTE(review): the chunk boundary is a fixed INT_MAX elements and is
   not adjusted to the data; confirm that cutting the input at an
   arbitrary element is acceptable for the conversion call. */
p += INT_MAX;
size -= INT_MAX;
goto retry;
}
#endif
return repr;
}
......@@ -2893,6 +3027,8 @@ PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
NULL);
}
#undef NEED_RETRY
#endif /* MS_WINDOWS */
/* --- Character Mapping Codec -------------------------------------------- */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment