Issue #8922: Normalize the encoding name in PyUnicode_AsEncodedString() to enable shortcuts for upper-case encoding names. Also add a shortcut for "iso-8859-1" in PyUnicode_AsEncodedString() and PyUnicode_Decode().

Issue #8922: Normalize the encoding name in PyUnicode_AsEncodedString() to
enable shortcuts for upper-case encoding names. Also add a shortcut for "iso-8859-1" in PyUnicode_AsEncodedString() and PyUnicode_Decode().
ecccd288 · Victor Stinner · 1e2bfb77 · ecccd288 · ecccd288
Commit ecccd288 authored Jun 10, 2010 by Victor Stinner
Show whitespace changes
Inline Side-by-side

Showing with 35 additions and 18 deletions

Misc/NEWS Misc/NEWS +4 -0

Objects/unicodeobject.c Objects/unicodeobject.c +31 -18

No files found.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1?
 Core and Builtins
 -----------------
+- Issue #8922: Normalize the encoding name in PyUnicode_AsEncodedString() to
+  enable shortcuts for upper-case encoding names. Also add a shortcut for
+  "iso-8859-1" in PyUnicode_AsEncodedString() and PyUnicode_Decode().
 - Issue #8838: Remove codecs.charbuffer_encode() function. The buffer protocol
  doesn't support "char buffer" anymore in Python3.

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1293,25 +1293,21 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
    return NULL;
 }
-PyObject *PyUnicode_Decode(const char *s,
+/* Convert encoding to lower case and replace '_' with '-' in order to
-                           Py_ssize_t size,
+   catch e.g. UTF_8. Truncate the string if it is longer than lower_len-1
-                           const char *encoding,
+   characters. */
-                           const char *errors)
+static void normalize_encoding(const char *encoding, 
+                               char *lower, 
+                               size_t lower_len)
 {
-    PyObject *buffer = NULL, *unicode;
-    Py_buffer info;
-    char lower[20];  /* Enough for any encoding name we recognize */
-    char *l;
    const char *e;
+    char *l;
+    char *l_end;
-    if (encoding == NULL)
-        encoding = PyUnicode_GetDefaultEncoding();
-    /* Convert encoding to lower case and replace '_' with '-' in order to
-       catch e.g. UTF_8 */
    e = encoding;
    l = lower;
-    while (*e && l < &lower[(sizeof lower) - 2]) {
+    l_end = &lower[lower_len - 1];
+    while (*e && l < l_end) {
        if (ISUPPER(*e)) {
            *l++ = TOLOWER(*e++);
        }
@@ -1324,8 +1320,22 @@ PyObject *PyUnicode_Decode(const char *s,
        }
    }
    *l = '\0';
+}
+PyObject *PyUnicode_Decode(const char *s,
+                           Py_ssize_t size,
+                           const char *encoding,
+                           const char *errors)
+{
+    PyObject *buffer = NULL, *unicode;
+    Py_buffer info;
+    char lower[11];  /* Enough for any encoding shortcut */
+    if (encoding == NULL)
+        encoding = PyUnicode_GetDefaultEncoding();
    /* Shortcuts for common default encodings */
+    normalize_encoding(encoding, lower, sizeof(lower));
    if (strcmp(lower, "utf-8") == 0)
        return PyUnicode_DecodeUTF8(s, size, errors);
    else if ((strcmp(lower, "latin-1") == 0) ||
@@ -1478,6 +1488,7 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *errors)
 {
    PyObject *v;
+    char lower[11];  /* Enough for any encoding shortcut */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
@@ -1488,21 +1499,23 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
        encoding = PyUnicode_GetDefaultEncoding();
    /* Shortcuts for common default encodings */
-    if (strcmp(encoding, "utf-8") == 0)
+    normalize_encoding(encoding, lower, sizeof(lower));
+    if (strcmp(lower, "utf-8") == 0)
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    errors);
-    else if (strcmp(encoding, "latin-1") == 0)
+    else if ((strcmp(lower, "latin-1") == 0) ||
+             (strcmp(lower, "iso-8859-1") == 0))
        return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode),
                                      errors);
 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
-    else if (strcmp(encoding, "mbcs") == 0)
+    else if (strcmp(lower, "mbcs") == 0)
        return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    errors);
 #endif
-    else if (strcmp(encoding, "ascii") == 0)
+    else if (strcmp(lower, "ascii") == 0)
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     errors);