Merged revisions 72283-72284 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r72283 | antoine.pitrou | 2009-05-04 20:32:32 +0200 (lun., 04 mai 2009) | 4 lines

  Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal
  sequences. Patch by Nick Barnes and Victor Stinner.
........
  r72284 | antoine.pitrou | 2009-05-04 20:32:50 +0200 (lun., 04 mai 2009) | 3 lines

  Add Nick Barnes to ACKS.
........

Merged revisions 72283-72284 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r72283 | antoine.pitrou | 2009-05-04 20:32:32 +0200 (lun., 04 mai 2009) | 4 lines

  Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal
  sequences. Patch by Nick Barnes and Victor Stinner.
........
  r72284 | antoine.pitrou | 2009-05-04 20:32:50 +0200 (lun., 04 mai 2009) | 3 lines

  Add Nick Barnes to ACKS.
........
244651aa · Antoine Pitrou · 375c0197 · 244651aa · 244651aa · 244651aa
Commit 244651aa authored May 04, 2009 by Antoine Pitrou
5 changed files
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -858,10 +858,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
 PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF7(
    const Py_UNICODE *data, 	/* Unicode char buffer */
    Py_ssize_t length,	 	/* number of Py_UNICODE chars to encode */
-    int encodeSetO,             /* force the encoder to encode characters in
+    int base64SetO,		/* Encode RFC2152 Set O characters in base64 */
-                                   Set O, as described in RFC2152 */
+    int base64WhiteSpace,	/* Encode whitespace (sp, ht, nl, cr) in base64 */
-    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
-                                   carriage return and linefeed characters */
    const char *errors		/* error handling */
    );

--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -867,19 +867,31 @@ class UnicodeTest(
            ('+?', b'+-?'),
            (r'\\?', b'+AFwAXA?'),
            (r'\\\?', b'+AFwAXABc?'),
-            (r'++--', b'+-+---')
+            (r'++--', b'+-+---'),
+            ('\U000abcde', b'+2m/c3g-'),                  # surrogate pairs
+            ('/', b'/'),
        ]
        for (x, y) in utfTests:
            self.assertEqual(x.encode('utf-7'), y)
-        # surrogates not supported
+        # Unpaired surrogates not supported
        self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
-        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd')
+        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
        # Issue #2242: crash on some Windows/MSVC versions
-        self.assertRaises(UnicodeDecodeError, b'+\xc1'.decode, 'utf-7')
+        self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
+        # Direct encoded characters
+        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
+        # Optional direct characters
+        set_o = '!"#$%&*;<=>@[]^_`{|}'
+        for c in set_d:
+            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
+            self.assertEqual(c.encode('ascii').decode('utf7'), c)
+        for c in set_o:
+            self.assertEqual(c.encode('ascii').decode('utf7'), c)
    def test_codecs_utf8(self):
        self.assertEqual(''.encode('utf-8'), b'')

--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -35,6 +35,7 @@ Luigi Ballabio
 Jeff Balogh
 Michael J. Barber
 Chris Barker
+Nick Barnes
 Quentin Barnes
 Richard Barran
 Cesar Eduardo Barros

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 3.1 beta 1?
 Core and Builtins
 -----------------
+- Issue #4426: The UTF-7 decoder was too strict and didn't accept some legal
+  sequences. Patch by Nick Barnes and Victor Stinner.
 - Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
 - Issue #5883: In the io module, the BufferedIOBase and TextIOBase ABCs have

--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1702,69 +1702,84 @@ int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler
 /* --- UTF-7 Codec -------------------------------------------------------- */
-/* see RFC2152 for details */
+/* See RFC2152 for details.  We encode conservatively and decode liberally. */
-static
+/* Three simple macros defining base-64. */
-char utf7_special[128] = {
-    /* indicate whether a UTF-7 character is special i.e. cannot be directly
-       encoded:
-       0 - not special
-       1 - special
-       2 - whitespace (optional)
-       3 - RFC2152 Set O (optional) */
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
-};
+/* Is c a base-64 character? */
+#define IS_BASE64(c) \
+    (((c) >= 'A' && (c) <= 'Z') ||     \
+     ((c) >= 'a' && (c) <= 'z') ||     \
+     ((c) >= '0' && (c) <= '9') ||     \
+     (c) == '+' || (c) == '/')
-/* Note: The comparison (c) <= 0 is a trick to work-around gcc
+/* given that c is a base-64 character, what is its base-64 value? */
-   warnings about the comparison always being false; since
-   utf7_special[0] is 1, we can safely make that one comparison
-   true  */
-#define SPECIAL(c, encodeO, encodeWS)                   \
+#define FROM_BASE64(c)                                                  \
-    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
+    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
-     (encodeWS && (utf7_special[(c)] == 2)) ||          \
+     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
-     (encodeO && (utf7_special[(c)] == 3)))
+     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
+     (c) == '+' ? 62 : 63)
-#define B64(n)                                                          \
+/* What is the base-64 character of the bottom 6 bits of n? */
+#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
-#define B64CHAR(c)                              \
-    (ISALNUM(c) || (c) == '+' || (c) == '/')
+/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
-#define UB64(c)                                         \
+ * decoded as itself.  We are permissive on decoding; the only ASCII
-    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?   \
+ * byte not decoding to itself is the + which begins a base64
-     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
+ * string. */
-#define ENCODE(out, ch, bits)                   \
+#define DECODE_DIRECT(c)                                \
-    while (bits >= 6) {                         \
+    ((c) <= 127 && (c) != '+')
-        *out++ = B64(ch >> (bits-6));           \
-        bits -= 6;                              \
+/* The UTF-7 encoder treats ASCII characters differently according to
-    }
+ * whether they are Set D, Set O, Whitespace, or special (i.e. none of
+ * the above).  See RFC2152.  This array identifies these different
-#define DECODE(out, ch, bits, surrogate)                                \
+ * sets:
-    while (bits >= 16) {                                                \
+ * 0 : "Set D"
-        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff);   \
+ *     alphanumeric and '(),-./:?
-        bits -= 16;                                                     \
+ * 1 : "Set O"
-        if (surrogate) {                                                \
+ *     !"#$%&*;<=>@[]^_`{|}
-            /* We have already generated an error for the high surrogate \
+ * 2 : "whitespace"
-               so let's not bother seeing if the low surrogate is correct or not */ \
+ *     ht nl cr sp
-            surrogate = 0;                                              \
+ * 3 : special (must be base64 encoded)
-        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) {                \
+ *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
-            /* This is a surrogate pair. Unfortunately we can't represent \
+ */
-               it in a 16-bit character */                              \
-            surrogate = 1;                                              \
+static
-            errmsg = "code pairs are not supported";                    \
+char utf7_category[128] = {
-            goto utf7Error;                                             \
+/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
-        } else {                                                        \
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
-            *out++ = outCh;                                             \
+/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
-        }                                                               \
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
-    }
+/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
+    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
+/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
+/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
+/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
+};
+/* ENCODE_DIRECT: this character should be encoded as itself.  The
+ * answer depends on whether we are encoding set O as itself, and also
+ * on whether we are encoding whitespace as itself.  RFC2152 makes it
+ * clear that the answers to these questions vary between
+ * applications, so this code needs to be flexible.  */
+#define ENCODE_DIRECT(c, directO, directWS)             \
+    ((c) < 128 && (c) > 0 &&                            \
+     ((utf7_category[(c)] == 0) ||                      \
+      (directWS && (utf7_category[(c)] == 2)) ||        \
+      (directO && (utf7_category[(c)] == 1))))
 PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
@@ -1773,6 +1788,13 @@ PyObject *PyUnicode_DecodeUTF7(const char *s,
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
 }
+/* The decoder.  The only state we preserve is our read position,
+ * i.e. how many characters we have consumed.  So if we end in the
+ * middle of a shift sequence we have to back off the read position
+ * and the output to the beginning of the sequence, otherwise we lose
+ * all the shift state (seen bits, number of bits seen, high
+ * surrogate). */
 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
@@ -1787,9 +1809,10 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
-    unsigned int bitsleft = 0;
+    Py_UNICODE *shiftOutStart;
-    unsigned long charsleft = 0;
+    unsigned int base64bits = 0;
-    int surrogate = 0;
+    unsigned long base64buffer = 0;
+    Py_UNICODE surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
@@ -1803,6 +1826,7 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
    }
    p = unicode->str;
+    shiftOutStart = p;
    e = s + size;
    while (s < e) {
@@ -1810,72 +1834,101 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
      restart:
        ch = (unsigned char) *s;
-        if (inShift) {
+        if (inShift) { /* in a base-64 section */
-            if ((ch == '-') || !B64CHAR(ch)) {
+            if (IS_BASE64(ch)) { /* consume a base-64 character */
-                inShift = 0;
+                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
+                base64bits += 6;
                s++;
+                if (base64bits >= 16) {
-                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
+                    /* we have enough bits for a UTF-16 value */
-                if (bitsleft >= 6) {
+                    Py_UNICODE outCh = (Py_UNICODE)
-                    /* The shift sequence has a partial character in it. If
+                                       (base64buffer >> (base64bits-16));
-                       bitsleft < 6 then we could just classify it as padding
+                    base64bits -= 16;
-                       but that is not the case here */
+                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
+                    if (surrogate) {
-                    errmsg = "partial character in shift sequence";
+                        /* expecting a second surrogate */
-                    goto utf7Error;
+                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+#ifdef Py_UNICODE_WIDE
+                            *p++ = (((surrogate & 0x3FF)<<10)
+                                    | (outCh & 0x3FF)) + 0x10000;
+#else
+                            *p++ = surrogate;
+                            *p++ = outCh;
+#endif
+                            surrogate = 0;
+                        }
+                        else {
+                            surrogate = 0;
+                            errmsg = "second surrogate missing";
+                            goto utf7Error;
+                        }
+                    }
+                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+                        /* first surrogate */
+                        surrogate = outCh;
+                    }
+                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
+                        errmsg = "unexpected second surrogate";
+                        goto utf7Error;
+                    }
+                    else {
+                        *p++ = outCh;
+                    }
                }
-                /* According to RFC2152 the remaining bits should be zero. We
+            }
-                   choose to signal an error/insert a replacement character
+            else { /* now leaving a base-64 section */
-                   here so indicate the potential of a misencoded character. */
+                inShift = 0;
+                s++;
-                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
+                if (surrogate) {
-                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
+                    errmsg = "second surrogate missing at end of shift sequence";
-                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }
+                if (base64bits > 0) { /* left-over bits */
-                if (ch == '-') {
+                    if (base64bits >= 6) {
-                    if ((s < e) && (*(s) == '-')) {
+                        /* We've seen at least one base-64 character */
-                        *p++ = '-';
+                        errmsg = "partial character in shift sequence";
-                        inShift = 1;
+                        goto utf7Error;
                    }
-                } else if (SPECIAL(ch,0,0)) {
+                    else {
-                    errmsg = "unexpected special character";
+                        /* Some bits remain; they should be zero */
-                    goto utf7Error;
+                        if (base64buffer != 0) {
-                } else  {
+                            errmsg = "non-zero padding bits in shift sequence";
+                            goto utf7Error;
+                        }
+                    }
+                }
+                if (ch != '-') {
+                    /* '-' is absorbed; other terminating
+                       characters are preserved */
                    *p++ = ch;
                }
-            } else {
-                charsleft = (charsleft << 6) | UB64(ch);
-                bitsleft += 6;
-                s++;
-                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
-            s++;
+            s++; /* consume '+' */
-            if (s < e && *s == '-') {
+            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
-            } else
+            }
-            {
+            else { /* begin base64-encoded section */
                inShift = 1;
-                bitsleft = 0;
+                shiftOutStart = p;
+                base64bits = 0;
            }
        }
-        else if (SPECIAL(ch,0,0)) {
+        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
-            startinpos = s-starts;
+            *p++ = ch;
-            errmsg = "unexpected special character";
            s++;
-            goto utf7Error;
        }
        else {
-            *p++ = ch;
+            startinpos = s-starts;
            s++;
+            errmsg = "unexpected special character";
+            goto utf7Error;
        }
        continue;
-      utf7Error:
+utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
@@ -1886,23 +1939,35 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
            goto onError;
    }
-    if (inShift && !consumed) {
+    /* end of string */
-        outpos = p-PyUnicode_AS_UNICODE(unicode);
-        endinpos = size;
+    if (inShift && !consumed) { /* in shift sequence, no more to follow */
-        if (unicode_decode_call_errorhandler(
+        /* if we're in an inconsistent state, that's an error */
-                errors, &errorHandler,
+        if (surrogate ||
-                "utf7", "unterminated shift sequence",
+                (base64bits >= 6) ||
-                &starts, &e, &startinpos, &endinpos, &exc, &s,
+                (base64bits > 0 && base64buffer != 0)) {
-                &unicode, &outpos, &p))
+            outpos = p-PyUnicode_AS_UNICODE(unicode);
-            goto onError;
+            endinpos = size;
-        if (s < e)
+            if (unicode_decode_call_errorhandler(
-            goto restart;
+                    errors, &errorHandler,
+                    "utf7", "unterminated shift sequence",
+                    &starts, &e, &startinpos, &endinpos, &exc, &s,
+                    &unicode, &outpos, &p))
+                goto onError;
+            if (s < e)
+                goto restart;
+        }
    }
+    /* return state */
    if (consumed) {
-        if(inShift)
+        if (inShift) {
+            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
-        else
+        }
+        else {
            *consumed = s-starts;
+        }
    }
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
@@ -1922,27 +1987,27 @@ PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               Py_ssize_t size,
-                               int encodeSetO,
+                               int base64SetO,
-                               int encodeWhiteSpace,
+                               int base64WhiteSpace,
                               const char *errors)
 {
    PyObject *v;
    /* It might be possible to tighten this worst case */
-    Py_ssize_t cbAllocated = 5 * size;
+    Py_ssize_t allocated = 5 * size;
    int inShift = 0;
    Py_ssize_t i = 0;
-    unsigned int bitsleft = 0;
+    unsigned int base64bits = 0;
-    unsigned long charsleft = 0;
+    unsigned long base64buffer = 0;
    char * out;
    char * start;
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);
-    if (cbAllocated / 5 != size)
+    if (allocated / 5 != size)
        return PyErr_NoMemory();
-    v = PyBytes_FromStringAndSize(NULL, cbAllocated);
+    v = PyBytes_FromStringAndSize(NULL, allocated);
    if (v == NULL)
        return NULL;
@@ -1950,78 +2015,76 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];
-        if (!inShift) {
+        if (inShift) {
-            if (ch == '+') {
+            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
-                *out++ = '+';
+                /* shifting out */
-                *out++ = '-';
+                if (base64bits) { /* output remaining bits */
-            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
+                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
-                charsleft = ch;
+                    base64buffer = 0;
-                bitsleft = 16;
+                    base64bits = 0;
-                *out++ = '+';
+                }
-                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+                inShift = 0;
-                inShift = bitsleft > 0;
-            } else {
-                *out++ = (char) ch;
-            }
-        } else {
-            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
-                *out++ = B64(charsleft << (6-bitsleft));
-                charsleft = 0;
-                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
-                if (B64CHAR(ch) || ch == '-') {
+                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
-                inShift = 0;
                *out++ = (char) ch;
-            } else {
+            }
-                bitsleft += 16;
+            else {
-                charsleft = (charsleft << 16) | ch;
+                goto encode_char;
-                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
+            }
+        }
-                /* If the next character is special then we don't need to terminate
+        else { /* not in a shift sequence */
-                   the shift sequence. If the next character is not a BASE64 character
+            if (ch == '+') {
-                   or '-' then the shift sequence will be terminated implicitly and we
+                *out++ = '+';
-                   don't have to insert a '-'. */
-                if (bitsleft == 0) {
-                    if (i + 1 < size) {
-                        Py_UNICODE ch2 = s[i+1];
-                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
-                        } else if (B64CHAR(ch2) || ch2 == '-') {
-                            *out++ = '-';
-                            inShift = 0;
-                        } else {
-                            inShift = 0;
-                        }
-                    }
-                    else {
                        *out++ = '-';
-                        inShift = 0;
+            }
-                    }
+            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
-                }
+                *out++ = (char) ch;
+            }
+            else {
+                *out++ = '+';
+                inShift = 1;
+                goto encode_char;
            }
        }
+        continue;
+encode_char:
+#ifdef Py_UNICODE_WIDE
+        if (ch >= 0x10000) {
+            /* code first surrogate */
+            base64bits += 16;
+            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
+            while (base64bits >= 6) {
+                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+                base64bits -= 6;
+            }
+            /* prepare second surrogate */
+            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
+        }
+#endif
+        base64bits += 16;
+        base64buffer = (base64buffer << 16) | ch;
+        while (base64bits >= 6) {
+            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
+            base64bits -= 6;
+        }
    }
-    if (bitsleft) {
+    if (base64bits)
-        *out++= B64(charsleft << (6-bitsleft) );
+        *out++= TO_BASE64(base64buffer << (6-base64bits) );
+    if (inShift)
        *out++ = '-';
-    }
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
 }
-#undef SPECIAL
+#undef IS_BASE64
-#undef B64
+#undef FROM_BASE64
-#undef B64CHAR
+#undef TO_BASE64
-#undef UB64
+#undef DECODE_DIRECT
-#undef ENCODE
+#undef ENCODE_DIRECT
-#undef DECODE
 /* --- UTF-8 Codec -------------------------------------------------------- */