unicodeobject.c 316 KB
Newer Older
1
/*
2 3

Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 6
Unicode Integration Proposal (see file Misc/unicode.txt).

7 8 9
Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

10
Copyright (c) Corporation for National Research Initiatives.
11

12 13 14
--------------------------------------------------------------------
The original string type implementation is:

Benjamin Peterson's avatar
Benjamin Peterson committed
15 16
  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------
39

40
*/
41

42
#define PY_SSIZE_T_CLEAN
43
#include "Python.h"
44
#include "ucnhash.h"
45

46
#ifdef MS_WINDOWS
47 48
#include <windows.h>
#endif
Guido van Rossum's avatar
Guido van Rossum committed
49

50 51
/* Limit for the Unicode object free list */

Christian Heimes's avatar
Christian Heimes committed
52
#define PyUnicode_MAXFREELIST       1024
53 54 55 56 57

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

Guido van Rossum's avatar
Guido van Rossum committed
71
#define KEEPALIVE_SIZE_LIMIT       9
72 73 74 75 76 77 78 79 80

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

81 82 83 84 85 86
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
87

88 89 90 91 92

#ifdef __cplusplus
extern "C" {
#endif

93 94 95 96 97 98
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

103
/* Free list for Unicode objects */
Christian Heimes's avatar
Christian Heimes committed
104 105
static PyUnicodeObject *free_list;
static int numfree;
106

107 108 109 110 111 112 113
/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well. */
static PyUnicodeObject *unicode_latin1[256];

114 115
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
    /* Lookup table indexed by code point 0x00..0x7F: an entry is 1 for the
       characters named in the comments below and 0 for everything else.
       Each row holds eight consecutive code points. */

    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    /* 0x08 - 0x0F */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    /* 0x18 - 0x1F */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    /* 0x20 - 0x27 */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

145 146 147 148 149
/* Forward declaration; the definition is not in this part of the file.
   NOTE(review): judging by the parameter names only, this presumably runs
   the error handler selected by `errors` for the range startpos..endpos of
   `unicode` and reports where to resume through `newpos` -- confirm against
   the definition. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

150 151 152 153 154 155
/* Forward declaration; the definition is not in this part of the file.
   NOTE(review): judging by the name and parameters only, this presumably
   raises an encoding error for the range startpos..endpos of `unicode`,
   using `*exceptionObject` as the exception slot -- confirm against the
   definition. */
static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason);

156 157
/* Same for linebreaks */
static const unsigned char ascii_linebreak[] = {
    /* Lookup table indexed by code point 0x00..0x7F: an entry is 1 for the
       characters named in the comments below and 0 for everything else.
       Each row holds eight consecutive code points.  The table is only
       read, hence const. */

    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    /* 0x08 - 0x0F */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    /* 0x18 - 0x1F */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};


185
Py_UNICODE
186
PyUnicode_GetMax(void)
187
{
188
#ifdef Py_UNICODE_WIDE
189
    return 0x10FFFF;
190
#else
191 192 193
    /* This is actually an illegal character, so it should
       not be passed to unichr. */
    return 0xFFFF;
194 195 196
#endif
}

197 198 199 200 201 202 203 204
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the least 5
   bits from each unicode characters as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

205 206 207 208 209 210 211 212 213 214
/* Number of bits used in a bloom mask: the largest of 128, 64 or 32 that
   LONG_BIT reaches.  Anything narrower than 32 bits is rejected at compile
   time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Storage type of a bloom mask. */
#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the mask bit selected by the low bits of
   ch (ch & (BLOOM_WIDTH - 1)).  BLOOM_ADD assigns to `mask`, so it must be
   given an lvalue.  Both arguments are parenthesized in the expansion so
   that compound expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
221

Benjamin Peterson's avatar
Benjamin Peterson committed
222 223 224
/* Nonzero when ch is a line break.  Values below 128 are answered directly
   from the ascii_linebreak table.  For the rest, the bloom_linebreak bit
   test runs first and Py_UNICODE_ISLINEBREAK() is only evaluated when that
   test passes.  Note that ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
225 226 227 228 229

Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
{
    /* calculate simple bloom-style bitmask for a given unicode string:
       start from an empty mask and BLOOM_ADD each of the first len
       characters of ptr.  A len of 0 (or less) yields an empty mask. */

    BLOOM_MASK mask;
    Py_ssize_t i;

    mask = 0;
    for (i = 0; i < len; i++)
        BLOOM_ADD(mask, ptr[i]);

    return mask;
}

Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
{
    /* Linear search: return 1 as soon as chr is found among the first
       setlen entries of set, and 0 when none of them matches (which
       includes setlen <= 0). */
    Py_UNICODE *cursor = set;
    Py_ssize_t remaining = setlen;

    while (remaining-- > 0) {
        if (*cursor++ == chr)
            return 1;
    }

    return 0;
}

Benjamin Peterson's avatar
Benjamin Peterson committed
251
/* Nonzero when chr occurs in set: the BLOOM() bit test runs first and
   unicode_member() is only called when that test passes.  The whole
   expansion is parenthesized so that the macro can be negated or combined
   with other operators without changing its meaning.  Note that chr is
   evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))

254 255 256
/* --- Unicode Object ----------------------------------------------------- */

static
257
int unicode_resize(register PyUnicodeObject *unicode,
Benjamin Peterson's avatar
Benjamin Peterson committed
258
                   Py_ssize_t length)
259 260
{
    void *oldstr;
261

Guido van Rossum's avatar
Guido van Rossum committed
262
    /* Shortcut if there's nothing much to do. */
263
    if (unicode->length == length)
Benjamin Peterson's avatar
Benjamin Peterson committed
264
        goto reset;
265

266 267 268
    /* Resizing shared object (unicode_empty or single character
       objects) in-place is not allowed. Use PyUnicode_Resize()
       instead ! */
269

270
    if (unicode == unicode_empty ||
Benjamin Peterson's avatar
Benjamin Peterson committed
271 272 273
        (unicode->length == 1 &&
         unicode->str[0] < 256U &&
         unicode_latin1[unicode->str[0]] == unicode)) {
274
        PyErr_SetString(PyExc_SystemError,
275
                        "can't resize shared str objects");
276 277 278
        return -1;
    }

279 280 281 282 283
    /* We allocate one more byte to make sure the string is Ux0000 terminated.
       The overallocation is also used by fastsearch, which assumes that it's
       safe to look at str[length] (without making any assumptions about what
       it contains). */

284
    oldstr = unicode->str;
Christian Heimes's avatar
Christian Heimes committed
285
    unicode->str = PyObject_REALLOC(unicode->str,
Benjamin Peterson's avatar
Benjamin Peterson committed
286
                                    sizeof(Py_UNICODE) * (length + 1));
287
    if (!unicode->str) {
Benjamin Peterson's avatar
Benjamin Peterson committed
288
        unicode->str = (Py_UNICODE *)oldstr;
289 290 291 292
        PyErr_NoMemory();
        return -1;
    }
    unicode->str[length] = 0;
293
    unicode->length = length;
294

Benjamin Peterson's avatar
Benjamin Peterson committed
295
  reset:
296
    /* Reset the object caches */
297
    if (unicode->defenc) {
298
        Py_CLEAR(unicode->defenc);
299 300
    }
    unicode->hash = -1;
301

302 303 304 305
    return 0;
}

/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/

static
Martin v. Löwis's avatar
Martin v. Löwis committed
315
PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
316 317 318
{
    register PyUnicodeObject *unicode;

319
    /* Optimization for empty strings */
320 321 322 323 324
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

Neal Norwitz's avatar
Neal Norwitz committed
325 326 327 328 329
    /* Ensure we won't overflow the size. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

330
    /* Unicode freelist & memory allocation */
Christian Heimes's avatar
Christian Heimes committed
331 332 333 334
    if (free_list) {
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
Benjamin Peterson's avatar
Benjamin Peterson committed
335 336 337 338
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
339
                unicode_resize(unicode, length) < 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
340 341 342
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
343
        }
344
        else {
Benjamin Peterson's avatar
Benjamin Peterson committed
345 346
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
347 348
        }
        PyObject_INIT(unicode, &PyUnicode_Type);
349 350
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
351
        size_t new_size;
Neil Schemenauer's avatar
Neil Schemenauer committed
352
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
353 354
        if (unicode == NULL)
            return NULL;
Benjamin Peterson's avatar
Benjamin Peterson committed
355 356
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
357 358
    }

Guido van Rossum's avatar
Guido van Rossum committed
359
    if (!unicode->str) {
Benjamin Peterson's avatar
Benjamin Peterson committed
360 361
        PyErr_NoMemory();
        goto onError;
Guido van Rossum's avatar
Guido van Rossum committed
362
    }
363
    /* Initialize the first element to guard against cases where
364 365 366 367 368 369
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
370
    unicode->str[0] = 0;
371
    unicode->str[length] = 0;
372
    unicode->length = length;
373
    unicode->hash = -1;
374
    unicode->state = 0;
375
    unicode->defenc = NULL;
376
    return unicode;
377

Benjamin Peterson's avatar
Benjamin Peterson committed
378
  onError:
379 380
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
381
    _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer's avatar
Neil Schemenauer committed
382
    PyObject_Del(unicode);
383
    return NULL;
384 385 386
}

static
387
void unicode_dealloc(register PyUnicodeObject *unicode)
388
{
389
    switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
390 391
    case SSTATE_NOT_INTERNED:
        break;
392

Benjamin Peterson's avatar
Benjamin Peterson committed
393 394 395 396 397 398 399
    case SSTATE_INTERNED_MORTAL:
        /* revive dead object temporarily for DelItem */
        Py_REFCNT(unicode) = 3;
        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
            Py_FatalError(
                "deletion of interned string failed");
        break;
400

Benjamin Peterson's avatar
Benjamin Peterson committed
401 402
    case SSTATE_INTERNED_IMMORTAL:
        Py_FatalError("Immortal interned string died.");
403

Benjamin Peterson's avatar
Benjamin Peterson committed
404 405
    default:
        Py_FatalError("Inconsistent interned string state.");
406 407
    }

408
    if (PyUnicode_CheckExact(unicode) &&
Benjamin Peterson's avatar
Benjamin Peterson committed
409
        numfree < PyUnicode_MAXFREELIST) {
Guido van Rossum's avatar
Guido van Rossum committed
410
        /* Keep-Alive optimization */
Benjamin Peterson's avatar
Benjamin Peterson committed
411 412 413 414 415 416
        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
            PyObject_DEL(unicode->str);
            unicode->str = NULL;
            unicode->length = 0;
        }
        if (unicode->defenc) {
417
            Py_CLEAR(unicode->defenc);
Benjamin Peterson's avatar
Benjamin Peterson committed
418 419
        }
        /* Add to free list */
Christian Heimes's avatar
Christian Heimes committed
420 421 422
        *(PyUnicodeObject **)unicode = free_list;
        free_list = unicode;
        numfree++;
423 424
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
425 426 427
        PyObject_DEL(unicode->str);
        Py_XDECREF(unicode->defenc);
        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
428 429 430
    }
}

431 432
static
int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
{
    /* Resize *unicode to `length` characters.  The object must be a
       Unicode object with a reference count of exactly 1 and `length`
       must not be negative, otherwise a bad-internal-call error is
       reported.  Returns 0 on success and -1 on failure.  On success
       *unicode may have been replaced by a different object. */
    register PyUnicodeObject *v;

    /* Argument checks */
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    v = *unicode;
    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* Resizing unicode_empty and single character objects is not
       possible since these are being shared. We simply return a fresh
       copy with the same Unicode content. */
    if (v->length != length &&
        (v == unicode_empty || v->length == 1)) {
        PyUnicodeObject *w = _PyUnicode_New(length);
        if (w == NULL)
            return -1;
        /* Copy as many characters as fit in both the old and new size. */
        Py_UNICODE_COPY(w->str, v->str,
                        length < v->length ? length : v->length);
        Py_DECREF(*unicode);
        *unicode = w;
        return 0;
    }

    /* Note that we don't have to modify *unicode for unshared Unicode
       objects, since we can modify them in-place. */
    return unicode_resize(v, length);
}

467 468 469 470
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    /* Public entry point: view the PyObject* slot as a PyUnicodeObject*
       slot and hand the request to the static implementation. */
    PyUnicodeObject **slot = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(slot, length);
}
471

472
PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
Benjamin Peterson's avatar
Benjamin Peterson committed
473
                                Py_ssize_t size)
474 475 476
{
    PyUnicodeObject *unicode;

477 478 479 480
    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects. */
    if (u != NULL) {

Benjamin Peterson's avatar
Benjamin Peterson committed
481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499
        /* Optimization for empty strings */
        if (size == 0 && unicode_empty != NULL) {
            Py_INCREF(unicode_empty);
            return (PyObject *)unicode_empty;
        }

        /* Single character Unicode objects in the Latin-1 range are
           shared when using this constructor */
        if (size == 1 && *u < 256) {
            unicode = unicode_latin1[*u];
            if (!unicode) {
                unicode = _PyUnicode_New(1);
                if (!unicode)
                    return NULL;
                unicode->str[0] = *u;
                unicode_latin1[*u] = unicode;
            }
            Py_INCREF(unicode);
            return (PyObject *)unicode;
500
        }
501
    }
502

503 504 505 506 507 508
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the Unicode data into the new object */
    if (u != NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
509
        Py_UNICODE_COPY(unicode->str, u, size);
510 511 512 513

    return (PyObject *)unicode;
}

514
PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
{
    /* Build a Unicode object from `size` bytes of UTF-8 data at `u`.  When
       `u` is NULL, an object of `size` characters is created and its
       contents are left for the caller to fill in.  A negative `size`
       raises SystemError.  Returns NULL on failure. */
    PyUnicodeObject *unicode;

    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_FromStringAndSize");
        return NULL;
    }

    /* If the Unicode data is known at construction time, we can apply
       some optimizations which share commonly used objects.
       Also, this means the input must be UTF-8, so fall back to the
       UTF-8 decoder at the end. */
    if (u != NULL) {

        /* Optimization for empty strings */
        if (size == 0 && unicode_empty != NULL) {
            Py_INCREF(unicode_empty);
            return (PyObject *)unicode_empty;
        }

        /* Single characters are shared when using this constructor.
           Restrict to ASCII, since the input must be UTF-8. */
        if (size == 1 && Py_CHARMASK(*u) < 128) {
            unicode = unicode_latin1[Py_CHARMASK(*u)];
            if (!unicode) {
                /* First request for this character: create the object and
                   store it in the cache.  The reference created here stays
                   with the cache; the caller gets its own one below. */
                unicode = _PyUnicode_New(1);
                if (!unicode)
                    return NULL;
                unicode->str[0] = Py_CHARMASK(*u);
                unicode_latin1[Py_CHARMASK(*u)] = unicode;
            }
            Py_INCREF(unicode);
            return (PyObject *)unicode;
        }

        return PyUnicode_DecodeUTF8(u, size, NULL);
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    return (PyObject *)unicode;
}

561 562 563 564 565 566 567 568 569 570 571
PyObject *PyUnicode_FromString(const char *u)
{
    /* Measure the NUL-terminated input and hand it to
       PyUnicode_FromStringAndSize().  Inputs longer than PY_SSIZE_T_MAX
       raise OverflowError and yield NULL. */
    const size_t nbytes = strlen(u);

    if (nbytes <= PY_SSIZE_T_MAX)
        return PyUnicode_FromStringAndSize(u, nbytes);

    PyErr_SetString(PyExc_OverflowError, "input too long");
    return NULL;
}

572 573
#ifdef HAVE_WCHAR_H

574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
# define CONVERT_WCHAR_TO_SURROGATES
#endif

#ifdef CONVERT_WCHAR_TO_SURROGATES

/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
   to convert from UTF32 to UTF16. */

PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 Py_ssize_t size)
{
    /* Variant for 4-byte wchar_t with 2-byte Py_UNICODE: every input value
       above 0xFFFF is written as a surrogate pair, so the result can be
       longer than the input.  A size of -1 means "use wcslen(w)".  A NULL
       `w` is only accepted together with size 0. */
    PyUnicodeObject *result;
    Py_UNICODE *out;
    Py_ssize_t idx;
    Py_ssize_t units;

    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_FromStringAndSize(NULL, 0);
    }

    if (size == -1)
        size = wcslen(w);

    /* First pass: one output unit per input character, plus one extra for
       each character that needs a surrogate pair. */
    units = size;
    for (idx = 0; idx < size; idx++) {
        if (w[idx] > 0xFFFF)
            units++;
    }

    result = _PyUnicode_New(units);
    if (result == NULL)
        return NULL;

    /* Second pass: copy the wchar_t data into the new object */
    out = PyUnicode_AS_UNICODE(result);
    for (idx = 0; idx < size; idx++) {
        wchar_t ordinal = w[idx];

        if (ordinal > 0xFFFF) {
            ordinal -= 0x10000;
            *out++ = 0xD800 | (ordinal >> 10);
            *out++ = 0xDC00 | (ordinal & 0x3FF);
        }
        else
            *out++ = ordinal;
    }
    return (PyObject *)result;
}

#else

634
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
Benjamin Peterson's avatar
Benjamin Peterson committed
635
                                 Py_ssize_t size)
636 637 638 639
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
640 641
        if (size == 0)
            return PyUnicode_FromStringAndSize(NULL, 0);
Benjamin Peterson's avatar
Benjamin Peterson committed
642 643
        PyErr_BadInternalCall();
        return NULL;
644 645
    }

646 647 648 649
    if (size == -1) {
        size = wcslen(w);
    }

650 651 652 653 654
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
655
#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
656
    memcpy(unicode->str, w, size * sizeof(wchar_t));
657
#else
658
    {
Benjamin Peterson's avatar
Benjamin Peterson committed
659 660 661 662 663
        register Py_UNICODE *u;
        register Py_ssize_t i;
        u = PyUnicode_AS_UNICODE(unicode);
        for (i = size; i > 0; i--)
            *u++ = *w++;
664 665 666 667 668 669
    }
#endif

    return (PyObject *)unicode;
}

670 671 672 673
#endif /* CONVERT_WCHAR_TO_SURROGATES */

#undef CONVERT_WCHAR_TO_SURROGATES

674
static void
675 676
makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
        int zeropad, int width, int precision, char c)
677
{
678 679 680 681 682 683 684 685 686 687
    *fmt++ = '%';
    if (width) {
        if (zeropad)
            *fmt++ = '0';
        fmt += sprintf(fmt, "%d", width);
    }
    if (precision)
        fmt += sprintf(fmt, ".%d", precision);
    if (longflag)
        *fmt++ = 'l';
688 689 690 691 692 693 694 695 696 697 698 699 700
    else if (longlongflag) {
        /* longlongflag should only ever be nonzero on machines with
           HAVE_LONG_LONG defined */
#ifdef HAVE_LONG_LONG
        char *f = PY_FORMAT_LONG_LONG;
        while (*f)
            *fmt++ = *f++;
#else
        /* we shouldn't ever get here */
        assert(0);
        *fmt++ = 'l';
#endif
    }
701 702 703 704 705 706 707
    else if (size_tflag) {
        char *f = PY_FORMAT_SIZE_T;
        while (*f)
            *fmt++ = *f++;
    }
    *fmt++ = c;
    *fmt = '\0';
708 709
}

710 711
/* Copy the NUL-terminated string `string` to the output position `s`,
   advancing `s` past the copied characters; the terminator itself is not
   copied.  Relies on variables named `copy` and `s` being in scope where
   the macro is used. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}

712 713 714 715 716 717 718 719 720 721
/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)

722 723 724
PyObject *
PyUnicode_FromFormatV(const char *format, va_list vargs)
{
725 726 727 728 729 730 731 732 733 734 735 736
    va_list count;
    Py_ssize_t callcount = 0;
    PyObject **callresults = NULL;
    PyObject **callresult = NULL;
    Py_ssize_t n = 0;
    int width = 0;
    int precision = 0;
    int zeropad;
    const char* f;
    Py_UNICODE *s;
    PyObject *string;
    /* used by sprintf */
737
    char buffer[ITEM_BUFFER_LEN+1];
738 739 740 741 742
    /* use abuffer instead of buffer, if we need more space
     * (which can happen if there's a format specifier with width). */
    char *abuffer = NULL;
    char *realbuffer;
    Py_ssize_t abuffersize = 0;
743
    char fmt[61]; /* should be enough for %0width.precisionlld */
744
    const char *copy;
745

746
    Py_VA_COPY(count, vargs);
747 748 749 750
    /* step 1: count the number of %S/%R/%A/%s format specifications
     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
     * result in an array) */
751
    for (f = format; *f; f++) {
752 753 754
         if (*f == '%') {
             if (*(f+1)=='%')
                 continue;
755
             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
756
                 ++callcount;
757
             while (Py_ISDIGIT((unsigned)*f))
758
                 width = (width*10) + *f++ - '0';
759
             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
760 761 762 763
                 ;
             if (*f == 's')
                 ++callcount;
         }
764 765 766
         else if (128 <= (unsigned char)*f) {
             PyErr_Format(PyExc_ValueError,
                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
767
                "string, got a non-ASCII byte: 0x%02x",
768
                (unsigned char)*f);
769
             return NULL;
770
         }
771 772
    }
    /* step 2: allocate memory for the results of
773
     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
774 775 776 777 778 779 780 781 782 783 784
    if (callcount) {
        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
        if (!callresults) {
            PyErr_NoMemory();
            return NULL;
        }
        callresult = callresults;
    }
    /* step 3: figure out how large a buffer we need */
    for (f = format; *f; f++) {
        if (*f == '%') {
785 786 787
#ifdef HAVE_LONG_LONG
            int longlongflag = 0;
#endif
788 789
            const char* p = f;
            width = 0;
790
            while (Py_ISDIGIT((unsigned)*f))
791
                width = (width*10) + *f++ - '0';
792
            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
793 794 795 796 797
                ;

            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
             * they don't affect the amount of space we reserve.
             */
798 799 800 801 802 803 804 805 806 807 808 809 810
            if (*f == 'l') {
                if (f[1] == 'd' || f[1] == 'u') {
                    ++f;
                }
#ifdef HAVE_LONG_LONG
                else if (f[1] == 'l' &&
                         (f[2] == 'd' || f[2] == 'u')) {
                    longlongflag = 1;
                    f += 2;
                }
#endif
            }
            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
Benjamin Peterson's avatar
Benjamin Peterson committed
811
                ++f;
812
            }
813

814 815
            switch (*f) {
            case 'c':
816 817 818 819 820 821 822 823
            {
#ifndef Py_UNICODE_WIDE
                int ordinal = va_arg(count, int);
                if (ordinal > 0xffff)
                    n += 2;
                else
                    n++;
#else
824
                (void)va_arg(count, int);
825 826 827 828
                n++;
#endif
                break;
            }
829 830 831 832 833
            case '%':
                n++;
                break;
            case 'd': case 'u': case 'i': case 'x':
                (void) va_arg(count, int);
834 835 836 837 838 839 840 841 842 843 844 845 846
#ifdef HAVE_LONG_LONG
                if (longlongflag) {
                    if (width < MAX_LONG_LONG_CHARS)
                        width = MAX_LONG_LONG_CHARS;
                }
                else
#endif
                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
                       including sign.  Decimal takes the most space.  This
                       isn't enough for octal.  If a width is specified we
                       need more (which we allocate later). */
                    if (width < MAX_LONG_CHARS)
                        width = MAX_LONG_CHARS;
847
                n += width;
848
                /* XXX should allow for large precision here too. */
849 850 851 852 853 854
                if (abuffersize < width)
                    abuffersize = width;
                break;
            case 's':
            {
                /* UTF-8 */
855
                const char *s = va_arg(count, const char*);
856 857 858 859 860 861
                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
                if (!str)
                    goto fail;
                n += PyUnicode_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
862 863 864 865 866 867 868 869 870 871 872 873 874
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(count, PyObject *);
                assert(obj && PyUnicode_Check(obj));
                n += PyUnicode_GET_SIZE(obj);
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(count, PyObject *);
                const char *str = va_arg(count, const char *);
875
                PyObject *str_obj;
876 877
                assert(obj || str);
                assert(!obj || PyUnicode_Check(obj));
878
                if (obj) {
879
                    n += PyUnicode_GET_SIZE(obj);
880 881 882 883 884 885 886 887 888
                    *callresult++ = NULL;
                }
                else {
                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
                    if (!str_obj)
                        goto fail;
                    n += PyUnicode_GET_SIZE(str_obj);
                    *callresult++ = str_obj;
                }
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951
                break;
            }
            case 'S':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *str;
                assert(obj);
                str = PyObject_Str(obj);
                if (!str)
                    goto fail;
                n += PyUnicode_GET_SIZE(str);
                /* Remember the str and switch to the next slot */
                *callresult++ = str;
                break;
            }
            case 'R':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *repr;
                assert(obj);
                repr = PyObject_Repr(obj);
                if (!repr)
                    goto fail;
                n += PyUnicode_GET_SIZE(repr);
                /* Remember the repr and switch to the next slot */
                *callresult++ = repr;
                break;
            }
            case 'A':
            {
                PyObject *obj = va_arg(count, PyObject *);
                PyObject *ascii;
                assert(obj);
                ascii = PyObject_ASCII(obj);
                if (!ascii)
                    goto fail;
                n += PyUnicode_GET_SIZE(ascii);
                /* Remember the repr and switch to the next slot */
                *callresult++ = ascii;
                break;
            }
            case 'p':
                (void) va_arg(count, int);
                /* maximum 64-bit pointer representation:
                 * 0xffffffffffffffff
                 * so 19 characters is enough.
                 * XXX I count 18 -- what's the extra for?
                 */
                n += 19;
                break;
            default:
                /* if we stumble upon an unknown
                   formatting code, copy the rest of
                   the format string to the output
                   string. (we cannot just skip the
                   code, since there's no way to know
                   what's in the argument list) */
                n += strlen(p);
                goto expand;
            }
        } else
            n++;
    }
Benjamin Peterson's avatar
Benjamin Peterson committed
952
  expand:
953 954 955
    if (abuffersize > ITEM_BUFFER_LEN) {
        /* add 1 for sprintf's trailing null byte */
        abuffer = PyObject_Malloc(abuffersize + 1);
956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978
        if (!abuffer) {
            PyErr_NoMemory();
            goto fail;
        }
        realbuffer = abuffer;
    }
    else
        realbuffer = buffer;
    /* step 4: fill the buffer */
    /* Since we've analyzed how much space we need for the worst case,
       we don't have to resize the string.
       There can be no errors beyond this point. */
    string = PyUnicode_FromUnicode(NULL, n);
    if (!string)
        goto fail;

    s = PyUnicode_AS_UNICODE(string);
    callresult = callresults;

    for (f = format; *f; f++) {
        if (*f == '%') {
            const char* p = f++;
            int longflag = 0;
979
            int longlongflag = 0;
980 981 982 983
            int size_tflag = 0;
            zeropad = (*f == '0');
            /* parse the width.precision part */
            width = 0;
984
            while (Py_ISDIGIT((unsigned)*f))
985 986 987 988
                width = (width*10) + *f++ - '0';
            precision = 0;
            if (*f == '.') {
                f++;
989
                while (Py_ISDIGIT((unsigned)*f))
990 991
                    precision = (precision*10) + *f++ - '0';
            }
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004
            /* Handle %ld, %lu, %lld and %llu. */
            if (*f == 'l') {
                if (f[1] == 'd' || f[1] == 'u') {
                    longflag = 1;
                    ++f;
                }
#ifdef HAVE_LONG_LONG
                else if (f[1] == 'l' &&
                         (f[2] == 'd' || f[2] == 'u')) {
                    longlongflag = 1;
                    f += 2;
                }
#endif
1005 1006 1007 1008 1009 1010 1011 1012 1013
            }
            /* handle the size_t flag. */
            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
                size_tflag = 1;
                ++f;
            }

            switch (*f) {
            case 'c':
1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
            {
                int ordinal = va_arg(vargs, int);
#ifndef Py_UNICODE_WIDE
                if (ordinal > 0xffff) {
                    ordinal -= 0x10000;
                    *s++ = 0xD800 | (ordinal >> 10);
                    *s++ = 0xDC00 | (ordinal & 0x3FF);
                } else
#endif
                *s++ = ordinal;
1024
                break;
1025
            }
1026
            case 'd':
1027 1028
                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
                        width, precision, 'd');
1029 1030
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1031 1032 1033 1034
#ifdef HAVE_LONG_LONG
                else if (longlongflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
#endif
1035 1036 1037 1038 1039 1040 1041
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'u':
1042 1043
                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
                        width, precision, 'u');
1044 1045
                if (longflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1046 1047 1048 1049 1050
#ifdef HAVE_LONG_LONG
                else if (longlongflag)
                    sprintf(realbuffer, fmt, va_arg(vargs,
                                                    unsigned PY_LONG_LONG));
#endif
1051 1052 1053 1054 1055 1056 1057
                else if (size_tflag)
                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
                else
                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
                appendstring(realbuffer);
                break;
            case 'i':
1058
                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1059 1060 1061 1062
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 'x':
1063
                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1064 1065 1066 1067 1068
                sprintf(realbuffer, fmt, va_arg(vargs, int));
                appendstring(realbuffer);
                break;
            case 's':
            {
1069 1070 1071 1072 1073 1074 1075 1076 1077
                /* unused, since we already have the result */
                (void) va_arg(vargs, char *);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
                                PyUnicode_GET_SIZE(*callresult));
                s += PyUnicode_GET_SIZE(*callresult);
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
                break;
            }
            case 'U':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                s += size;
                break;
            }
            case 'V':
            {
                PyObject *obj = va_arg(vargs, PyObject *);
1091
                va_arg(vargs, const char *);
1092 1093 1094 1095 1096
                if (obj) {
                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
                    s += size;
                } else {
1097 1098 1099 1100
                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
                                    PyUnicode_GET_SIZE(*callresult));
                    s += PyUnicode_GET_SIZE(*callresult);
                    Py_DECREF(*callresult);
1101
                }
1102
                ++callresult;
1103 1104 1105 1106
                break;
            }
            case 'S':
            case 'R':
1107
            case 'A':
1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142
            {
                Py_UNICODE *ucopy;
                Py_ssize_t usize;
                Py_ssize_t upos;
                /* unused, since we already have the result */
                (void) va_arg(vargs, PyObject *);
                ucopy = PyUnicode_AS_UNICODE(*callresult);
                usize = PyUnicode_GET_SIZE(*callresult);
                for (upos = 0; upos<usize;)
                    *s++ = ucopy[upos++];
                /* We're done with the unicode()/repr() => forget it */
                Py_DECREF(*callresult);
                /* switch to next unicode()/repr() result */
                ++callresult;
                break;
            }
            case 'p':
                sprintf(buffer, "%p", va_arg(vargs, void*));
                /* %p is ill-defined:  ensure leading 0x. */
                if (buffer[1] == 'X')
                    buffer[1] = 'x';
                else if (buffer[1] != 'x') {
                    memmove(buffer+2, buffer, strlen(buffer)+1);
                    buffer[0] = '0';
                    buffer[1] = 'x';
                }
                appendstring(buffer);
                break;
            case '%':
                *s++ = '%';
                break;
            default:
                appendstring(p);
                goto end;
            }
1143 1144
        }
        else
1145 1146
            *s++ = *f;
    }
1147

Benjamin Peterson's avatar
Benjamin Peterson committed
1148
  end:
1149 1150 1151 1152 1153 1154
    if (callresults)
        PyObject_Free(callresults);
    if (abuffer)
        PyObject_Free(abuffer);
    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
    return string;
Benjamin Peterson's avatar
Benjamin Peterson committed
1155
  fail:
1156 1157 1158
    if (callresults) {
        PyObject **callresult2 = callresults;
        while (callresult2 < callresult) {
1159
            Py_XDECREF(*callresult2);
1160 1161 1162 1163 1164 1165 1166
            ++callresult2;
        }
        PyObject_Free(callresults);
    }
    if (abuffer)
        PyObject_Free(abuffer);
    return NULL;
1167 1168 1169 1170 1171 1172 1173
}

#undef appendstring

PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
1174 1175
    PyObject* ret;
    va_list vargs;
1176 1177

#ifdef HAVE_STDARG_PROTOTYPES
1178
    va_start(vargs, format);
1179
#else
1180
    va_start(vargs);
1181
#endif
1182 1183 1184
    ret = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    return ret;
1185 1186
}

1187 1188 1189
/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
   convert a Unicode object to a wide character string.

1190
   - If w is NULL: return the number of wide characters (including the null
1191 1192
     character) required to convert the unicode object. Ignore size argument.

1193
   - Otherwise: return the number of wide characters (excluding the null
1194
     character) written into w. Write at most size wide characters (including
1195
     the null character). */
1196
static Py_ssize_t
1197 1198 1199 1200 1201
unicode_aswidechar(PyUnicodeObject *unicode,
                   wchar_t *w,
                   Py_ssize_t size)
{
#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242
    Py_ssize_t res;
    if (w != NULL) {
        res = PyUnicode_GET_SIZE(unicode);
        if (size > res)
            size = res + 1;
        else
            res = size;
        memcpy(w, unicode->str, size * sizeof(wchar_t));
        return res;
    }
    else
        return PyUnicode_GET_SIZE(unicode) + 1;
#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
    register const Py_UNICODE *u;
    const Py_UNICODE *uend;
    const wchar_t *worig, *wend;
    Py_ssize_t nchar;

    u = PyUnicode_AS_UNICODE(unicode);
    uend = u + PyUnicode_GET_SIZE(unicode);
    if (w != NULL) {
        worig = w;
        wend = w + size;
        while (u != uend && w != wend) {
            if (0xD800 <= u[0] && u[0] <= 0xDBFF
                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
            {
                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
                u += 2;
            }
            else {
                *w = *u;
                u++;
            }
            w++;
        }
        if (w != wend)
            *w = L'\0';
        return w - worig;
    }
    else {
1243
        nchar = 1; /* null character at the end */
1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
        while (u != uend) {
            if (0xD800 <= u[0] && u[0] <= 0xDBFF
                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
                u += 2;
            else
                u++;
            nchar++;
        }
    }
    return nchar;
#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
    register Py_UNICODE *u, *uend, ordinal;
1256
    register Py_ssize_t i;
1257 1258 1259
    wchar_t *worig, *wend;
    Py_ssize_t nchar;

1260
    u = PyUnicode_AS_UNICODE(unicode);
1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280
    uend = u + PyUnicode_GET_SIZE(u);
    if (w != NULL) {
        worig = w;
        wend = w + size;
        while (u != uend && w != wend) {
            ordinal = *u;
            if (ordinal > 0xffff) {
                ordinal -= 0x10000;
                *w++ = 0xD800 | (ordinal >> 10);
                *w++ = 0xDC00 | (ordinal & 0x3FF);
            }
            else
                *w++ = ordinal;
            u++;
        }
        if (w != wend)
            *w = 0;
        return w - worig;
    }
    else {
1281
        nchar = 1; /* null character */
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292
        while (u != uend) {
            if (*u > 0xffff)
                nchar += 2;
            else
                nchar++;
            u++;
        }
        return nchar;
    }
#else
#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1293 1294 1295 1296
#endif
}

Py_ssize_t
Martin v. Löwis's avatar
Martin v. Löwis committed
1297
PyUnicode_AsWideChar(PyObject *unicode,
1298 1299
                     wchar_t *w,
                     Py_ssize_t size)
1300 1301
{
    if (unicode == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1302 1303
        PyErr_BadInternalCall();
        return -1;
1304
    }
Martin v. Löwis's avatar
Martin v. Löwis committed
1305
    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
1306
}
1307

1308
wchar_t*
1309
PyUnicode_AsWideCharString(PyObject *unicode,
1310 1311 1312 1313
                           Py_ssize_t *size)
{
    wchar_t* buffer;
    Py_ssize_t buflen;
1314

1315 1316 1317
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return NULL;
1318 1319
    }

1320
    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1321
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1322 1323 1324 1325 1326 1327 1328 1329 1330
        PyErr_NoMemory();
        return NULL;
    }

    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
    if (buffer == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
1331
    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1332 1333
    if (size != NULL)
        *size = buflen;
1334
    return buffer;
1335 1336 1337 1338
}

#endif

1339 1340
PyObject *PyUnicode_FromOrdinal(int ordinal)
{
1341
    Py_UNICODE s[2];
1342 1343

    if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1344 1345 1346
        PyErr_SetString(PyExc_ValueError,
                        "chr() arg not in range(0x110000)");
        return NULL;
1347
    }
1348 1349 1350 1351 1352 1353 1354

#ifndef Py_UNICODE_WIDE
    if (ordinal > 0xffff) {
        ordinal -= 0x10000;
        s[0] = 0xD800 | (ordinal >> 10);
        s[1] = 0xDC00 | (ordinal & 0x3FF);
        return PyUnicode_FromUnicode(s, 2);
1355 1356 1357
    }
#endif

1358 1359
    s[0] = (Py_UNICODE)ordinal;
    return PyUnicode_FromUnicode(s, 1);
1360 1361
}

1362
PyObject *PyUnicode_FromObject(register PyObject *obj)
1363
{
1364
    /* XXX Perhaps we should make this API an alias of
Benjamin Peterson's avatar
Benjamin Peterson committed
1365
       PyObject_Str() instead ?! */
1366
    if (PyUnicode_CheckExact(obj)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1367 1368
        Py_INCREF(obj);
        return obj;
1369 1370
    }
    if (PyUnicode_Check(obj)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1371 1372 1373 1374
        /* For a Unicode subtype that's not a Unicode object,
           return a true Unicode object with the same data. */
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
                                     PyUnicode_GET_SIZE(obj));
1375
    }
1376 1377
    PyErr_Format(PyExc_TypeError,
                 "Can't convert '%.100s' object to str implicitly",
1378
                 Py_TYPE(obj)->tp_name);
1379
    return NULL;
1380 1381 1382
}

PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
Benjamin Peterson's avatar
Benjamin Peterson committed
1383 1384
                                      const char *encoding,
                                      const char *errors)
1385
{
1386
    Py_buffer buffer;
1387
    PyObject *v;
1388

1389
    if (obj == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1390 1391
        PyErr_BadInternalCall();
        return NULL;
1392
    }
1393

1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407
    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) == 0) {
            Py_INCREF(unicode_empty);
            v = (PyObject *) unicode_empty;
        }
        else {
            v = PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        return v;
    }

1408
    if (PyUnicode_Check(obj)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1409 1410 1411
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
1412
    }
1413

1414 1415 1416 1417 1418 1419 1420
    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "coercing to str: need bytes, bytearray "
                     "or buffer-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
1421
    }
1422

1423
    if (buffer.len == 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
1424
        Py_INCREF(unicode_empty);
1425
        v = (PyObject *) unicode_empty;
1426
    }
1427
    else
1428
        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1429

1430
    PyBuffer_Release(&buffer);
1431
    return v;
1432 1433
}

1434
/* Copy `encoding` into `lower`, converting upper case letters to lower case
   and replacing '_' with '-', in order to catch e.g. UTF_8.  `lower_len` is
   the size of the `lower` buffer, including room for the terminating null
   byte.  Return 0 on error (encoding is longer than lower_len-1; `lower` is
   then left unterminated), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* Last slot of the buffer, reserved for the terminating null byte. */
    char *const limit = &lower[lower_len - 1];

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}

/* Decode `size` bytes starting at `s` using `encoding` (the default encoding
   when NULL) and the error handler `errors`.  A handful of common encodings
   are dispatched straight to their C decoders; every other encoding goes
   through the codec registry, wrapping the input in a memoryview.  Returns
   the decoded object, or NULL with an exception set.  A codec that returns
   something other than a str object triggers TypeError. */
PyObject *PyUnicode_Decode(const char *s,
                           Py_ssize_t size,
                           const char *encoding,
                           const char *errors)
{
    PyObject *view;
    PyObject *result;
    Py_buffer info;
    char lower[11];  /* Enough for any encoding shortcut */

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings */
    if (normalize_encoding(encoding, lower, sizeof(lower))) {
        if (strcmp(lower, "utf-8") == 0)
            return PyUnicode_DecodeUTF8(s, size, errors);
        if (strcmp(lower, "latin-1") == 0 ||
            strcmp(lower, "iso-8859-1") == 0)
            return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (strcmp(lower, "mbcs") == 0)
            return PyUnicode_DecodeMBCS(s, size, errors);
#endif
        if (strcmp(lower, "ascii") == 0)
            return PyUnicode_DecodeASCII(s, size, errors);
        if (strcmp(lower, "utf-16") == 0)
            return PyUnicode_DecodeUTF16(s, size, errors, 0);
        if (strcmp(lower, "utf-32") == 0)
            return PyUnicode_DecodeUTF32(s, size, errors, 0);
    }

    /* Decode via the codec registry */
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
        return NULL;
    view = PyMemoryView_FromBuffer(&info);
    if (view == NULL)
        return NULL;

    result = PyCodec_Decode(view, encoding, errors);
    if (result != NULL && !PyUnicode_Check(result)) {
        PyErr_Format(PyExc_TypeError,
                     "decoder did not return a str object (type=%.400s)",
                     Py_TYPE(result)->tp_name);
        Py_DECREF(result);
        result = NULL;
    }

    /* The memoryview is no longer needed, on success or on failure. */
    Py_DECREF(view);
    return result;
}

1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534
/* Run `unicode` through the decoder registered for `encoding` (the default
   encoding when NULL) and return whatever object the codec produces; the
   result type is not checked.  Returns NULL with an exception set when
   `unicode` is not a Unicode object or when the codec fails. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, encoding, errors);
}

1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
/* Run `unicode` through the decoder registered for `encoding` (the default
   encoding when NULL) and require the result to be a str object.  Returns
   NULL with an exception set when `unicode` is not a Unicode object, when
   the codec fails, or (TypeError) when the codec returns a non-str object. */
PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
                                     const char *encoding,
                                     const char *errors)
{
    PyObject *decoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    decoded = PyCodec_Decode(unicode, encoding, errors);
    if (decoded == NULL)
        return NULL;

    if (PyUnicode_Check(decoded))
        return decoded;

    PyErr_Format(PyExc_TypeError,
                 "decoder did not return a str object (type=%.400s)",
                 Py_TYPE(decoded)->tp_name);
    Py_DECREF(decoded);
    return NULL;
}

1578
PyObject *PyUnicode_Encode(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
1579 1580 1581
                           Py_ssize_t size,
                           const char *encoding,
                           const char *errors)
1582 1583
{
    PyObject *v, *unicode;
1584

1585 1586
    unicode = PyUnicode_FromUnicode(s, size);
    if (unicode == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
1587
        return NULL;
1588 1589 1590 1591 1592
    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
    Py_DECREF(unicode);
    return v;
}

1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604
/* Run `unicode` through the encoder registered for `encoding` (the default
   encoding when NULL) and return whatever object the codec produces; the
   result type is not checked.  Returns NULL with an exception set when
   `unicode` is not a Unicode object or when the codec fails. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    return PyCodec_Encode(unicode, encoding, errors);
}

1617 1618
/* Encode `unicode` for use as a file name.
   - Windows with a usable wchar_t: MBCS encoder, errors == NULL.
   - Mac OS X: UTF-8 encoder with "surrogateescape".
   - Elsewhere: Py_FileSystemDefaultEncoding with "surrogateescape" once the
     filesystem codec is initialized, otherwise the C locale encoder
     (_Py_wchar2char()). */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
#elif defined(__APPLE__)
    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    wchar_t *wide;
    char *encoded;
    PyObject *result;
    size_t error_pos;

    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded. Load
       the Python codec requires to encode at least its own filename. Use the C
       version of the locale codec until the codec registry is initialized and
       the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, we
       cannot only rely on it: check also interp->fscodec_initialized for
       subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
        return PyUnicode_AsEncodedString(unicode,
                                         Py_FileSystemDefaultEncoding,
                                         "surrogateescape");
    }

    /* locale encoding with surrogateescape */
    wide = PyUnicode_AsWideCharString(unicode, NULL);
    if (wide == NULL)
        return NULL;

    encoded = _Py_wchar2char(wide, &error_pos);
    if (encoded != NULL) {
        PyMem_Free(wide);
        result = PyBytes_FromString(encoded);
        PyMem_Free(encoded);
        return result;
    }

    /* Conversion failed: error_pos == (size_t)-1 is reported as a memory
       error, any other value as an encode error at that position. */
    if (error_pos == (size_t)-1) {
        PyErr_NoMemory();
    }
    else {
        /* errno is read before anything else is called. */
        char *errmsg = strerror(errno);
        PyObject *exc = NULL;
        if (errmsg == NULL)
            errmsg = "Py_wchar2char() failed";
        raise_encode_exception(&exc,
            "filesystemencoding",
            PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
            error_pos, error_pos+1,
            errmsg);
        Py_XDECREF(exc);
    }
    PyMem_Free(wide);
    return NULL;
#endif
}

1682 1683 1684 1685 1686
/* Encode a str object into a bytes object.

   unicode  -- object to encode; must pass PyUnicode_Check().
   encoding -- codec name, or NULL to use PyUnicode_GetDefaultEncoding().
   errors   -- error handler name, handed to the encoder unchanged.

   Returns the encoder's result as a bytes object, or NULL with an
   exception set. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;
    char shortname[11];  /* Enough for any encoding shortcut */

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Shortcuts for common default encodings: call the C encoders
       directly instead of going through the codec registry. */
    if (normalize_encoding(encoding, shortname, sizeof(shortname))) {
        if (!strcmp(shortname, "utf-8"))
            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                        PyUnicode_GET_SIZE(unicode),
                                        errors);
        if (!strcmp(shortname, "latin-1") ||
            !strcmp(shortname, "iso-8859-1"))
            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
                                          PyUnicode_GET_SIZE(unicode),
                                          errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (!strcmp(shortname, "mbcs"))
            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                        PyUnicode_GET_SIZE(unicode),
                                        errors);
#endif
        if (!strcmp(shortname, "ascii"))
            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                         PyUnicode_GET_SIZE(unicode),
                                         errors);
    }

    /* Bootstrap problem: finding the encodings package may require the
       file system encoding, while loading the file system encoding
       requires the encodings package.

       Break the cycle by assuming that the path to the encodings module
       is ASCII-only.  XXX could try wcstombs instead, if the file system
       encoding is the locale's encoding. */
    if (Py_FileSystemDefaultEncoding &&
        !strcmp(encoding, Py_FileSystemDefaultEncoding) &&
        !PyThreadState_GET()->interp->codecs_initialized)
        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     errors);

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(encoded))
        return encoded;

    /* A bytearray result is tolerated: warn, then copy it into bytes.
       If issuing the warning fails, the copy is skipped and NULL is
       returned. */
    if (PyByteArray_Check(encoded)) {
        PyObject *copy = NULL;

        if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
                "encoder %s returned bytearray instead of bytes",
                encoding) == 0)
            copy = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                             Py_SIZE(encoded));
        Py_DECREF(encoded);
        return copy;
    }

    /* Anything else is a codec bug */
    PyErr_Format(PyExc_TypeError,
                 "encoder did not return a bytes object (type=%.400s)",
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}

/* Run a str object through a codec whose encoder yields another str.

   unicode  -- object to encode; must pass PyUnicode_Check().
   encoding -- codec name, or NULL to use PyUnicode_GetDefaultEncoding().
   errors   -- error handler name, handed to the codec unchanged.

   Returns the codec's str result, or NULL with an exception set (a
   TypeError if the codec returned something that is not a str). */
PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
                                     const char *encoding,
                                     const char *errors)
{
    PyObject *result;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry */
    result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL)
        return NULL;
    if (PyUnicode_Check(result))
        return result;

    PyErr_Format(PyExc_TypeError,
                 "encoder did not return an str object (type=%.400s)",
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}

1800
/* Return the UTF-8 encoded form of a str object, caching it on the object.

   unicode -- cast to PyUnicodeObject without a type check here, so the
              caller must pass a str object.
   errors  -- must be NULL; a non-NULL value is a fatal error when the
              encoded form has to be computed.

   The first call encodes the string with PyUnicode_EncodeUTF8() and stores
   the result in the object's defenc slot; later calls return that stored
   pointer.  No additional reference is taken on the value returned.
   Returns NULL with an exception set if encoding fails. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
    if (v)
        return v;
    if (errors != NULL)
        Py_FatalError("non-NULL errors in _PyUnicode_AsDefaultEncodedString");
    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                             PyUnicode_GET_SIZE(unicode),
                             NULL);
    if (!v)
        return NULL;
    /* Cache the encoded form for subsequent calls */
    ((PyUnicodeObject *)unicode)->defenc = v;
    return v;
}

1817
PyObject*
1818
PyUnicode_DecodeFSDefault(const char *s) {
1819
    Py_ssize_t size = (Py_ssize_t)strlen(s);
1820 1821
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
}
1822

1823 1824 1825
/* Decode 'size' bytes at 's' using the file system decoding rules.

   - Windows (with a usable wchar_t): the MBCS decoder.
   - Mac OS X: UTF-8 with the "surrogateescape" error handler.
   - Elsewhere: Py_FileSystemDefaultEncoding with "surrogateescape" once the
     interpreter's filesystem codec is initialized; before that, the C
     locale decoder (_Py_char2wchar).

   Returns a new str object, or NULL with an exception set. */
PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
{
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
    return PyUnicode_DecodeMBCS(s, size, NULL);
#elif defined(__APPLE__)
    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    wchar_t *wide;
    PyObject *result;
    size_t wide_len;

    /* Bootstrap check: a filesystem codec implemented in Python cannot be
       used to encode or decode filenames before it has been loaded, and
       loading it requires encoding at least its own filename.  Until the
       codec registry is initialized and the Python codec is loaded, use
       the C version of the locale codec instead.

       Py_FileSystemDefaultEncoding is shared between all interpreters, so
       it is not sufficient on its own: interp->fscodec_initialized is
       checked as well for subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized)
        return PyUnicode_Decode(s, size,
                                Py_FileSystemDefaultEncoding,
                                "surrogateescape");

    /* locale encoding with surrogateescape; the input must be a proper C
       string of exactly 'size' bytes */
    if (s[size] != '\0' || (size_t)size != strlen(s)) {
        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
        return NULL;
    }

    wide = _Py_char2wchar(s, &wide_len);
    if (wide == NULL)
        return PyErr_NoMemory();

    result = PyUnicode_FromWideChar(wide, wide_len);
    PyMem_Free(wide);
    return result;
#endif
}

1868

1869 1870 1871 1872 1873 1874 1875 1876 1877 1878
/* Test whether the str object 's' contains a NUL character.

   Returns the result of PyUnicode_Contains() against a one-character
   "\0" string, or -1 if that string could not be created.  The "\0"
   string is created on first use and kept in a function-local static. */
int
_PyUnicode_HasNULChars(PyObject* s)
{
    static PyObject *nul_string = NULL;

    if (nul_string == NULL) {
        nul_string = PyUnicode_FromStringAndSize("\0", 1);
        if (nul_string == NULL)
            return -1;
    }
    return PyUnicode_Contains(s, nul_string);
}

1881 1882 1883 1884 1885 1886 1887

/* Converter producing a bytes object from a bytes or str-like argument.

   arg  -- the object to convert; NULL requests cleanup of a previous
           result.
   addr -- address of a PyObject* slot that receives the result.

   A bytes argument is used as is (with a new reference); anything else is
   turned into a str with PyUnicode_FromObject() and encoded with
   PyUnicode_EncodeFSDefault().  The result must not contain an embedded
   NUL byte.

   Returns Py_CLEANUP_SUPPORTED on success, 0 on failure with an exception
   set, and 1 after a cleanup call. */
int
PyUnicode_FSConverter(PyObject* arg, void* addr)
{
    PyObject **slot = (PyObject **)addr;
    PyObject *output;
    const char *data;
    Py_ssize_t size;

    if (arg == NULL) {
        /* cleanup call: release the object stored by an earlier call */
        Py_DECREF(*slot);
        return 1;
    }

    if (PyBytes_Check(arg)) {
        Py_INCREF(arg);
        output = arg;
    }
    else {
        PyObject *text = PyUnicode_FromObject(arg);
        if (text == NULL)
            return 0;
        output = PyUnicode_EncodeFSDefault(text);
        Py_DECREF(text);
        if (output == NULL)
            return 0;
        if (!PyBytes_Check(output)) {
            Py_DECREF(output);
            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
            return 0;
        }
    }

    /* The C string length must equal the object size, otherwise there is
       a NUL byte somewhere inside the data. */
    size = PyBytes_GET_SIZE(output);
    data = PyBytes_AS_STRING(output);
    if ((size_t)size != strlen(data)) {
        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
        Py_DECREF(output);
        return 0;
    }

    *slot = output;
    return Py_CLEANUP_SUPPORTED;
}


1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934
/* Converter producing a str object from a str or bytes-like argument.

   arg  -- the object to convert; NULL requests cleanup of a previous
           result.
   addr -- address of a PyObject* slot that receives the result.

   A str argument is used as is (with a new reference); anything else is
   turned into bytes with PyBytes_FromObject() and decoded with
   PyUnicode_DecodeFSDefaultAndSize().  The result must not contain an
   embedded NUL character.

   Returns Py_CLEANUP_SUPPORTED on success, 0 on failure with an exception
   set, and 1 after a cleanup call. */
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
    PyObject **slot = (PyObject **)addr;
    PyObject *output;
    Py_UNICODE *data;
    Py_ssize_t size;

    if (arg == NULL) {
        /* cleanup call: release the object stored by an earlier call */
        Py_DECREF(*slot);
        return 1;
    }

    if (PyUnicode_Check(arg)) {
        Py_INCREF(arg);
        output = arg;
    }
    else {
        PyObject *raw = PyBytes_FromObject(arg);
        if (raw == NULL)
            return 0;
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(raw),
                                                  PyBytes_GET_SIZE(raw));
        Py_DECREF(raw);
        if (output == NULL)
            return 0;
        if (!PyUnicode_Check(output)) {
            Py_DECREF(output);
            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
            return 0;
        }
    }

    /* The NUL-terminated length must equal the object size, otherwise
       there is a NUL character somewhere inside the string. */
    size = PyUnicode_GET_SIZE(output);
    data = PyUnicode_AS_UNICODE(output);
    if ((size_t)size != Py_UNICODE_strlen(data)) {
        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
        Py_DECREF(output);
        return 0;
    }

    *slot = output;
    return Py_CLEANUP_SUPPORTED;
}


1963
char*
1964
_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1965
{
1966
    PyObject *bytes;
1967 1968 1969 1970
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
1971 1972
    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
    if (bytes == NULL)
1973
        return NULL;
1974
    if (psize != NULL)
1975 1976
        *psize = PyBytes_GET_SIZE(bytes);
    return PyBytes_AS_STRING(bytes);
1977 1978 1979
}

char*
1980
_PyUnicode_AsString(PyObject *unicode)
1981
{
1982
    return _PyUnicode_AsStringAndSize(unicode, NULL);
1983 1984
}

1985 1986 1987 1988 1989 1990 1991 1992
/* Return the Py_UNICODE buffer of a str object (PyUnicode_AS_UNICODE),
   or NULL after PyErr_BadArgument() if the argument is not a str. */
Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_AS_UNICODE(unicode);

    PyErr_BadArgument();
    return NULL;
}

Martin v. Löwis's avatar
Martin v. Löwis committed
1997
/* Return the size of a str object (PyUnicode_GET_SIZE), or -1 after
   PyErr_BadArgument() if the argument is not a str. */
Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_GET_SIZE(unicode);

    PyErr_BadArgument();
    return -1;
}

2009
/* Return the name of the default encoding.  The value is the constant
   string "utf-8"; callers must not modify or free it. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}

2014 2015 2016 2017 2018 2019 2020
/* Create a UnicodeDecodeError in *exceptionObject, or update an existing
   one.

   If *exceptionObject is NULL a new exception is built from all arguments
   with PyUnicodeDecodeError_Create().  Otherwise only its start, end and
   reason are replaced; if any of those updates fails, the exception is
   released and *exceptionObject is reset to NULL.

   Callers detect failure by checking *exceptionObject for NULL. */
static void
make_decode_exception(PyObject **exceptionObject,
                      const char *encoding,
                      const char *input, Py_ssize_t length,
                      Py_ssize_t startpos, Py_ssize_t endpos,
                      const char *reason)
{
    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, length, startpos, endpos, reason);
        return;
    }

    /* The setters are tried in order; the first failure stops the chain */
    if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos) ||
        PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos) ||
        PyUnicodeDecodeError_SetReason(*exceptionObject, reason)) {
        Py_DECREF(*exceptionObject);
        *exceptionObject = NULL;
    }
}

2041 2042
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler   -- handler name and cache slot; the handler is
                             looked up with PyCodec_LookupError() on first
                             use and stored in *errorHandler.
   encoding, reason       -- stored in the UnicodeDecodeError.
   input, inend           -- start/end of the input; refreshed from the
                             exception's object after the callback ran.
   startinpos, endinpos   -- input range of the error; *endinpos is set to
                             the position returned by the handler.
   exceptionObject        -- created or updated via make_decode_exception().
   inptr                  -- set to the input position where decoding
                             should resume.
   output, outpos, outptr -- output string (resized if needed), current
                             output index and write pointer; advanced past
                             the replacement text.
*/

static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
                                     const char *encoding, const char *reason,
                                     const char **input, const char **inend, Py_ssize_t *startinpos,
                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
    /* The text after the ';' (offset 4) doubles as the TypeError message */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    Py_UNICODE *repptr;
    PyObject *inputobj = NULL;
    Py_ssize_t repsize;
    int res = -1;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on and treat a non-bytes object as bytes: release our
           reference and report the error. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    repptr = PyUnicode_AS_UNICODE(repunicode);
    repsize = PyUnicode_GET_SIZE(repunicode);
    requiredsize = *outpos + repsize + insize-newpos;
    if (requiredsize > outsize) {
        if (requiredsize<2*outsize)
            requiredsize = 2*outsize;
        if (_PyUnicode_Resize(output, requiredsize) < 0)
            goto onError;
        /* the buffer may have moved: recompute the write pointer */
        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
    }
    *endinpos = newpos;
    *inptr = *input + newpos;
    Py_UNICODE_COPY(*outptr, repptr, repsize);
    *outptr += repsize;
    *outpos += repsize;

    /* we made it! */
    res = 0;

  onError:
    Py_XDECREF(restuple);
    return res;
}

2142 2143
/* --- UTF-7 Codec -------------------------------------------------------- */

2144
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2145

2146
/* Three simple macros defining base-64. */
2147

2148
/* Is c a base-64 character? */
2149

2150 2151 2152 2153 2154
/* Nonzero iff c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated more than once. */
#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ('0' <= (c) && (c) <= '9') ||              \
     ('A' <= (c) && (c) <= 'Z') ||              \
     ('a' <= (c) && (c) <= 'z'))
2155

2156
/* given that c is a base-64 character, what is its base-64 value? */
2157

2158 2159 2160 2161 2162 2163 2164 2165 2166
/* Value (0..63) of the base-64 character c: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62, anything else -> 63.  The argument is
   evaluated more than once. */
#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* What is the base-64 character of the bottom 6 bits of n? */

/* Only the low 6 bits of n are used as the index into the base-64
   alphabet; the result is a char. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

/* True for every value below 128 except '+'.  The argument is evaluated
   more than once. */
#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This read-only table, indexed by ASCII
 * code (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

/* True when c (which must lie in 1..127 to qualify) may be emitted as
   itself: category 0 always, category 1 only if directO, category 2 only
   if directWS.  Looks c up in utf7_category; c is evaluated more than
   once. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      ((directO) && utf7_category[(c)] == 1) ||         \
      ((directWS) && utf7_category[(c)] == 2)))
2222 2223

PyObject *PyUnicode_DecodeUTF7(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
2224 2225
                               Py_ssize_t size,
                               const char *errors)
2226 2227 2228 2229
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}

2230 2231 2232 2233 2234 2235 2236
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate). */

2237
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
2238 2239 2240
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
2241
{
2242
    const char *starts = s;
Martin v. Löwis's avatar
Martin v. Löwis committed
2243 2244 2245
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
2246 2247 2248 2249 2250
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
2251 2252 2253 2254
    Py_UNICODE *shiftOutStart;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    Py_UNICODE surrogate = 0;
2255 2256
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
2257 2258 2259 2260

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
2261 2262 2263
    if (size == 0) {
        if (consumed)
            *consumed = 0;
2264
        return (PyObject *)unicode;
2265
    }
2266 2267

    p = unicode->str;
2268
    shiftOutStart = p;
2269 2270 2271
    e = s + size;

    while (s < e) {
2272
        Py_UNICODE ch;
Benjamin Peterson's avatar
Benjamin Peterson committed
2273
      restart:
2274
        ch = (unsigned char) *s;
2275

2276 2277 2278 2279
        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
2280
                s++;
2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
2298
                            continue;
2299 2300
                        }
                        else {
2301
                            *p++ = surrogate;
2302 2303 2304
                            surrogate = 0;
                        }
                    }
2305
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2306 2307 2308 2309 2310 2311
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
2312
                }
2313 2314 2315 2316 2317
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
2318 2319
                    *p++ = surrogate;
                    surrogate = 0;
2320
                }
2321 2322 2323 2324 2325
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
2326
                    }
2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
2338 2339 2340 2341 2342
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
2343
            startinpos = s-starts;
2344 2345
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
2346 2347
                s++;
                *p++ = '+';
2348 2349
            }
            else { /* begin base64-encoded section */
2350
                inShift = 1;
2351 2352
                shiftOutStart = p;
                base64bits = 0;
2353 2354
            }
        }
2355 2356
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
2357 2358 2359
            s++;
        }
        else {
2360
            startinpos = s-starts;
2361
            s++;
2362 2363
            errmsg = "unexpected special character";
            goto utf7Error;
2364 2365
        }
        continue;
2366
utf7Error:
2367 2368 2369
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
2370 2371 2372 2373 2374
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
2375 2376
    }

2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394
    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
            if (s < e)
                goto restart;
        }
2395
    }
2396 2397

    /* return state */
2398
    if (consumed) {
2399 2400
        if (inShift) {
            p = shiftOutStart; /* back off output */
2401
            *consumed = startinpos;
2402 2403
        }
        else {
2404
            *consumed = s-starts;
2405
        }
2406
    }
2407

2408
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2409 2410
        goto onError;

2411 2412
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
2413 2414
    return (PyObject *)unicode;

Benjamin Peterson's avatar
Benjamin Peterson committed
2415
  onError:
2416 2417
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
2418 2419 2420 2421 2422 2423
    Py_DECREF(unicode);
    return NULL;
}


PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
2424
                               Py_ssize_t size,
2425 2426
                               int base64SetO,
                               int base64WhiteSpace,
Benjamin Peterson's avatar
Benjamin Peterson committed
2427
                               const char *errors)
2428
{
2429
    PyObject *v;
2430
    /* It might be possible to tighten this worst case */
2431
    Py_ssize_t allocated = 8 * size;
2432
    int inShift = 0;
Martin v. Löwis's avatar
Martin v. Löwis committed
2433
    Py_ssize_t i = 0;
2434 2435
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
2436 2437 2438 2439
    char * out;
    char * start;

    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
2440
        return PyBytes_FromStringAndSize(NULL, 0);
2441

2442
    if (allocated / 8 != size)
Neal Norwitz's avatar
Neal Norwitz committed
2443 2444
        return PyErr_NoMemory();

2445
    v = PyBytes_FromStringAndSize(NULL, allocated);
2446 2447 2448
    if (v == NULL)
        return NULL;

2449
    start = out = PyBytes_AS_STRING(v);
2450 2451 2452
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

2453 2454 2455 2456 2457 2458 2459 2460 2461
        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
2462 2463
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
2464
                if (IS_BASE64(ch) || ch == '-') {
2465 2466 2467
                    *out++ = '-';
                }
                *out++ = (char) ch;
2468 2469 2470 2471 2472 2473 2474 2475
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                *out++ = '+';
2476
                        *out++ = '-';
2477 2478 2479 2480 2481 2482 2483 2484
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                *out++ = '+';
                inShift = 1;
                goto encode_char;
2485
            }
2486
        }
2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507
        continue;
encode_char:
#ifdef Py_UNICODE_WIDE
        if (ch >= 0x10000) {
            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
        }
#endif
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
2508
    }
2509 2510 2511
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    if (inShift)
2512
        *out++ = '-';
2513 2514 2515
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
2516 2517
}

2518 2519 2520 2521 2522
/* Drop the UTF-7 helper macros defined earlier in this section; the UTF-7
   codec functions that use them end here. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
2523

2524 2525
/* --- UTF-8 Codec -------------------------------------------------------- */

2526
/* Read-only table mapping a UTF-8 encoded prefix (first) byte to the
   length of the sequence it starts.  Zero means illegal prefix.  See
   RFC 3629 for details. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};

PyObject *PyUnicode_DecodeUTF8(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
2549 2550
                               Py_ssize_t size,
                               const char *errors)
2551 2552 2553 2554
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}

2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it composes safely with
   whatever operator surrounds it (e.g. `~LONG_PTR_MASK`). */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: it has the top bit of every byte set.
   The constants are unsigned because they are and-ed with
   `unsigned long` data. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

2568
/* Decode SIZE bytes of UTF-8 data starting at S into a new unicode object.

   errors    error handling scheme; it is handed to the decode error
             handler whenever malformed input is found.
   consumed  if NULL, a sequence that is cut off by the end of the input is
             reported as an "unexpected end of data" error.  If non-NULL,
             decoding stops in front of such a sequence instead, and
             *consumed receives the number of input bytes processed.

   Returns the decoded object, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e, *aligned_end;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_UNICODE *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long data = *(const unsigned long *) _s;
                    if (data & ASCII_CHAR_MASK)
                        break;
                    _p[0] = (unsigned char) _s[0];
                    _p[1] = (unsigned char) _s[1];
                    _p[2] = (unsigned char) _s[2];
                    _p[3] = (unsigned char) _s[3];
#if (SIZEOF_LONG == 8)
                    _p[4] = (unsigned char) _s[4];
                    _p[5] = (unsigned char) _s[5];
                    _p[6] = (unsigned char) _s[6];
                    _p[7] = (unsigned char) _s[7];
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == e)
                    break;
                /* The word loop stopped early: reload the current byte. */
                ch = (unsigned char)*s;
            }
        }

        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence announced by the lead byte does not fit into
               the remaining input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* Extend the error range over the continuation bytes that
                   are present.  The bound is taken from `e`, not from
                   `size`: the error handler call below receives `starts`
                   and `e` by address, so only they describe the buffer
                   that is currently being decoded. */
                for (k=1; (k < e - s) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above, and no byte from 0x80
               up maps to length 1. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* Same idea as in the three-byte case: the error range
                   grows by one for each leading continuation byte. */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      utf8Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
        /* `e` was passed by address above, so refresh the value that was
           derived from it. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}

2790 2791
#undef ASCII_CHAR_MASK

2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes SIZE bytes starting at S into a freshly PyMem_Malloc()ed,
   NUL-terminated wchar_t array.  Every byte that is not part of a valid
   sequence is emitted as the single code unit 0xDC00 + byte and decoding
   resumes at the following byte.  Returns NULL if the buffer cannot be
   allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Written as "size > max - 1" rather than "size + 1 > max" so that the
       guard itself cannot overflow. */
    if ((size_t)size > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        if (s + n > e) {
            /* truncated sequence */
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* The output buffer holds wchar_t, like every other store in
               this function. */
            *p++ = (wchar_t)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* `ch` still holds the undecodable byte here: it is only
           overwritten after a sequence has passed validation. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
2906

2907 2908 2909 2910
/* Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  Else allocate the
   maximum possible needed (4 result bytes per Unicode character), and return
   the excess memory at the end.
Martin v. Löwis's avatar
Martin v. Löwis committed
2911
*/
2912 2913
PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
2914 2915
                     Py_ssize_t size,
                     const char *errors)
2916
{
2917 2918
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

2919 2920 2921 2922 2923
    Py_ssize_t i;                /* index into s of next input byte */
    PyObject *result;            /* result string object */
    char *p;                     /* next free byte in output buffer */
    Py_ssize_t nallocated;      /* number of result bytes allocated */
    Py_ssize_t nneeded;            /* number of result bytes needed */
2924
    char stackbuf[MAX_SHORT_UNICHARS * 4];
2925 2926
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
2927 2928 2929 2930 2931 2932 2933 2934 2935 2936

    assert(s != NULL);
    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2937
        result = NULL;   /* will allocate after we're done */
2938 2939 2940 2941 2942 2943 2944
        p = stackbuf;
    }
    else {
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * 4;
        if (nallocated / 4 != size)  /* overflow! */
            return PyErr_NoMemory();
2945
        result = PyBytes_FromStringAndSize(NULL, nallocated);
2946
        if (result == NULL)
2947
            return NULL;
2948
        p = PyBytes_AS_STRING(result);
2949
    }
Martin v. Löwis's avatar
Martin v. Löwis committed
2950

2951
    for (i = 0; i < size;) {
2952
        Py_UCS4 ch = s[i++];
2953

Martin v. Löwis's avatar
Martin v. Löwis committed
2954
        if (ch < 0x80)
2955
            /* Encode ASCII */
2956
            *p++ = (char) ch;
2957

2958
        else if (ch < 0x0800) {
2959
            /* Encode Latin-1 */
Marc-André Lemburg's avatar
Marc-André Lemburg committed
2960 2961
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
2962
        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2963
#ifndef Py_UNICODE_WIDE
2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976
            /* Special case: check for high and low surrogate */
            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
                Py_UCS4 ch2 = s[i];
                /* Combine the two surrogates to form a UCS4 value */
                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
                i++;

                /* Encode UCS4 Unicode ordinals */
                *p++ = (char)(0xf0 | (ch >> 18));
                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
                *p++ = (char)(0x80 | (ch & 0x3f));
            } else {
2977
#endif
2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002
                Py_ssize_t newpos;
                PyObject *rep;
                Py_ssize_t repsize, k;
                rep = unicode_encode_call_errorhandler
                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
                     s, size, &exc, i-1, i, &newpos);
                if (!rep)
                    goto error;

                if (PyBytes_Check(rep))
                    repsize = PyBytes_GET_SIZE(rep);
                else
                    repsize = PyUnicode_GET_SIZE(rep);

                if (repsize > 4) {
                    Py_ssize_t offset;

                    if (result == NULL)
                        offset = p - stackbuf;
                    else
                        offset = p - PyBytes_AS_STRING(result);

                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
                        /* integer overflow */
                        PyErr_NoMemory();
3003 3004
                        goto error;
                    }
3005 3006 3007 3008 3009 3010 3011 3012 3013
                    nallocated += repsize - 4;
                    if (result != NULL) {
                        if (_PyBytes_Resize(&result, nallocated) < 0)
                            goto error;
                    } else {
                        result = PyBytes_FromStringAndSize(NULL, nallocated);
                        if (result == NULL)
                            goto error;
                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3014
                    }
3015 3016 3017 3018 3019 3020
                    p = PyBytes_AS_STRING(result) + offset;
                }

                if (PyBytes_Check(rep)) {
                    char *prep = PyBytes_AS_STRING(rep);
                    for(k = repsize; k > 0; k--)
3021
                        *p++ = *prep++;
3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034
                } else /* rep is unicode */ {
                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
                    Py_UNICODE c;

                    for(k=0; k<repsize; k++) {
                        c = prep[k];
                        if (0x80 <= c) {
                            raise_encode_exception(&exc, "utf-8", s, size,
                                                   i-1, i, "surrogates not allowed");
                            goto error;
                        }
                        *p++ = (char)prep[k];
                    }
3035
                }
3036
                Py_DECREF(rep);
3037
#ifndef Py_UNICODE_WIDE
3038
            }
3039
#endif
3040 3041 3042 3043 3044
        } else if (ch < 0x10000) {
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        } else /* ch >= 0x10000 */ {
3045 3046 3047 3048 3049 3050
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
3051
    }
3052

3053
    if (result == NULL) {
3054
        /* This was stack allocated. */
3055
        nneeded = p - stackbuf;
3056
        assert(nneeded <= nallocated);
3057
        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
3058 3059
    }
    else {
3060
        /* Cut back to size actually needed. */
3061
        nneeded = p - PyBytes_AS_STRING(result);
3062
        assert(nneeded <= nallocated);
3063
        _PyBytes_Resize(&result, nneeded);
3064
    }
3065 3066
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
3067
    return result;
3068 3069 3070 3071 3072
 error:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(result);
    return NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
3073

3074
#undef MAX_SHORT_UNICHARS
3075 3076 3077 3078 3079 3080 3081 3082
}

/* Encode a unicode object as UTF-8 by handing its buffer and length to
   PyUnicode_EncodeUTF8() with a NULL `errors` argument.  Anything that
   fails PyUnicode_Check() is rejected through PyErr_BadArgument() and
   yields NULL. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    data = PyUnicode_AS_UNICODE(unicode);
    length = PyUnicode_GET_SIZE(unicode);
    return PyUnicode_EncodeUTF8(data, length, NULL);
}

3088 3089 3090 3091
/* --- UTF-32 Codec ------------------------------------------------------- */

PyObject *
PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
3092 3093 3094
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
3095 3096 3097 3098 3099 3100
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}

/* Decode SIZE bytes of UTF-32 data starting at S into a new unicode object.

   errors     error handling scheme; it is handed to the decode error
              handler for truncated data and out-of-range code points.
   byteorder  may be NULL.  On entry *byteorder selects the byte order:
              -1 forces little endian, 1 forces big endian, 0 means native
              order unless a leading BOM says otherwise.  On return it
              receives the byte order that was used.
   consumed   if NULL, trailing bytes that do not form a complete 4-byte
              unit are reported as a "truncated data" error.  If non-NULL,
              decoding stops in front of them and *consumed receives the
              number of input bytes processed.

   Returns the decoded object, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
    int pairs = 0;
    const unsigned char *qq;
#else
    const int pairs = 0;
#endif
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
#else
    int iorder[] = {3, 2, 1, 0};
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 4) {
            /* Each byte is widened to Py_UCS4 before shifting: shifting the
               promoted (signed) int left by 24 would overflow for bytes
               >= 0x80. */
            const Py_UCS4 bom = ((Py_UCS4)q[iorder[3]] << 24) |
                ((Py_UCS4)q[iorder[2]] << 16) |
                ((Py_UCS4)q[iorder[1]] << 8) | (Py_UCS4)q[iorder[0]];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0x0000FEFF) {
                q += 4;
                bo = -1;
            }
            else if (bom == 0xFFFE0000) {
                q += 4;
                bo = 1;
            }
#else
            if (bom == 0x0000FEFF) {
                q += 4;
                bo = 1;
            }
            else if (bom == 0xFFFE0000) {
                q += 4;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        iorder[0] = 0;
        iorder[1] = 1;
        iorder[2] = 2;
        iorder[3] = 3;
    }
    else if (bo == 1) {
        /* force BE */
        iorder[0] = 3;
        iorder[1] = 2;
        iorder[2] = 1;
        iorder[3] = 0;
    }

    /* On narrow builds we split characters outside the BMP into two
       codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
    /* Only complete 4-byte units are inspected, so a truncated tail is
       never read past the end of the input. */
    for (qq = q; e - qq >= 4; qq += 4)
        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
            pairs++;
#endif

    /* This might be one too much, because of a BOM */
    unicode = _PyUnicode_New((size+3)/4+pairs);
    if (!unicode)
        return NULL;
    if (size == 0) {
        /* Nothing to decode; report that no input was used, as the UTF-8
           decoder does. */
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-32 encoded data */
    p = unicode->str;

    while (q < e) {
        Py_UCS4 ch;
        /* remaining bytes at the end? (size should be divisible by 4) */
        if (e-q<4) {
            if (consumed)
                break;
            errmsg = "truncated data";
            startinpos = ((const char *)q)-starts;
            endinpos = ((const char *)e)-starts;
            goto utf32Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        ch = ((Py_UCS4)q[iorder[3]] << 24) | ((Py_UCS4)q[iorder[2]] << 16) |
            ((Py_UCS4)q[iorder[1]] << 8) | (Py_UCS4)q[iorder[0]];

        if (ch >= 0x110000)
        {
            errmsg = "codepoint not in range(0x110000)";
            startinpos = ((const char *)q)-starts;
            endinpos = startinpos+4;
            goto utf32Error;
        }
#ifndef Py_UNICODE_WIDE
        if (ch >= 0x10000)
        {
            /* split into a surrogate pair */
            *p++ = 0xD800 | ((ch-0x10000) >> 10);
            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
        }
        else
#endif
            *p++ = ch;
        q += 4;
        continue;
      utf32Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf32", errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &unicode, &outpos, &p))
            goto onError;
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
3266 3267 3268
                      Py_ssize_t size,
                      const char *errors,
                      int byteorder)
3269
{
3270
    PyObject *v;
3271
    unsigned char *p;
Neal Norwitz's avatar
Neal Norwitz committed
3272
    Py_ssize_t nsize, bytesize;
3273
#ifndef Py_UNICODE_WIDE
Neal Norwitz's avatar
Neal Norwitz committed
3274
    Py_ssize_t i, pairs;
3275 3276 3277 3278 3279 3280 3281 3282 3283 3284
#else
    const int pairs = 0;
#endif
    /* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int iorder[] = {0, 1, 2, 3};
#else
    int iorder[] = {3, 2, 1, 0};
#endif

Benjamin Peterson's avatar
Benjamin Peterson committed
3285 3286 3287 3288 3289 3290 3291
#define STORECHAR(CH)                           \
    do {                                        \
        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
        p[iorder[0]] = (CH) & 0xff;             \
        p += 4;                                 \
3292 3293 3294 3295 3296 3297
    } while(0)

    /* In narrow builds we can output surrogate pairs as one codepoint,
       so we need less space. */
#ifndef Py_UNICODE_WIDE
    for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson's avatar
Benjamin Peterson committed
3298 3299 3300
        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
            pairs++;
3301
#endif
Neal Norwitz's avatar
Neal Norwitz committed
3302 3303 3304
    nsize = (size - pairs + (byteorder == 0));
    bytesize = nsize * 4;
    if (bytesize / 4 != nsize)
Benjamin Peterson's avatar
Benjamin Peterson committed
3305
        return PyErr_NoMemory();
3306
    v = PyBytes_FromStringAndSize(NULL, bytesize);
3307 3308 3309
    if (v == NULL)
        return NULL;

3310
    p = (unsigned char *)PyBytes_AS_STRING(v);
3311
    if (byteorder == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
3312
        STORECHAR(0xFEFF);
3313
    if (size == 0)
3314
        goto done;
3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331

    if (byteorder == -1) {
        /* force LE */
        iorder[0] = 0;
        iorder[1] = 1;
        iorder[2] = 2;
        iorder[3] = 3;
    }
    else if (byteorder == 1) {
        /* force BE */
        iorder[0] = 3;
        iorder[1] = 2;
        iorder[2] = 1;
        iorder[3] = 0;
    }

    while (size-- > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
3332
        Py_UCS4 ch = *s++;
3333
#ifndef Py_UNICODE_WIDE
Benjamin Peterson's avatar
Benjamin Peterson committed
3334 3335 3336 3337 3338 3339 3340
        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
            Py_UCS4 ch2 = *s;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
                s++;
                size--;
            }
3341
        }
3342 3343 3344
#endif
        STORECHAR(ch);
    }
3345 3346

  done:
3347
    return v;
3348 3349 3350 3351 3352 3353 3354 3355 3356 3357
#undef STORECHAR
}

/* Encode a Unicode object as UTF-32 bytes.

   The work is delegated to PyUnicode_EncodeUTF32() with no error handler
   name and byteorder 0, the mode in which that encoder writes a BOM.
   A non-Unicode argument is rejected with PyErr_BadArgument() and NULL
   is returned. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);

    PyErr_BadArgument();
    return NULL;
}

3363 3364
/* --- UTF-16 Codec ------------------------------------------------------- */

3365 3366
PyObject *
PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
3367 3368 3369
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
3370 3371 3372 3373
{
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
}

3374 3375 3376 3377 3378 3379
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
*/
3381 3382 3383 3384 3385 3386 3387 3388 3389 3390
/* FAST_CHAR_MASK has the top bit (0x8000) of every 16-bit unit of a C
   'long' set.  SWAPPED_FAST_CHAR_MASK has bit 0x0080 of every unit set,
   i.e. the bit that becomes the top bit once the two bytes of the unit
   are swapped.  Only 4-byte and 8-byte longs are supported; any other
   size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif

3391 3392
/* Decode `size` bytes of UTF-16 data starting at `s` into a new Unicode
   object.

   errors    - name of the error handling scheme; it is handed to the
               decoding error callback when malformed input is found.
   byteorder - optional in/out parameter.  On entry *byteorder selects
               the byte order: -1 little endian, 1 big endian, 0 look for
               a leading BOM and otherwise use the native order.  On
               success the byte order value in effect (updated from a
               BOM, if one was found) is written back.
   consumed  - if non-NULL the decoder works incrementally: incomplete
               data at the end of the input (an odd trailing byte, or a
               surrogate at the very end whose partner has not arrived
               yet) is not an error.  It is left undecoded and *consumed
               receives the number of bytes that were actually used.

   Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e, *aligned_end;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering = 0;
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        /* Nothing to decode, but an incremental caller must still be
           told that no input was used. */
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        if (size >= 2) {
            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
            if (bom == 0xFEFF) {
                q += 2;
                bo = -1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = 1;
            }
#else
            if (bom == 0xFEFF) {
                q += 2;
                bo = 1;
            }
            else if (bom == 0xFFFE) {
                q += 2;
                bo = -1;
            }
#endif
        }
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    native_ordering = ilo < ihi;
#else
    native_ordering = ilo > ihi;
#endif

    /* Last position at which a whole, aligned C 'long' can be read. */
    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    while (1) {
        Py_UNICODE ch;
        if (e - q < 2) {
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                break;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            outpos = p - PyUnicode_AS_UNICODE(unicode);
            goto utf16Error;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        }
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (!((size_t) q & LONG_PTR_MASK)) {
            /* Fast path for runs of non-surrogate chars. */
            register const unsigned char *_q = q;
            Py_UNICODE *_p = p;
            if (native_ordering) {
                /* Native ordering is simple: as long as the input cannot
                   possibly contain a surrogate char, do an unrolled copy
                   of several 16-bit code points to the target object.
                   The non-surrogate check is done on several input bytes
                   at a time (as many as a C 'long' can contain). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & FAST_CHAR_MASK)
                        break;
                    _p[0] = ((unsigned short *) _q)[0];
                    _p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
                    _p[2] = ((unsigned short *) _q)[2];
                    _p[3] = ((unsigned short *) _q)[3];
#endif
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            else {
                /* Byteswapped ordering is similar, but we must decompose
                   the copy bytewise, and take care of zero'ing out the
                   upper bytes if the target object is in 32-bit units
                   (that is, in UCS-4 builds). */
                while (_q < aligned_end) {
                    unsigned long data = * (unsigned long *) _q;
                    if (data & SWAPPED_FAST_CHAR_MASK)
                        break;
                    /* Zero upper bytes in UCS-4 builds */
#if (Py_UNICODE_SIZE > 2)
                    _p[0] = 0;
                    _p[1] = 0;
#if (SIZEOF_LONG == 8)
                    _p[2] = 0;
                    _p[3] = 0;
#endif
#endif
                    /* Issue #4916; UCS-4 builds on big endian machines must
                       fill the two last bytes of each 4-byte unit. */
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
# define OFF 2
#else
# define OFF 0
#endif
                    ((unsigned char *) _p)[OFF + 1] = _q[0];
                    ((unsigned char *) _p)[OFF + 0] = _q[1];
                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
#undef OFF
                    _q += SIZEOF_LONG;
                    _p += SIZEOF_LONG / 2;
                }
            }
            p = _p;
            q = _q;
            /* The fast path may have eaten everything; let the check at
               the top of the loop decide what to do next. */
            if (e - q < 2)
                continue;
        }
        ch = (q[ihi] << 8) | q[ilo];

        q += 2;

        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (e - q < 2) {
            if (consumed) {
                /* Incremental decoding: the second half of the pair may
                   simply not have arrived yet.  Un-read the surrogate so
                   that *consumed stops in front of it and the caller can
                   retry once more data is available. */
                q -= 2;
                break;
            }
            errmsg = "unexpected end of data";
            startinpos = (((const char *)q) - 2) - starts;
            endinpos = ((const char *)e) - starts;
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                /* narrow build: keep the pair as two code units */
                *p++ = ch;
                *p++ = ch2;
#else
                /* wide build: combine the pair into one code point */
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                errmsg = "illegal UTF-16 surrogate";
                startinpos = (((const char *)q)-4)-starts;
                endinpos = startinpos+2;
                goto utf16Error;
            }

        }
        /* A trail surrogate without a preceding lead surrogate. */
        errmsg = "illegal encoding";
        startinpos = (((const char *)q)-2)-starts;
        endinpos = startinpos+2;
        /* Fall through to report the error */

      utf16Error:
        outpos = p - PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors,
                &errorHandler,
                "utf16", errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &unicode,
                &outpos,
                &p))
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           changed the input object. */
        aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
    }

    if (byteorder)
        *byteorder = bo;

    if (consumed)
        *consumed = (const char *)q-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_DECREF(unicode);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}

3647 3648 3649
/* The surrogate-detection masks are only used by the UTF-16 decoder
   above; undefine them so they do not leak into the rest of the file. */
#undef FAST_CHAR_MASK
#undef SWAPPED_FAST_CHAR_MASK

3650 3651
PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
3652 3653 3654
                      Py_ssize_t size,
                      const char *errors,
                      int byteorder)
3655
{
3656
    PyObject *v;
3657
    unsigned char *p;
Neal Norwitz's avatar
Neal Norwitz committed
3658
    Py_ssize_t nsize, bytesize;
3659
#ifdef Py_UNICODE_WIDE
Neal Norwitz's avatar
Neal Norwitz committed
3660
    Py_ssize_t i, pairs;
3661 3662 3663
#else
    const int pairs = 0;
#endif
3664 3665 3666 3667 3668 3669 3670
    /* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif

Benjamin Peterson's avatar
Benjamin Peterson committed
3671 3672 3673 3674 3675
#define STORECHAR(CH)                           \
    do {                                        \
        p[ihi] = ((CH) >> 8) & 0xff;            \
        p[ilo] = (CH) & 0xff;                   \
        p += 2;                                 \
3676
    } while(0)
3677

3678
#ifdef Py_UNICODE_WIDE
3679
    for (i = pairs = 0; i < size; i++)
Benjamin Peterson's avatar
Benjamin Peterson committed
3680 3681
        if (s[i] >= 0x10000)
            pairs++;
3682
#endif
Neal Norwitz's avatar
Neal Norwitz committed
3683 3684 3685
    /* 2 * (size + pairs + (byteorder == 0)) */
    if (size > PY_SSIZE_T_MAX ||
        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson's avatar
Benjamin Peterson committed
3686
        return PyErr_NoMemory();
Neal Norwitz's avatar
Neal Norwitz committed
3687 3688 3689
    nsize = size + pairs + (byteorder == 0);
    bytesize = nsize * 2;
    if (bytesize / 2 != nsize)
Benjamin Peterson's avatar
Benjamin Peterson committed
3690
        return PyErr_NoMemory();
3691
    v = PyBytes_FromStringAndSize(NULL, bytesize);
3692 3693 3694
    if (v == NULL)
        return NULL;

3695
    p = (unsigned char *)PyBytes_AS_STRING(v);
3696
    if (byteorder == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
3697
        STORECHAR(0xFEFF);
3698
    if (size == 0)
3699
        goto done;
3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711

    if (byteorder == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (byteorder == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }

3712
    while (size-- > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
3713 3714
        Py_UNICODE ch = *s++;
        Py_UNICODE ch2 = 0;
3715
#ifdef Py_UNICODE_WIDE
Benjamin Peterson's avatar
Benjamin Peterson committed
3716 3717 3718 3719
        if (ch >= 0x10000) {
            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
            ch  = 0xD800 | ((ch-0x10000) >> 10);
        }
3720
#endif
3721 3722 3723
        STORECHAR(ch);
        if (ch2)
            STORECHAR(ch2);
3724
    }
3725 3726

  done:
3727
    return v;
3728
#undef STORECHAR
3729 3730 3731 3732 3733 3734 3735 3736 3737
}

/* Encode a Unicode object as UTF-16 bytes.

   The work is delegated to PyUnicode_EncodeUTF16() with no error handler
   name and byteorder 0, the mode in which that encoder writes a BOM.
   A non-Unicode argument is rejected with PyErr_BadArgument() and NULL
   is returned. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);

    PyErr_BadArgument();
    return NULL;
}

/* --- Unicode Escape Codec ----------------------------------------------- */

3745
/* Cached pointer to the unicodedata name-lookup C API.  It starts out
   NULL and is filled in by PyUnicode_DecodeUnicodeEscape(), through
   PyCapsule_Import(), the first time a \N{...} escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3746

3747
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
3748 3749
                                        Py_ssize_t size,
                                        const char *errors)
3750
{
3751
    const char *starts = s;
Martin v. Löwis's avatar
Martin v. Löwis committed
3752 3753 3754
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
3755
    int i;
3756
    PyUnicodeObject *v;
3757
    Py_UNICODE *p;
3758
    const char *end;
3759 3760
    char* message;
    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3761 3762
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
3763

3764 3765
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
3766 3767 3768
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
3769 3770 3771 3772 3773
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
3774

3775
    p = PyUnicode_AS_UNICODE(v);
3776
    end = s + size;
3777

3778 3779
    while (s < end) {
        unsigned char c;
3780
        Py_UNICODE x;
3781
        int digits;
3782 3783 3784

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
3785
            *p++ = (unsigned char) *s++;
3786 3787 3788
            continue;
        }

3789
        startinpos = s-starts;
3790 3791
        /* \ - Escapes */
        s++;
3792 3793 3794 3795
        c = *s++;
        if (s > end)
            c = '\0'; /* Invalid after \ */
        switch (c) {
3796

Benjamin Peterson's avatar
Benjamin Peterson committed
3797
            /* \x escapes */
3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809
        case '\n': break;
        case '\\': *p++ = '\\'; break;
        case '\'': *p++ = '\''; break;
        case '\"': *p++ = '\"'; break;
        case 'b': *p++ = '\b'; break;
        case 'f': *p++ = '\014'; break; /* FF */
        case 't': *p++ = '\t'; break;
        case 'n': *p++ = '\n'; break;
        case 'r': *p++ = '\r'; break;
        case 'v': *p++ = '\013'; break; /* VT */
        case 'a': *p++ = '\007'; break; /* BEL, not classic C */

Benjamin Peterson's avatar
Benjamin Peterson committed
3810
            /* \OOO (octal) escapes */
3811 3812
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
Guido van Rossum's avatar
Guido van Rossum committed
3813
            x = s[-1] - '0';
3814
            if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum's avatar
Guido van Rossum committed
3815
                x = (x<<3) + *s++ - '0';
3816
                if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum's avatar
Guido van Rossum committed
3817
                    x = (x<<3) + *s++ - '0';
3818
            }
Guido van Rossum's avatar
Guido van Rossum committed
3819
            *p++ = x;
3820 3821
            break;

Benjamin Peterson's avatar
Benjamin Peterson committed
3822 3823
            /* hex escapes */
            /* \xXX */
3824
        case 'x':
3825 3826 3827
            digits = 2;
            message = "truncated \\xXX escape";
            goto hexescape;
3828

Benjamin Peterson's avatar
Benjamin Peterson committed
3829
            /* \uXXXX */
3830
        case 'u':
3831 3832 3833
            digits = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;
3834

Benjamin Peterson's avatar
Benjamin Peterson committed
3835
            /* \UXXXXXXXX */
3836
        case 'U':
3837 3838 3839 3840
            digits = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            chr = 0;
3841 3842 3843 3844
            outpos = p-PyUnicode_AS_UNICODE(v);
            if (s+digits>end) {
                endinpos = size;
                if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
3845 3846 3847 3848
                        errors, &errorHandler,
                        "unicodeescape", "end of string in escape sequence",
                        &starts, &end, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p))
3849 3850 3851 3852
                    goto onError;
                goto nextByte;
            }
            for (i = 0; i < digits; ++i) {
3853
                c = (unsigned char) s[i];
3854
                if (!Py_ISXDIGIT(c)) {
3855 3856
                    endinpos = (s+i+1)-starts;
                    if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
3857 3858 3859 3860
                            errors, &errorHandler,
                            "unicodeescape", message,
                            &starts, &end, &startinpos, &endinpos, &exc, &s,
                            &v, &outpos, &p))
3861
                        goto onError;
3862
                    goto nextByte;
3863 3864 3865 3866 3867 3868 3869 3870 3871 3872
                }
                chr = (chr<<4) & ~0xF;
                if (c >= '0' && c <= '9')
                    chr += c - '0';
                else if (c >= 'a' && c <= 'f')
                    chr += 10 + c - 'a';
                else
                    chr += 10 + c - 'A';
            }
            s += i;
3873
            if (chr == 0xffffffff && PyErr_Occurred())
3874 3875 3876
                /* _decoding_error will have already written into the
                   target buffer. */
                break;
3877 3878 3879 3880 3881 3882
        store:
            /* when we get here, chr is a 32-bit unicode character */
            if (chr <= 0xffff)
                /* UCS-2 character */
                *p++ = (Py_UNICODE) chr;
            else if (chr <= 0x10ffff) {
3883
                /* UCS-4 character. Either store directly, or as
Walter Dörwald's avatar
Walter Dörwald committed
3884
                   surrogate pair. */
3885
#ifdef Py_UNICODE_WIDE
3886 3887
                *p++ = chr;
#else
3888 3889
                chr -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3890
                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3891
#endif
3892
            } else {
3893 3894 3895
                endinpos = s-starts;
                outpos = p-PyUnicode_AS_UNICODE(v);
                if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
3896 3897 3898 3899
                        errors, &errorHandler,
                        "unicodeescape", "illegal Unicode character",
                        &starts, &end, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p))
3900 3901 3902
                    goto onError;
            }
            break;
3903

Benjamin Peterson's avatar
Benjamin Peterson committed
3904
            /* \N{name} */
3905
        case 'N':
3906
            message = "malformed \\N character escape";
3907
            if (ucnhash_CAPI == NULL) {
3908
                /* load the unicode data module */
3909
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3910
                if (ucnhash_CAPI == NULL)
3911
                    goto ucnhashError;
3912
            }
3913
            if (*s == '{') {
3914
                const char *start = s+1;
3915
                /* look for the closing brace */
3916 3917 3918 3919 3920 3921
                while (*s != '}' && s < end)
                    s++;
                if (s > start && s < end && *s == '}') {
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
3922
                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3923
                        goto store;
3924 3925
                }
            }
3926 3927 3928
            endinpos = s-starts;
            outpos = p-PyUnicode_AS_UNICODE(v);
            if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
3929 3930 3931 3932
                    errors, &errorHandler,
                    "unicodeescape", message,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p))
3933
                goto onError;
3934 3935 3936
            break;

        default:
Walter Dörwald's avatar
Walter Dörwald committed
3937
            if (s > end) {
3938 3939 3940 3941 3942
                message = "\\ at end of string";
                s--;
                endinpos = s-starts;
                outpos = p-PyUnicode_AS_UNICODE(v);
                if (unicode_decode_call_errorhandler(
Benjamin Peterson's avatar
Benjamin Peterson committed
3943 3944 3945 3946
                        errors, &errorHandler,
                        "unicodeescape", message,
                        &starts, &end, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p))
Walter Dörwald's avatar
Walter Dörwald committed
3947 3948 3949 3950 3951 3952
                    goto onError;
            }
            else {
                *p++ = '\\';
                *p++ = (unsigned char)s[-1];
            }
3953 3954
            break;
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
3955
      nextByte:
3956
        ;
3957
    }
3958
    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3959
        goto onError;
3960 3961
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
3962
    return (PyObject *)v;
Walter Dörwald's avatar
Walter Dörwald committed
3963

Benjamin Peterson's avatar
Benjamin Peterson committed
3964
  ucnhashError:
3965 3966 3967 3968
    PyErr_SetString(
        PyExc_UnicodeError,
        "\\N escapes not supported (can't load unicodedata module)"
        );
3969
    Py_XDECREF(v);
3970 3971
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
3972 3973
    return NULL;

Benjamin Peterson's avatar
Benjamin Peterson committed
3974
  onError:
3975
    Py_XDECREF(v);
3976 3977
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
3978 3979 3980 3981 3982 3983 3984 3985 3986 3987
    return NULL;
}

/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/

3988
/* Return a pointer to the first occurrence of `ch` among the first
   `size` units of `s`, or NULL if there is none.  Like wcschr, but the
   search is bounded by `size` and does not stop at NUL characters. */
Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
                                             Py_ssize_t size,
                                             Py_UNICODE ch)
{
    Py_ssize_t pos;

    for (pos = 0; pos < size; pos++) {
        if (s[pos] == ch)
            return s + pos;
    }

    return NULL;
}
4002

4003 4004 4005
/* Lowercase hexadecimal digits, indexed by nibble value (0-15). */
static const char *hexdigits = "0123456789abcdef";

PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4006
                                        Py_ssize_t size)
4007
{
4008
    PyObject *repr;
4009 4010
    char *p;

Neal Norwitz's avatar
Neal Norwitz committed
4011 4012 4013 4014 4015 4016
#ifdef Py_UNICODE_WIDE
    const Py_ssize_t expandsize = 10;
#else
    const Py_ssize_t expandsize = 6;
#endif

4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034
    /* XXX(nnorwitz): rather than over-allocating, it would be
       better to choose a different scheme.  Perhaps scan the
       first N-chars of the string and allocate based on that size.
    */
    /* Initial allocation is based on the longest-possible unichr
       escape.

       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       unichr, so in this case it's the longest unichr escape. In
       narrow (UTF-16) builds this is five chars per source unichr
       since there are two unichrs in the surrogate pair, so in narrow
       (UTF-16) builds it's not the longest unichr escape.

       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       so in the narrow (UTF-16) build case it's the longest unichr
       escape.
    */

4035 4036 4037
    if (size == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

Neal Norwitz's avatar
Neal Norwitz committed
4038
    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson's avatar
Benjamin Peterson committed
4039
        return PyErr_NoMemory();
Neal Norwitz's avatar
Neal Norwitz committed
4040

4041
    repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson's avatar
Benjamin Peterson committed
4042 4043 4044
                                     2
                                     + expandsize*size
                                     + 1);
4045 4046 4047
    if (repr == NULL)
        return NULL;

4048
    p = PyBytes_AS_STRING(repr);
4049 4050 4051

    while (size-- > 0) {
        Py_UNICODE ch = *s++;
4052

4053 4054
        /* Escape backslashes */
        if (ch == '\\') {
4055 4056
            *p++ = '\\';
            *p++ = (char) ch;
4057
            continue;
4058
        }
4059

4060
#ifdef Py_UNICODE_WIDE
4061 4062 4063 4064
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
4065 4066 4067 4068 4069 4070 4071 4072
            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
            *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson's avatar
Benjamin Peterson committed
4073
            continue;
4074
        }
4075
#else
Benjamin Peterson's avatar
Benjamin Peterson committed
4076 4077 4078 4079 4080 4081 4082
        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
        else if (ch >= 0xD800 && ch < 0xDC00) {
            Py_UNICODE ch2;
            Py_UCS4 ucs;

            ch2 = *s++;
            size--;
4083
            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099
                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
                *p++ = '\\';
                *p++ = 'U';
                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
                *p++ = hexdigits[ucs & 0x0000000F];
                continue;
            }
            /* Fall through: isolated surrogates are copied as-is */
            s--;
            size++;
4100
        }
4101
#endif
4102

4103
        /* Map 16-bit characters to '\uxxxx' */
4104
        if (ch >= 256) {
4105 4106
            *p++ = '\\';
            *p++ = 'u';
4107 4108 4109 4110
            *p++ = hexdigits[(ch >> 12) & 0x000F];
            *p++ = hexdigits[(ch >> 8) & 0x000F];
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
4111
        }
4112

4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125
        /* Map special whitespace to '\t', \n', '\r' */
        else if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (ch == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (ch == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }
4126

4127
        /* Map non-printable US ASCII to '\xhh' */
4128
        else if (ch < ' ' || ch >= 0x7F) {
4129
            *p++ = '\\';
4130
            *p++ = 'x';
4131 4132
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
4133
        }
4134

4135 4136 4137 4138 4139
        /* Copy everything else as-is */
        else
            *p++ = (char) ch;
    }

4140 4141 4142 4143
    assert(p - PyBytes_AS_STRING(repr) > 0);
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
        return NULL;
    return repr;
4144 4145
}

4146
/* Encode a unicode object with the "unicode-escape" encoder.

   Sets a bad-argument error and returns NULL when `unicode` is not a
   unicode object; otherwise returns whatever
   PyUnicode_EncodeUnicodeEscape() produces for the object's buffer. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    /* Type-check first: the accessor macros below are only used on
       objects that passed PyUnicode_Check(). */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Hand the raw buffer and its length to the buffer-based encoder and
       pass its result straight through. */
    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                         PyUnicode_GET_SIZE(unicode));
}

/* --- Raw Unicode Escape Codec ------------------------------------------- */

PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4161 4162
                                           Py_ssize_t size,
                                           const char *errors)
4163
{
4164
    const char *starts = s;
Martin v. Löwis's avatar
Martin v. Löwis committed
4165 4166 4167
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
4168
    PyUnicodeObject *v;
4169
    Py_UNICODE *p;
4170 4171
    const char *end;
    const char *bs;
4172 4173
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
4174

4175 4176
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
4177 4178
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
4179 4180
    v = _PyUnicode_New(size);
    if (v == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
4181
        goto onError;
4182
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
4183
        return (PyObject *)v;
4184
    p = PyUnicode_AS_UNICODE(v);
4185 4186
    end = s + size;
    while (s < end) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4187 4188 4189
        unsigned char c;
        Py_UCS4 x;
        int i;
4190
        int count;
4191

Benjamin Peterson's avatar
Benjamin Peterson committed
4192 4193 4194 4195 4196 4197
        /* Non-escape characters are interpreted as Unicode ordinals */
        if (*s != '\\') {
            *p++ = (unsigned char)*s++;
            continue;
        }
        startinpos = s-starts;
4198

Benjamin Peterson's avatar
Benjamin Peterson committed
4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212
        /* \u-escapes are only interpreted iff the number of leading
           backslashes if odd */
        bs = s;
        for (;s < end;) {
            if (*s != '\\')
                break;
            *p++ = (unsigned char)*s++;
        }
        if (((s - bs) & 1) == 0 ||
            s >= end ||
            (*s != 'u' && *s != 'U')) {
            continue;
        }
        p--;
4213
        count = *s=='u' ? 4 : 8;
Benjamin Peterson's avatar
Benjamin Peterson committed
4214
        s++;
4215

Benjamin Peterson's avatar
Benjamin Peterson committed
4216 4217 4218 4219
        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
        outpos = p-PyUnicode_AS_UNICODE(v);
        for (x = 0, i = 0; i < count; ++i, ++s) {
            c = (unsigned char)*s;
4220
            if (!Py_ISXDIGIT(c)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236
                endinpos = s-starts;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "rawunicodeescape", "truncated \\uXXXX",
                        &starts, &end, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p))
                    goto onError;
                goto nextByte;
            }
            x = (x<<4) & ~0xF;
            if (c >= '0' && c <= '9')
                x += c - '0';
            else if (c >= 'a' && c <= 'f')
                x += 10 + c - 'a';
            else
                x += 10 + c - 'A';
4237
        }
Christian Heimes's avatar
Christian Heimes committed
4238
        if (x <= 0xffff)
Benjamin Peterson's avatar
Benjamin Peterson committed
4239 4240
            /* UCS-2 character */
            *p++ = (Py_UNICODE) x;
Christian Heimes's avatar
Christian Heimes committed
4241
        else if (x <= 0x10ffff) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4242 4243
            /* UCS-4 character. Either store directly, or as
               surrogate pair. */
Christian Heimes's avatar
Christian Heimes committed
4244
#ifdef Py_UNICODE_WIDE
Benjamin Peterson's avatar
Benjamin Peterson committed
4245
            *p++ = (Py_UNICODE) x;
Christian Heimes's avatar
Christian Heimes committed
4246
#else
Benjamin Peterson's avatar
Benjamin Peterson committed
4247 4248 4249
            x -= 0x10000L;
            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimes's avatar
Christian Heimes committed
4250 4251 4252 4253
#endif
        } else {
            endinpos = s-starts;
            outpos = p-PyUnicode_AS_UNICODE(v);
4254 4255 4256
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson's avatar
Benjamin Peterson committed
4257 4258 4259
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p))
                goto onError;
4260
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
4261 4262
      nextByte:
        ;
4263
    }
4264
    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
4265
        goto onError;
4266 4267
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
4268
    return (PyObject *)v;
4269

Benjamin Peterson's avatar
Benjamin Peterson committed
4270
  onError:
4271
    Py_XDECREF(v);
4272 4273
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
4274 4275 4276 4277
    return NULL;
}

PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4278
                                           Py_ssize_t size)
4279
{
4280
    PyObject *repr;
4281 4282 4283
    char *p;
    char *q;

4284
#ifdef Py_UNICODE_WIDE
Neal Norwitz's avatar
Neal Norwitz committed
4285
    const Py_ssize_t expandsize = 10;
4286
#else
Neal Norwitz's avatar
Neal Norwitz committed
4287
    const Py_ssize_t expandsize = 6;
4288
#endif
4289

Neal Norwitz's avatar
Neal Norwitz committed
4290
    if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson's avatar
Benjamin Peterson committed
4291
        return PyErr_NoMemory();
4292

4293
    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4294 4295
    if (repr == NULL)
        return NULL;
4296
    if (size == 0)
4297
        return repr;
4298

4299
    p = q = PyBytes_AS_STRING(repr);
4300 4301
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
4302
#ifdef Py_UNICODE_WIDE
Benjamin Peterson's avatar
Benjamin Peterson committed
4303 4304
        /* Map 32-bit characters to '\Uxxxxxxxx' */
        if (ch >= 0x10000) {
4305 4306
            *p++ = '\\';
            *p++ = 'U';
4307 4308 4309 4310 4311 4312 4313 4314
            *p++ = hexdigits[(ch >> 28) & 0xf];
            *p++ = hexdigits[(ch >> 24) & 0xf];
            *p++ = hexdigits[(ch >> 20) & 0xf];
            *p++ = hexdigits[(ch >> 16) & 0xf];
            *p++ = hexdigits[(ch >> 12) & 0xf];
            *p++ = hexdigits[(ch >> 8) & 0xf];
            *p++ = hexdigits[(ch >> 4) & 0xf];
            *p++ = hexdigits[ch & 15];
4315
        }
4316
        else
Christian Heimes's avatar
Christian Heimes committed
4317
#else
Benjamin Peterson's avatar
Benjamin Peterson committed
4318 4319 4320 4321 4322 4323 4324
            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
            if (ch >= 0xD800 && ch < 0xDC00) {
                Py_UNICODE ch2;
                Py_UCS4 ucs;

                ch2 = *s++;
                size--;
4325
                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342
                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
                    *p++ = '\\';
                    *p++ = 'U';
                    *p++ = hexdigits[(ucs >> 28) & 0xf];
                    *p++ = hexdigits[(ucs >> 24) & 0xf];
                    *p++ = hexdigits[(ucs >> 20) & 0xf];
                    *p++ = hexdigits[(ucs >> 16) & 0xf];
                    *p++ = hexdigits[(ucs >> 12) & 0xf];
                    *p++ = hexdigits[(ucs >> 8) & 0xf];
                    *p++ = hexdigits[(ucs >> 4) & 0xf];
                    *p++ = hexdigits[ucs & 0xf];
                    continue;
                }
                /* Fall through: isolated surrogates are copied as-is */
                s--;
                size++;
            }
4343
#endif
Benjamin Peterson's avatar
Benjamin Peterson committed
4344 4345
        /* Map 16-bit characters to '\uxxxx' */
        if (ch >= 256) {
4346 4347
            *p++ = '\\';
            *p++ = 'u';
4348 4349 4350 4351
            *p++ = hexdigits[(ch >> 12) & 0xf];
            *p++ = hexdigits[(ch >> 8) & 0xf];
            *p++ = hexdigits[(ch >> 4) & 0xf];
            *p++ = hexdigits[ch & 15];
4352
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
4353 4354
        /* Copy everything else as-is */
        else
4355 4356
            *p++ = (char) ch;
    }
4357 4358
    size = p - q;

4359 4360 4361 4362
    assert(size > 0);
    if (_PyBytes_Resize(&repr, size) < 0)
        return NULL;
    return repr;
4363 4364 4365 4366
}

/* Encode a unicode object with the "raw-unicode-escape" encoder.

   Sets a bad-argument error and returns NULL when `unicode` is not a
   unicode object; otherwise returns whatever
   PyUnicode_EncodeRawUnicodeEscape() produces for the object's buffer. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    /* Only real unicode objects may be passed to the accessor macros. */
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Forward buffer and length; the encoder's result is returned
       unchanged. */
    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                            PyUnicode_GET_SIZE(unicode));
}
4377 4378 4379 4380

/* --- Unicode Internal Codec ------------------------------------------- */

PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4381 4382
                                           Py_ssize_t size,
                                           const char *errors)
4383 4384
{
    const char *starts = s;
Martin v. Löwis's avatar
Martin v. Löwis committed
4385 4386 4387
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
4388 4389 4390 4391 4392 4393 4394
    PyUnicodeObject *v;
    Py_UNICODE *p;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

4395 4396 4397 4398
#ifdef Py_UNICODE_WIDE
    Py_UNICODE unimax = PyUnicode_GetMax();
#endif

4399
    /* XXX overflow detection missing */
4400 4401
    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
    if (v == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
4402
        goto onError;
4403
    if (PyUnicode_GetSize((PyObject *)v) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
4404
        return (PyObject *)v;
4405 4406 4407 4408
    p = PyUnicode_AS_UNICODE(v);
    end = s + size;

    while (s < end) {
4409
        memcpy(p, s, sizeof(Py_UNICODE));
4410 4411 4412
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (
Benjamin Peterson's avatar
Benjamin Peterson committed
4413
#ifdef Py_UNICODE_WIDE
4414
            *p > unimax || *p < 0 ||
Benjamin Peterson's avatar
Benjamin Peterson committed
4415
#endif
4416 4417
            end-s < Py_UNICODE_SIZE
            )
Benjamin Peterson's avatar
Benjamin Peterson committed
4418
        {
4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431
            startinpos = s - starts;
            if (end-s < Py_UNICODE_SIZE) {
                endinpos = end-starts;
                reason = "truncated input";
            }
            else {
                endinpos = s - starts + Py_UNICODE_SIZE;
                reason = "illegal code point (> 0x10FFFF)";
            }
            outpos = p - PyUnicode_AS_UNICODE(v);
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "unicode_internal", reason,
4432
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4433
                    &v, &outpos, &p)) {
4434 4435 4436 4437 4438 4439 4440 4441 4442
                goto onError;
            }
        }
        else {
            p++;
            s += Py_UNICODE_SIZE;
        }
    }

4443
    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4444 4445 4446 4447 4448
        goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

Benjamin Peterson's avatar
Benjamin Peterson committed
4449
  onError:
4450 4451 4452 4453 4454
    Py_XDECREF(v);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
4455 4456 4457 4458

/* --- Latin-1 Codec ------------------------------------------------------ */

PyObject *PyUnicode_DecodeLatin1(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4459 4460
                                 Py_ssize_t size,
                                 const char *errors)
4461 4462 4463
{
    PyUnicodeObject *v;
    Py_UNICODE *p;
4464
    const char *e, *unrolled_end;
4465

4466
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4467
    if (size == 1) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4468 4469
        Py_UNICODE r = *(unsigned char*)s;
        return PyUnicode_FromUnicode(&r, 1);
4470 4471
    }

4472 4473
    v = _PyUnicode_New(size);
    if (v == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
4474
        goto onError;
4475
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
4476
        return (PyObject *)v;
4477
    p = PyUnicode_AS_UNICODE(v);
4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491
    e = s + size;
    /* Unrolling the copy makes it much faster by reducing the looping
       overhead. This is similar to what many memcpy() implementations do. */
    unrolled_end = e - 4;
    while (s < unrolled_end) {
        p[0] = (unsigned char) s[0];
        p[1] = (unsigned char) s[1];
        p[2] = (unsigned char) s[2];
        p[3] = (unsigned char) s[3];
        s += 4;
        p += 4;
    }
    while (s < e)
        *p++ = (unsigned char) *s++;
4492
    return (PyObject *)v;
4493

Benjamin Peterson's avatar
Benjamin Peterson committed
4494
  onError:
4495 4496 4497 4498
    Py_XDECREF(v);
    return NULL;
}

4499 4500
/* Create or adjust a UnicodeEncodeError.

   If *exceptionObject is NULL, a new exception is created from the given
   arguments and stored there.  Otherwise the existing exception gets its
   start, end and reason updated; if any of those updates fails, the
   exception is released and *exceptionObject is reset to NULL. */
static void make_encode_exception(PyObject **exceptionObject,
                                  const char *encoding,
                                  const Py_UNICODE *unicode, Py_ssize_t size,
                                  Py_ssize_t startpos, Py_ssize_t endpos,
                                  const char *reason)
{
    /* First error: build a fresh exception object. */
    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeEncodeError_Create(
            encoding, unicode, size, startpos, endpos, reason);
        return;
    }

    /* Reuse the existing exception.  The setters run in order and the
       chain stops at the first one that reports failure. */
    if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos) ||
        PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos) ||
        PyUnicodeEncodeError_SetReason(*exceptionObject, reason)) {
        Py_DECREF(*exceptionObject);
        *exceptionObject = NULL;
    }
}

/* Raise a UnicodeEncodeError.

   Builds (or updates) the exception in *exceptionObject via
   make_encode_exception() and, when that left a non-NULL exception,
   passes it to PyCodec_StrictErrors(). */
static void raise_encode_exception(PyObject **exceptionObject,
                                   const char *encoding,
                                   const Py_UNICODE *unicode, Py_ssize_t size,
                                   Py_ssize_t startpos, Py_ssize_t endpos,
                                   const char *reason)
{
    make_encode_exception(exceptionObject,
                          encoding, unicode, size, startpos, endpos, reason);

    /* Nothing to hand over if building the exception itself failed. */
    if (*exceptionObject == NULL)
        return;

    PyCodec_StrictErrors(*exceptionObject);
}

/* Error handling callback helper for the encoders.

   Looks up the handler named by `errors` (cached in *errorHandler),
   builds or updates the exception in *exceptionObject, calls the handler
   and validates its result.  The handler must return a
   (str/bytes, int) tuple; a negative position is taken relative to
   `size`, and the final position must lie in 0..size.

   On success the position is stored in *newpos and a new reference to the
   replacement object is returned, which the caller has to release.
   Returns NULL on any failure. */
static PyObject *unicode_encode_call_errorhandler(const char *errors,
                                                  PyObject **errorHandler,
                                                  const char *encoding, const char *reason,
                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
                                                  Py_ssize_t startpos, Py_ssize_t endpos,
                                                  Py_ssize_t *newpos)
{
    /* The first three characters are the PyArg_ParseTuple() format; the
       rest doubles as the TypeError message (hence `argparse + 3`). */
    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";

    PyObject *restuple;
    PyObject *resunicode;

    /* Resolve the handler once and keep it for subsequent calls. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            return NULL;
    }

    make_encode_exception(exceptionObject,
                          encoding, unicode, size, startpos, endpos, reason);
    if (*exceptionObject == NULL)
        return NULL;

    restuple = PyObject_CallFunctionObjArgs(
        *errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        return NULL;

    /* From here on every failure has to release `restuple`. */
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, argparse + 3);
        goto badResult;
    }
    if (!PyArg_ParseTuple(restuple, argparse,
                          &resunicode, newpos))
        goto badResult;
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
        PyErr_SetString(PyExc_TypeError, argparse + 3);
        goto badResult;
    }

    /* Negative positions count from the end of the input. */
    if (*newpos<0)
        *newpos = size+*newpos;
    if (*newpos<0 || *newpos>size) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
        goto badResult;
    }

    /* `resunicode` is borrowed from the tuple: take our own reference
       before the tuple goes away. */
    Py_INCREF(resunicode);
    Py_DECREF(restuple);
    return resunicode;

  badResult:
    Py_DECREF(restuple);
    return NULL;
}

4595
static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
4596 4597 4598
                                     Py_ssize_t size,
                                     const char *errors,
                                     int limit)
4599
{
4600 4601 4602 4603 4604 4605 4606 4607 4608 4609
    /* output object */
    PyObject *res;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer to the beginning of the unencodable characters */
    /* const Py_UNICODE *badp = NULL; */
    /* pointer into the output */
    char *str;
    /* current output position */
Martin v. Löwis's avatar
Martin v. Löwis committed
4610
    Py_ssize_t ressize;
4611 4612
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4613 4614 4615 4616 4617 4618 4619 4620
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
4621
    if (size == 0)
4622
        return PyBytes_FromStringAndSize(NULL, 0);
4623
    res = PyBytes_FromStringAndSize(NULL, size);
4624
    if (res == NULL)
4625
        return NULL;
4626
    str = PyBytes_AS_STRING(res);
4627 4628 4629
    ressize = size;

    while (p<endp) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4630
        Py_UNICODE c = *p;
4631

Benjamin Peterson's avatar
Benjamin Peterson committed
4632 4633 4634 4635 4636
        /* can we encode this? */
        if (c<limit) {
            /* no overflow check, because we know that the space is enough */
            *str++ = (char)c;
            ++p;
4637
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686
        else {
            Py_ssize_t unicodepos = p-startp;
            Py_ssize_t requiredsize;
            PyObject *repunicode;
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_ssize_t respos;
            Py_UNICODE *uni2;
            /* startpos for collecting unencodable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p;
            /* find all unecodable characters */
            while ((collend < endp) && ((*collend)>=limit))
                ++collend;
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                while (collstart++<collend)
                    *str++ = '?'; /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                respos = str - PyBytes_AS_STRING(res);
                /* determine replacement size (temporarily (mis)uses p) */
                for (p = collstart, repsize = 0; p < collend; ++p) {
                    if (*p<10)
                        repsize += 2+1+1;
                    else if (*p<100)
                        repsize += 2+2+1;
                    else if (*p<1000)
                        repsize += 2+3+1;
                    else if (*p<10000)
                        repsize += 2+4+1;
4687
#ifndef Py_UNICODE_WIDE
Benjamin Peterson's avatar
Benjamin Peterson committed
4688 4689
                    else
                        repsize += 2+5+1;
4690
#else
Benjamin Peterson's avatar
Benjamin Peterson committed
4691 4692 4693 4694 4695 4696
                    else if (*p<100000)
                        repsize += 2+5+1;
                    else if (*p<1000000)
                        repsize += 2+6+1;
                    else
                        repsize += 2+7+1;
4697
#endif
Benjamin Peterson's avatar
Benjamin Peterson committed
4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719
                }
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize))
                        goto onError;
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    str += sprintf(str, "&#%d;", (int)*p);
                }
                p = collend;
                break;
            default:
                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                              encoding, reason, startp, size, &exc,
                                                              collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
4720 4721 4722 4723 4724
                if (PyBytes_Check(repunicode)) {
                    /* Directly copy bytes result to output. */
                    repsize = PyBytes_Size(repunicode);
                    if (repsize > 1) {
                        /* Make room for all additional bytes. */
4725
                        respos = str - PyBytes_AS_STRING(res);
4726 4727 4728 4729
                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
                            Py_DECREF(repunicode);
                            goto onError;
                        }
4730
                        str = PyBytes_AS_STRING(res) + respos;
4731 4732 4733 4734 4735
                        ressize += repsize-1;
                    }
                    memcpy(str, PyBytes_AsString(repunicode), repsize);
                    str += repsize;
                    p = startp + newpos;
4736
                    Py_DECREF(repunicode);
4737
                    break;
4738
                }
Benjamin Peterson's avatar
Benjamin Peterson committed
4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767
                /* need more space? (at least enough for what we
                   have+the replacement+the rest of the string, so
                   we won't have to check space for encodable characters) */
                respos = str - PyBytes_AS_STRING(res);
                repsize = PyUnicode_GET_SIZE(repunicode);
                requiredsize = respos+repsize+(endp-collend);
                if (requiredsize > ressize) {
                    if (requiredsize<2*ressize)
                        requiredsize = 2*ressize;
                    if (_PyBytes_Resize(&res, requiredsize)) {
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    str = PyBytes_AS_STRING(res) + respos;
                    ressize = requiredsize;
                }
                /* check if there is anything unencodable in the replacement
                   and copy it to the output */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
                    c = *uni2;
                    if (c >= limit) {
                        raise_encode_exception(&exc, encoding, startp, size,
                                               unicodepos, unicodepos+1, reason);
                        Py_DECREF(repunicode);
                        goto onError;
                    }
                    *str = (char)c;
                }
                p = startp + newpos;
4768 4769 4770 4771
                Py_DECREF(repunicode);
            }
        }
    }
4772 4773 4774
    /* Resize if we allocated to much */
    size = str - PyBytes_AS_STRING(res);
    if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalotti's avatar
Alexandre Vassalotti committed
4775
        assert(size >= 0);
4776 4777 4778 4779 4780 4781 4782 4783
        if (_PyBytes_Resize(&res, size) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return res;

4784
  onError:
4785
    Py_XDECREF(res);
4786 4787
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
4788
    return NULL;
4789 4790
}

4791
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
4792 4793
                                 Py_ssize_t size,
                                 const char *errors)
4794 4795 4796 4797
{
    return unicode_encode_ucs1(p, size, errors, 256);
}

4798 4799 4800
/* Encode a unicode object as Latin-1, passing NULL as the error-handler
   name.

   Sets a bad-argument error and returns NULL when `unicode` is not a
   unicode object. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    const Py_UNICODE *buffer;
    Py_ssize_t length;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Pull buffer and length out of the (now verified) unicode object. */
    buffer = PyUnicode_AS_UNICODE(unicode);
    length = PyUnicode_GET_SIZE(unicode);
    return PyUnicode_EncodeLatin1(buffer, length, NULL);
}

/* --- 7-bit ASCII Codec -------------------------------------------------- */

PyObject *PyUnicode_DecodeASCII(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
4812 4813
                                Py_ssize_t size,
                                const char *errors)
4814
{
4815
    const char *starts = s;
4816 4817
    PyUnicodeObject *v;
    Py_UNICODE *p;
Martin v. Löwis's avatar
Martin v. Löwis committed
4818 4819 4820
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
4821 4822 4823
    const char *e;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
4824

4825
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4826
    if (size == 1 && *(unsigned char*)s < 128) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4827 4828
        Py_UNICODE r = *(unsigned char*)s;
        return PyUnicode_FromUnicode(&r, 1);
4829
    }
4830

4831 4832
    v = _PyUnicode_New(size);
    if (v == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
4833
        goto onError;
4834
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
4835
        return (PyObject *)v;
4836
    p = PyUnicode_AS_UNICODE(v);
4837 4838
    e = s + size;
    while (s < e) {
Benjamin Peterson's avatar
Benjamin Peterson committed
4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854
        register unsigned char c = (unsigned char)*s;
        if (c < 128) {
            *p++ = c;
            ++s;
        }
        else {
            startinpos = s-starts;
            endinpos = startinpos + 1;
            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p))
                goto onError;
        }
4855
    }
4856
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson's avatar
Benjamin Peterson committed
4857 4858
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
4859 4860
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
4861
    return (PyObject *)v;
4862

Benjamin Peterson's avatar
Benjamin Peterson committed
4863
  onError:
4864
    Py_XDECREF(v);
4865 4866
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
4867 4868 4869 4870
    return NULL;
}

PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
4871 4872
                                Py_ssize_t size,
                                const char *errors)
4873
{
4874
    return unicode_encode_ucs1(p, size, errors, 128);
4875 4876 4877 4878 4879
}

/* Encode a unicode object as ASCII, passing NULL as the error-handler
   name.

   Sets a bad-argument error and returns NULL when `unicode` is not a
   unicode object. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    const Py_UNICODE *buffer;
    Py_ssize_t length;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    /* Pull buffer and length out of the (now verified) unicode object. */
    buffer = PyUnicode_AS_UNICODE(unicode);
    length = PyUnicode_GET_SIZE(unicode);
    return PyUnicode_EncodeASCII(buffer, length, NULL);
}

4888
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum's avatar
Guido van Rossum committed
4889

4890
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum's avatar
Guido van Rossum committed
4891

4892
#if SIZEOF_INT < SIZEOF_SIZE_T
4893 4894 4895 4896 4897 4898
#define NEED_RETRY
#endif

/* XXX This code is limited to "true" double-byte encodings, as
   a) it assumes an incomplete character consists of a single byte, and
   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
   encodings, see IsDBCSLeadByteEx documentation. */
4900 4901 4902 4903 4904 4905

/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  A byte that IsDBCSLeadByte() accepts only counts
   when the character before it (found with CharPrev) is the byte itself,
   is not a lead byte, or starts exactly two bytes earlier. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return (curr - prev == 2);
}

/*
 * Decode one chunk of an MBCS string and append it to the unicode
 * object *v (a new object is created when *v is NULL).
 *
 * errors: NULL or "strict" selects MB_ERR_INVALID_CHARS, "ignore" selects
 *         no flags; any other value raises ValueError.
 * final:  when zero, a trailing lead byte is left unconverted.
 *
 * Returns the number of input bytes consumed on success, -1 (with an
 * exception set) on failure.
 */
static int
decode_mbcs(PyUnicodeObject **v,
            const char *s, /* MBCS string */
            int size, /* sizeof MBCS string */
            int final,
            const char *errors)
{
    DWORD flags;
    DWORD wlen = 0;          /* number of wide chars produced by this chunk */
    Py_ssize_t oldlen = 0;   /* length of *v before appending */

    assert(size >= 0);

    /* Translate the 'errors' argument into conversion flags */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        flags = MB_ERR_INVALID_CHARS;
    }
    else if (strcmp(errors, "ignore") == 0) {
        flags = 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* Leave a trailing lead byte for the next call unless this is the
       final chunk */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        size -= 1;

    /* First pass: ask how many wide characters are needed */
    if (size > 0) {
        wlen = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
        if (wlen == 0)
            goto mbcs_decode_error;
    }

    if (*v != NULL) {
        /* Extend the existing unicode object */
        oldlen = PyUnicode_GET_SIZE(*v);
        if (_PyUnicode_Resize(v, oldlen + wlen) < 0)
            return -1;
    }
    else {
        /* Create the unicode object */
        *v = _PyUnicode_New(wlen);
        if (*v == NULL)
            return -1;
    }

    /* Second pass: convert into the space just reserved */
    if (wlen > 0) {
        Py_UNICODE *dest = PyUnicode_AS_UNICODE(*v) + oldlen;
        if (MultiByteToWideChar(CP_ACP, flags, s, size, dest, wlen) == 0)
            goto mbcs_decode_error;
    }
    return size;

mbcs_decode_error:
    /* ERROR_NO_UNICODE_TRANSLATION becomes a UnicodeDecodeError; every
       other failure is reported as a generic Windows error. */
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally the reason would come from FormatMessage; this is the
           Windows 2000 English version of the message. */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    }
    else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
}

/* Decode `size` bytes of MBCS data at `s` into a new unicode object.
 *
 * When `consumed` is non-NULL it receives the total number of bytes
 * decoded and the input is treated as non-final (decode_mbcs() is called
 * with final == 0).  When NEED_RETRY is defined, inputs longer than
 * INT_MAX are fed to decode_mbcs() in several non-final chunks.
 *
 * Returns the new object, or NULL if decode_mbcs() reports an error.
 */
PyObject *
PyUnicode_DecodeMBCSStateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    PyUnicodeObject *v = NULL;
    int done;

    if (consumed != NULL)
        *consumed = 0;

    for (;;) {
#ifdef NEED_RETRY
        if (size > INT_MAX) {
            /* more data follows, so this chunk is never final */
            done = decode_mbcs(&v, s, INT_MAX, 0, errors);
        }
        else
#endif
        {
            done = decode_mbcs(&v, s, (int)size, consumed == NULL, errors);
        }

        if (done < 0) {
            Py_XDECREF(v);
            return NULL;
        }

        if (consumed != NULL)
            *consumed += done;

#ifdef NEED_RETRY
        if (size > INT_MAX) {
            /* advance past what was decoded and go around again */
            s += done;
            size -= done;
            continue;
        }
#endif
        break;
    }

    return (PyObject *)v;
}

5037
PyObject *PyUnicode_DecodeMBCS(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
5038 5039
                               Py_ssize_t size,
                               const char *errors)
5040
{
5041 5042 5043 5044 5045 5046 5047 5048
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}

/*
 * Convert unicode into string object (MBCS).
 * Returns 0 if succeed, -1 otherwise.
 */
static int encode_mbcs(PyObject **repr,
Benjamin Peterson's avatar
Benjamin Peterson committed
5049
                       const Py_UNICODE *p, /* unicode */
5050 5051
                       int size, /* size of unicode */
                       const char* errors)
5052
{
5053 5054 5055 5056 5057 5058
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;
5059

5060
    assert(size >= 0);
5061

5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075
    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        flags = WC_NO_BEST_FIT_CHARS;
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        pusedDefaultChar = NULL;
    } else {
         PyErr_Format(PyExc_ValueError,
                      "mbcs encoding does not support errors='%s'",
                      errors);
         return -1;
    }

5076
    /* First get the size of the result */
5077
    if (size > 0) {
5078 5079
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
Benjamin Peterson's avatar
Benjamin Peterson committed
5080 5081 5082 5083
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
5084 5085 5086 5087 5088
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
5089
    }
5090

5091
    if (*repr == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5092 5093 5094 5095
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
5096
        n = 0;
5097 5098
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
5099 5100 5101 5102
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
5103
    }
5104 5105

    /* Do the conversion */
5106
    if (size > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5107
        char *s = PyBytes_AS_STRING(*repr) + n;
5108 5109
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5110 5111 5112
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
5113 5114
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
5115 5116
    }
    return 0;
5117 5118 5119 5120 5121

mbcs_encode_error:
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
5122 5123 5124
}

PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
5125 5126
                               Py_ssize_t size,
                               const char *errors)
5127 5128 5129 5130 5131
{
    PyObject *repr = NULL;
    int ret;

#ifdef NEED_RETRY
Benjamin Peterson's avatar
Benjamin Peterson committed
5132
  retry:
5133
    if (size > INT_MAX)
5134
        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5135 5136
    else
#endif
5137
        ret = encode_mbcs(&repr, p, (int)size, errors);
5138 5139

    if (ret < 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5140 5141
        Py_XDECREF(repr);
        return NULL;
5142
    }
5143 5144 5145

#ifdef NEED_RETRY
    if (size > INT_MAX) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5146 5147 5148
        p += INT_MAX;
        size -= INT_MAX;
        goto retry;
5149 5150 5151
    }
#endif

5152 5153
    return repr;
}
Guido van Rossum's avatar
Guido van Rossum committed
5154

5155 5156 5157 5158 5159 5160 5161
/* Encode a unicode object with PyUnicode_EncodeMBCS(), passing NULL as
   the errors argument.  If `unicode` is not a unicode object,
   PyErr_BadArgument() is called and NULL is returned. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }

    PyErr_BadArgument();
    return NULL;
}

5166 5167
#undef NEED_RETRY

5168
#endif /* MS_WINDOWS */
5169

5170 5171 5172
/* --- Character Mapping Codec -------------------------------------------- */

PyObject *PyUnicode_DecodeCharmap(const char *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
5173 5174 5175
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
5176
{
5177
    const char *starts = s;
Martin v. Löwis's avatar
Martin v. Löwis committed
5178 5179 5180
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
5181
    const char *e;
5182 5183
    PyUnicodeObject *v;
    Py_UNICODE *p;
Martin v. Löwis's avatar
Martin v. Löwis committed
5184
    Py_ssize_t extrachars = 0;
5185 5186
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
5187
    Py_UNICODE *mapstring = NULL;
Martin v. Löwis's avatar
Martin v. Löwis committed
5188
    Py_ssize_t maplen = 0;
5189

5190 5191
    /* Default to Latin-1 */
    if (mapping == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5192
        return PyUnicode_DecodeLatin1(s, size, errors);
5193 5194 5195

    v = _PyUnicode_New(size);
    if (v == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5196
        goto onError;
5197
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
5198
        return (PyObject *)v;
5199
    p = PyUnicode_AS_UNICODE(v);
5200
    e = s + size;
5201
    if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5202 5203 5204 5205 5206
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */
5207

Benjamin Peterson's avatar
Benjamin Peterson committed
5208 5209
            if (ch < maplen)
                x = mapstring[ch];
5210

Benjamin Peterson's avatar
Benjamin Peterson committed
5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226
            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
5227
        }
5228 5229
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
5230 5231 5232
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;
5233

Benjamin Peterson's avatar
Benjamin Peterson committed
5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248
            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyLong_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }
5249

Benjamin Peterson's avatar
Benjamin Peterson committed
5250 5251 5252
            /* Apply mapping */
            if (PyLong_Check(x)) {
                long value = PyLong_AS_LONG(x);
5253
                if (value < 0 || value > 0x10FFFF) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5254
                    PyErr_SetString(PyExc_TypeError,
5255
                                    "character mapping must be in range(0x110000)");
Benjamin Peterson's avatar
Benjamin Peterson committed
5256 5257 5258
                    Py_DECREF(x);
                    goto onError;
                }
5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282

#ifndef Py_UNICODE_WIDE
                if (value > 0xFFFF) {
                    /* see the code for 1-n mapping below */
                    if (extrachars < 2) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = 10 - extrachars;
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    value -= 0x10000;
                    *p++ = 0xD800 | (value >> 10);
                    *p++ = 0xDC00 | (value & 0x3FF);
                    extrachars -= 2;
                }
                else
#endif
Benjamin Peterson's avatar
Benjamin Peterson committed
5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        &starts, &e, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
5298
                Py_DECREF(x);
Benjamin Peterson's avatar
Benjamin Peterson committed
5299
                continue;
5300
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
5331
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
5332 5333 5334 5335 5336 5337 5338 5339 5340
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or str");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
5341
        }
5342 5343
    }
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Benjamin Peterson's avatar
Benjamin Peterson committed
5344 5345
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
5346 5347
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
5348
    return (PyObject *)v;
5349

Benjamin Peterson's avatar
Benjamin Peterson committed
5350
  onError:
5351 5352
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
5353 5354 5355 5356
    Py_XDECREF(v);
    return NULL;
}

5357 5358 5359
/* Charmap encoding: the lookup table */

struct encoding_map{
Benjamin Peterson's avatar
Benjamin Peterson committed
5360 5361 5362 5363
    PyObject_HEAD
    unsigned char level1[32];
    int count2, count3;
    unsigned char level23[1];
5364 5365 5366 5367 5368 5369
};

/* "size" method of EncodingMap objects: returns, as a Python int, the
   struct size minus the one-element placeholder plus the bytes occupied
   by the level-2 and level-3 blocks.  `args` is unused. */
static PyObject *
encoding_map_size(PyObject *obj, PyObject *args)
{
    struct encoding_map *map = (struct encoding_map *)obj;
    size_t nbytes = sizeof(*map) - 1;

    nbytes += 16*map->count2;
    nbytes += 128*map->count3;
    return PyLong_FromLong(nbytes);
}

static PyMethodDef encoding_map_methods[] = {
5375
    {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson's avatar
Benjamin Peterson committed
5376 5377
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
5378 5379 5380 5381 5382
};

/* tp_dealloc for EncodingMap objects: releases the object's memory with
   PyObject_FREE().  The object holds no references to other objects
   that would need releasing first. */
static void
encoding_map_dealloc(PyObject *o)
{
    PyObject_FREE(o);
}

static PyTypeObject EncodingMapType = {
5387
    PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
5388 5389 5390 5391 5392 5393 5394 5395
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
5396
    0,                      /*tp_reserved*/
Benjamin Peterson's avatar
Benjamin Peterson committed
5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458
};

/* Build an encoding lookup object from a 256-character decoding table.

   `string` must be a unicode object of length 256 whose i-th character is
   the code point that byte i decodes to (0xFFFE marks an unmapped byte);
   otherwise PyErr_BadArgument() is called and NULL is returned.

   The result is either
     - a dict mapping code point -> byte value, used when byte 0 does not
       map to code point 0, when another byte maps to code point 0, when a
       non-BMP character is present (wide builds), or when the trie would
       need 0xFF or more blocks at a level; or
     - an EncodingMap object holding a three-level trie.

   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_BuildEncodingMap(PyObject *string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0;
    int count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    need_dict = (decode[0] != 0);

    /* First pass: count how many level-2 and level-3 blocks are needed */
    for (i = 1; i < 256; i++) {
        Py_UNICODE ch = decode[i];
        int l1, l2;

        if (ch == 0
#ifdef Py_UNICODE_WIDE
            || ch > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is reserved as the "unused" marker, so block numbers must
       stay below it */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        PyObject *dict = PyDict_New();
        PyObject *key, *value;

        if (dict == NULL)
            return NULL;
        for (i = 0; i < 256; i++) {
            key = PyLong_FromLong(decode[i]);
            value = PyLong_FromLong(i);
            if (key == NULL || value == NULL)
                goto failed1;
            if (PyDict_SetItem(dict, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return dict;

      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(dict);
        return NULL;
    }

    /* Create a three-level trie */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (result == NULL)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map *)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: hand out level-3 blocks again and fill in byte values */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        Py_UNICODE ch = decode[i];
        int o1, o2, o3, i2, i3;

        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        o1 = ch >> 11;
        o2 = (ch >> 7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}

/* Look up character `c` in an EncodingMap trie.

   Returns the mapped byte value, 0 when c is 0, or -1 when the character
   has no entry (on wide builds this includes anything above 0xFFFF).
   `mapping` is cast to struct encoding_map without a type check. */
static int
encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
{
    struct encoding_map *map = (struct encoding_map *)mapping;
    int block2, block3, value;

#ifdef Py_UNICODE_WIDE
    if (c > 0xFFFF)
        return -1;
#endif
    if (c == 0)
        return 0;

    /* level 1: top bits select a level-2 block */
    block2 = map->level1[c >> 11];
    if (block2 == 0xFF)
        return -1;

    /* level 2: next four bits select a level-3 block */
    block3 = map->level23[16*block2 + ((c >> 7) & 0xF)];
    if (block3 == 0xFF)
        return -1;

    /* level 3: low seven bits select the byte; 0 means "no entry" */
    value = map->level23[16*map->count2 + 128*block3 + (c & 0x7F)];
    if (value == 0)
        return -1;
    return value;
}

5571 5572
/* Lookup the character ch in the mapping. If the character
   can't be found, Py_None is returned (or NULL, if another
Fred Drake's avatar
Fred Drake committed
5573
   error occurred). */
5574 5575
static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
{
5576
    PyObject *w = PyLong_FromLong((long)c);
5577 5578 5579
    PyObject *x;

    if (w == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5580
        return NULL;
5581 5582 5583
    x = PyObject_GetItem(mapping, w);
    Py_DECREF(w);
    if (x == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5584 5585 5586 5587 5588 5589 5590 5591
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
            /* No mapping found means: mapping is undefined. */
            PyErr_Clear();
            x = Py_None;
            Py_INCREF(x);
            return x;
        } else
            return NULL;
5592
    }
5593
    else if (x == Py_None)
Benjamin Peterson's avatar
Benjamin Peterson committed
5594
        return x;
5595
    else if (PyLong_Check(x)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5596 5597 5598 5599 5600 5601 5602 5603
        long value = PyLong_AS_LONG(x);
        if (value < 0 || value > 255) {
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must be in range(256)");
            Py_DECREF(x);
            return NULL;
        }
        return x;
5604
    }
5605
    else if (PyBytes_Check(x))
Benjamin Peterson's avatar
Benjamin Peterson committed
5606
        return x;
5607
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
5608 5609 5610 5611 5612 5613
        /* wrong return value */
        PyErr_Format(PyExc_TypeError,
                     "character mapping must return integer, bytes or None, not %.400s",
                     x->ob_type->tp_name);
        Py_DECREF(x);
        return NULL;
5614
    }
5615 5616
}

5617
static int
5618
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5619
{
5620 5621 5622 5623 5624 5625 5626
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
    /* exponentially overallocate to minimize reallocations */
    if (requiredsize < 2*outsize)
        requiredsize = 2*outsize;
    if (_PyBytes_Resize(outobj, requiredsize))
        return -1;
    return 0;
5627 5628
}

5629
/* Outcome of encoding a single character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
5632
/* lookup the character, put the result in the output string and adjust
5633
   various state variables. Resize the output bytes object if not enough
5634 5635 5636
   space is available. Return a new reference to the object that
   was put in the output buffer, or Py_None, if the mapping was undefined
   (in which case no character was written) or NULL, if a
5637
   reallocation error occurred. The caller must decref the result */
5638
static
5639
charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
Benjamin Peterson's avatar
Benjamin Peterson committed
5640
                                          PyObject **outobj, Py_ssize_t *outpos)
5641
{
5642 5643
    PyObject *rep;
    char *outstart;
5644
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5645

5646
    if (Py_TYPE(mapping) == &EncodingMapType) {
5647
        int res = encoding_map_lookup(c, mapping);
Benjamin Peterson's avatar
Benjamin Peterson committed
5648
        Py_ssize_t requiredsize = *outpos+1;
5649 5650
        if (res == -1)
            return enc_FAILED;
Benjamin Peterson's avatar
Benjamin Peterson committed
5651 5652 5653
        if (outsize<requiredsize)
            if (charmapencode_resize(outobj, outpos, requiredsize))
                return enc_EXCEPTION;
5654
        outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson's avatar
Benjamin Peterson committed
5655 5656
        outstart[(*outpos)++] = (char)res;
        return enc_SUCCESS;
5657 5658 5659
    }

    rep = charmapencode_lookup(c, mapping);
5660
    if (rep==NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5661
        return enc_EXCEPTION;
5662
    else if (rep==Py_None) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5663 5664
        Py_DECREF(rep);
        return enc_FAILED;
5665
    } else {
Benjamin Peterson's avatar
Benjamin Peterson committed
5666 5667 5668 5669 5670 5671 5672
        if (PyLong_Check(rep)) {
            Py_ssize_t requiredsize = *outpos+1;
            if (outsize<requiredsize)
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
5673
            outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson's avatar
Benjamin Peterson committed
5674
            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5675
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
5676 5677 5678 5679 5680 5681 5682 5683 5684
        else {
            const char *repchars = PyBytes_AS_STRING(rep);
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
            Py_ssize_t requiredsize = *outpos+repsize;
            if (outsize<requiredsize)
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
                    Py_DECREF(rep);
                    return enc_EXCEPTION;
                }
5685
            outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson's avatar
Benjamin Peterson committed
5686 5687 5688
            memcpy(outstart + *outpos, repchars, repsize);
            *outpos += repsize;
        }
5689
    }
5690 5691
    Py_DECREF(rep);
    return enc_SUCCESS;
5692 5693 5694 5695 5696 5697
}

/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error */
static
int charmap_encoding_error(
Martin v. Löwis's avatar
Martin v. Löwis committed
5698
    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5699
    PyObject **exceptionObject,
5700
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5701
    PyObject **res, Py_ssize_t *respos)
5702 5703
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis's avatar
Martin v. Löwis committed
5704 5705
    Py_ssize_t repsize;
    Py_ssize_t newpos;
5706 5707
    Py_UNICODE *uni2;
    /* startpos for collecting unencodable chars */
Martin v. Löwis's avatar
Martin v. Löwis committed
5708 5709 5710
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
5711 5712
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
5713
    charmapencode_result x;
5714 5715 5716

    /* find all unencodable characters */
    while (collendpos < size) {
5717
        PyObject *rep;
5718
        if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5719 5720 5721 5722 5723 5724
            int res = encoding_map_lookup(p[collendpos], mapping);
            if (res != -1)
                break;
            ++collendpos;
            continue;
        }
5725

Benjamin Peterson's avatar
Benjamin Peterson committed
5726 5727 5728 5729 5730 5731 5732
        rep = charmapencode_lookup(p[collendpos], mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
5733
        Py_DECREF(rep);
Benjamin Peterson's avatar
Benjamin Peterson committed
5734
        ++collendpos;
5735 5736 5737 5738
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5739 5740 5741 5742 5743 5744 5745 5746 5747 5748
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
5749 5750
    }
    switch (*known_errorHandler) {
5751 5752 5753 5754 5755
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5756 5757 5758 5759 5760 5761 5762 5763
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
                return -1;
            }
5764 5765 5766 5767 5768 5769 5770 5771
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate replacement (temporarily (mis)uses p) */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)p[collpos]);
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
                    return -1;
                }
5783 5784 5785 5786 5787 5788
            }
        }
        *inpos = collendpos;
        break;
    default:
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson's avatar
Benjamin Peterson committed
5789 5790
                                                      encoding, reason, p, size, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
5791
        if (repunicode == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5792
            return -1;
5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
5809
            Py_DECREF(repunicode);
5810
            break;
5811
        }
5812 5813 5814
        /* generate replacement  */
        repsize = PyUnicode_GET_SIZE(repunicode);
        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5815 5816 5817 5818 5819 5820 5821 5822 5823
            x = charmapencode_output(*uni2, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
                return -1;
            }
5824 5825 5826
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
5827 5828
    }
    return 0;
5829 5830 5831
}

PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
5832 5833 5834
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
5835
{
5836 5837 5838
    /* output object */
    PyObject *res = NULL;
    /* current input position */
Martin v. Löwis's avatar
Martin v. Löwis committed
5839
    Py_ssize_t inpos = 0;
5840
    /* current output position */
Martin v. Löwis's avatar
Martin v. Löwis committed
5841
    Py_ssize_t respos = 0;
5842 5843 5844 5845 5846 5847
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;
5848 5849 5850

    /* Default to Latin-1 */
    if (mapping == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5851
        return PyUnicode_EncodeLatin1(p, size, errors);
5852

5853 5854
    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
5855
    res = PyBytes_FromStringAndSize(NULL, size);
5856 5857
    if (res == NULL)
        goto onError;
5858
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
5859
        return res;
5860

5861
    while (inpos<size) {
Benjamin Peterson's avatar
Benjamin Peterson committed
5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872
        /* try to encode it */
        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
        if (x==enc_EXCEPTION) /* error */
            goto onError;
        if (x==enc_FAILED) { /* unencodable character */
            if (charmap_encoding_error(p, size, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos)) {
                goto onError;
            }
5873
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
5874 5875 5876
        else
            /* done with this character => adjust input position */
            ++inpos;
5877
    }
5878

5879
    /* Resize if we allocated to much */
5880
    if (respos<PyBytes_GET_SIZE(res))
5881 5882
        if (_PyBytes_Resize(&res, respos) < 0)
            goto onError;
5883

5884 5885 5886
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;
5887

Benjamin Peterson's avatar
Benjamin Peterson committed
5888
  onError:
5889 5890 5891
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
5892 5893 5894 5895
    return NULL;
}

/* Encode the unicode object `unicode` with `mapping`, passing NULL as the
   error-handler name.  Both arguments are required: a non-unicode object
   or a NULL mapping is rejected with PyErr_BadArgument(). */
PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
                                    PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL)
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);

    PyErr_BadArgument();
    return NULL;
}

5908 5909
/* create or adjust a UnicodeTranslateError */
static void make_translate_exception(PyObject **exceptionObject,
Benjamin Peterson's avatar
Benjamin Peterson committed
5910 5911 5912
                                     const Py_UNICODE *unicode, Py_ssize_t size,
                                     Py_ssize_t startpos, Py_ssize_t endpos,
                                     const char *reason)
5913 5914
{
    if (*exceptionObject == NULL) {
5915
        *exceptionObject = PyUnicodeTranslateError_Create(
Benjamin Peterson's avatar
Benjamin Peterson committed
5916
            unicode, size, startpos, endpos, reason);
5917 5918
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
5919 5920 5921 5922 5923 5924 5925 5926 5927 5928
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
            goto onError;
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
            goto onError;
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
            goto onError;
        return;
      onError:
        Py_DECREF(*exceptionObject);
        *exceptionObject = NULL;
5929 5930 5931 5932 5933
    }
}

/* raises a UnicodeTranslateError */
static void raise_translate_exception(PyObject **exceptionObject,
Benjamin Peterson's avatar
Benjamin Peterson committed
5934 5935 5936
                                      const Py_UNICODE *unicode, Py_ssize_t size,
                                      Py_ssize_t startpos, Py_ssize_t endpos,
                                      const char *reason)
5937 5938
{
    make_translate_exception(exceptionObject,
Benjamin Peterson's avatar
Benjamin Peterson committed
5939
                             unicode, size, startpos, endpos, reason);
5940
    if (*exceptionObject != NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
5941
        PyCodec_StrictErrors(*exceptionObject);
5942 5943 5944 5945 5946 5947 5948
}

/* Error handling callback helper for translation:
   look up the handler named `errors` (cached in *errorHandler), prepare
   the exception object, call the handler and validate its result, which
   must be a (str, int) tuple.  The int is stored in *newpos; a negative
   value is taken relative to `size`, and the final position must lie in
   0..size.
   Returns a new reference to the replacement string, which has to be
   released by the caller, or NULL on error. */
static PyObject *unicode_translate_call_errorhandler(const char *errors,
                                                     PyObject **errorHandler,
                                                     const char *reason,
                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
                                                     Py_ssize_t startpos, Py_ssize_t endpos,
                                                     Py_ssize_t *newpos)
{
    /* PyArg_ParseTuple format; the text after the ';' doubles as the
       TypeError message for a non-tuple result */
    static char *argparse = "O!n;translating error handler must return (str, int) tuple";

    PyObject *restuple;
    PyObject *resunicode;
    Py_ssize_t i_newpos;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            return NULL;
    }

    make_translate_exception(exceptionObject,
                             unicode, size, startpos, endpos, reason);
    if (*exceptionObject == NULL)
        return NULL;

    restuple = PyObject_CallFunctionObjArgs(
        *errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        return NULL;

    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, argparse + 4);
        goto badResult;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
                          &resunicode, &i_newpos))
        goto badResult;

    /* negative positions count from the end of the input */
    *newpos = (i_newpos < 0) ? size + i_newpos : i_newpos;
    if (*newpos < 0 || *newpos > size) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
        goto badResult;
    }

    /* resunicode is only borrowed from the tuple: take our own reference
       before the tuple goes away */
    Py_INCREF(resunicode);
    Py_DECREF(restuple);
    return resunicode;

  badResult:
    Py_DECREF(restuple);
    return NULL;
}

/* Lookup the character ch in the mapping and put the result in result,
   which must be decrefed by the caller.
   Return 0 on success, -1 on error */
6003
static
6004 6005
int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
{
6006
    PyObject *w = PyLong_FromLong((long)c);
6007 6008 6009
    PyObject *x;

    if (w == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
6010
        return -1;
6011 6012 6013
    x = PyObject_GetItem(mapping, w);
    Py_DECREF(w);
    if (x == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6014 6015 6016 6017 6018 6019 6020
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
            /* No mapping found means: use 1:1 mapping. */
            PyErr_Clear();
            *result = NULL;
            return 0;
        } else
            return -1;
6021
    }
6022
    else if (x == Py_None) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6023 6024
        *result = x;
        return 0;
6025
    }
6026
    else if (PyLong_Check(x)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6027 6028 6029 6030
        long value = PyLong_AS_LONG(x);
        long max = PyUnicode_GetMax();
        if (value < 0 || value > max) {
            PyErr_Format(PyExc_TypeError,
6031
                         "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson's avatar
Benjamin Peterson committed
6032 6033 6034 6035 6036
            Py_DECREF(x);
            return -1;
        }
        *result = x;
        return 0;
6037 6038
    }
    else if (PyUnicode_Check(x)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6039 6040
        *result = x;
        return 0;
6041 6042
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
6043 6044 6045 6046 6047
        /* wrong return value */
        PyErr_SetString(PyExc_TypeError,
                        "character mapping must return integer, None or str");
        Py_DECREF(x);
        return -1;
6048 6049
    }
}
6050
/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson's avatar
Benjamin Peterson committed
6051 6052
   if not reallocate and adjust various state variables.
   Return 0 on success, -1 on error */
6053
static
6054
int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Benjamin Peterson's avatar
Benjamin Peterson committed
6055
                               Py_ssize_t requiredsize)
6056
{
Martin v. Löwis's avatar
Martin v. Löwis committed
6057
    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6058
    if (requiredsize > oldsize) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6059 6060 6061 6062 6063 6064 6065 6066
        /* remember old output position */
        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
        /* exponentially overallocate to minimize reallocations */
        if (requiredsize < 2 * oldsize)
            requiredsize = 2 * oldsize;
        if (PyUnicode_Resize(outobj, requiredsize) < 0)
            return -1;
        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6067 6068 6069 6070 6071 6072 6073 6074 6075 6076
    }
    return 0;
}
/* lookup the character, put the result in the output string and adjust
   various state variables. Return a new reference to the object that
   was put in the output buffer in *result, or Py_None, if the mapping was
   undefined (in which case no character was written).
   The called must decref result.
   Return 0 on success, -1 on error. */
static
6077
int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
Benjamin Peterson's avatar
Benjamin Peterson committed
6078 6079
                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
                            PyObject **res)
6080
{
6081
    if (charmaptranslate_lookup(*curinp, mapping, res))
Benjamin Peterson's avatar
Benjamin Peterson committed
6082
        return -1;
6083
    if (*res==NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6084 6085
        /* not found => default to 1:1 mapping */
        *(*outp)++ = *curinp;
6086 6087
    }
    else if (*res==Py_None)
Benjamin Peterson's avatar
Benjamin Peterson committed
6088
        ;
6089
    else if (PyLong_Check(*res)) {
6090
        /* no overflow check, because we know that the space is enough */
Benjamin Peterson's avatar
Benjamin Peterson committed
6091
        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6092
    }
Benjamin Peterson's avatar
Benjamin Peterson committed
6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108
    else if (PyUnicode_Check(*res)) {
        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
        if (repsize==1) {
            /* no overflow check, because we know that the space is enough */
            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
        }
        else if (repsize!=0) {
            /* more than one character */
            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
                (insize - (curinp-startinp)) +
                repsize - 1;
            if (charmaptranslate_makespace(outobj, outp, requiredsize))
                return -1;
            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
            *outp += repsize;
        }
6109 6110
    }
    else
Benjamin Peterson's avatar
Benjamin Peterson committed
6111
        return -1;
6112 6113
    return 0;
}
6114

6115
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Benjamin Peterson's avatar
Benjamin Peterson committed
6116 6117 6118
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
6119
{
6120 6121 6122 6123 6124 6125 6126 6127
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
Martin v. Löwis's avatar
Martin v. Löwis committed
6128
    Py_ssize_t respos = 0;
6129 6130 6131 6132 6133 6134 6135 6136
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

6137
    if (mapping == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6138 6139
        PyErr_BadArgument();
        return NULL;
6140 6141
    }

6142 6143 6144 6145
    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
6146
        goto onError;
6147
    if (size == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
6148
        return res;
6149 6150 6151
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6152 6153 6154 6155
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
6156 6157
            goto onError;
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
6179
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", (int)*p);
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement  */
                repsize = PyUnicode_GET_SIZE(repunicode);
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                p = startp + newpos;
                Py_DECREF(repunicode);
6237 6238 6239
            }
        }
    }
6240 6241
    /* Resize if we allocated to much */
    respos = str-PyUnicode_AS_UNICODE(res);
6242
    if (respos<PyUnicode_GET_SIZE(res)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6243 6244
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
6245 6246 6247 6248
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;
6249

Benjamin Peterson's avatar
Benjamin Peterson committed
6250
  onError:
6251 6252 6253
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
6254 6255 6256 6257
    return NULL;
}

/* Convert `str` with PyUnicode_FromObject() and translate the resulting
   unicode object through `mapping` with PyUnicode_TranslateCharmap().
   Returns the translated object, or NULL on error. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *translated;
    PyObject *ustr = PyUnicode_FromObject(str);

    if (ustr == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(ustr),
                                            PyUnicode_GET_SIZE(ustr),
                                            mapping,
                                            errors);
    /* drop the reference obtained from PyUnicode_FromObject() */
    Py_DECREF(ustr);
    return translated;
}
6277

6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301
/* Return a new unicode object holding a copy of the `length` characters
   at `s` in which every character above 127 that has a decimal value
   according to Py_UNICODE_TODECIMAL() is replaced by '0' plus that value.
   All other characters are copied unchanged.
   Returns NULL if the result object cannot be allocated. */
PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
                                  Py_ssize_t length)
{
    PyObject *result;
    Py_UNICODE *p; /* write pointer into result */
    Py_ssize_t i;
    /* Copy to a new string */
    result = (PyObject *)_PyUnicode_New(length);
    /* the allocation must be checked before the buffer is written to */
    if (result == NULL)
        return NULL;
    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
    p = PyUnicode_AS_UNICODE(result);
    /* Iterate over code points */
    for (i = 0; i < length; i++) {
        Py_UNICODE ch = s[i];
        if (ch > 127) {
            int decimal = Py_UNICODE_TODECIMAL(ch);
            if (decimal >= 0)
                p[i] = '0' + decimal;
        }
    }
    return result;
}
6302 6303 6304
/* --- Decimal Encoder ---------------------------------------------------- */

int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Benjamin Peterson's avatar
Benjamin Peterson committed
6305 6306 6307
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
6308 6309
{
    Py_UNICODE *p, *end;
6310 6311 6312 6313 6314 6315 6316
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;
6317 6318

    if (output == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6319 6320
        PyErr_BadArgument();
        return -1;
6321 6322 6323 6324 6325
    }

    p = s;
    end = s + length;
    while (p < end) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6326 6327 6328 6329 6330 6331 6332 6333
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;
6334

Benjamin Peterson's avatar
Benjamin Peterson committed
6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352
        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable */
        collstart = p;
6353
        for (collend = p+1; collend < end; collend++) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6354
            if ((0 < *collend && *collend < 256) ||
6355 6356
                Py_UNICODE_ISSPACE(*collend) ||
                0 <= Py_UNICODE_TODECIMAL(*collend))
Benjamin Peterson's avatar
Benjamin Peterson committed
6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
6374
        case 1: /* strict */
Benjamin Peterson's avatar
Benjamin Peterson committed
6375 6376
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
6377
        case 2: /* replace */
Benjamin Peterson's avatar
Benjamin Peterson committed
6378 6379 6380
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
6381
        case 3: /* ignore */
Benjamin Peterson's avatar
Benjamin Peterson committed
6382 6383
            p = collend;
            break;
6384
        case 4: /* xmlcharrefreplace */
Benjamin Peterson's avatar
Benjamin Peterson committed
6385 6386 6387 6388 6389
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend; ++p)
                output += sprintf(output, "&#%d;", (int)*p);
            p = collend;
            break;
6390
        default:
Benjamin Peterson's avatar
Benjamin Peterson committed
6391 6392 6393 6394
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
6395
                goto onError;
6396
            if (!PyUnicode_Check(repunicode)) {
6397
                /* Byte results not supported, since they have no decimal property. */
6398 6399 6400 6401
                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
                Py_DECREF(repunicode);
                goto onError;
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420
            /* generate replacement  */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE ch = *uni2;
                if (Py_UNICODE_ISSPACE(ch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(ch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < ch && ch < 256)
                        *output++ = (char)ch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
6421
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
6422 6423
            p = s + newpos;
            Py_DECREF(repunicode);
6424
        }
6425 6426 6427
    }
    /* 0-terminate the output string */
    *output++ = '\0';
6428 6429
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
6430 6431
    return 0;

Benjamin Peterson's avatar
Benjamin Peterson committed
6432
  onError:
6433 6434
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
6435 6436 6437
    return -1;
}

6438 6439
/* --- Helpers ------------------------------------------------------------ */

6440
#include "stringlib/unicodedefs.h"
6441
#include "stringlib/fastsearch.h"
6442

6443 6444 6445
#include "stringlib/count.h"
#include "stringlib/find.h"
#include "stringlib/partition.h"
6446
#include "stringlib/split.h"
6447

6448
#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6449
#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6450 6451
#include "stringlib/localeutil.h"

6452
/* Helper macro to fix up start/end slice values in place.
 *
 * 'start' and 'end' must be modifiable lvalues; 'len' is the length of the
 * sequence being sliced and may be evaluated more than once.
 *   - 'end' larger than 'len' is clipped to 'len';
 *   - a negative 'end' or 'start' is offset by 'len' and, if still
 *     negative, set to 0.
 * 'start' is not clipped against 'len'.
 *
 * Every argument is parenthesized so that an expression argument (for
 * example a conditional expression passed as 'len') is parsed as a unit.
 * The expansion is two consecutive statements, so it must not be used as
 * the unbraced body of an if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
6466

6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 *       (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
/* True if 'ch' lies in the high (lead) surrogate range U+D800..U+DBFF. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
/* True if 'ch' lies in the low (trail) surrogate range U+DC00..U+DFFF. */
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide build: one unit is one character; the result is cast to Py_UCS4. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: a surrogate pair is combined only when both of its units
 * lie before 'end'.  The bounds test comes first so that neither *(ptr)
 * nor (ptr)[1] is read unless it is inside [ptr, end); a high surrogate
 * in the last position is returned unchanged.
 */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((end) - (ptr) > 1) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
6497

Martin v. Löwis's avatar
Martin v. Löwis committed
6498
/* Count occurrences of substr in str[start:end].
 *
 * Both arguments are first converted with PyUnicode_FromObject(); the
 * slice bounds are normalized with ADJUST_INDICES against the length of
 * the converted string.  Returns the value computed by stringlib_count(),
 * or -1 if either conversion fails.
 */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;
    Py_ssize_t count;

    haystack = (PyUnicodeObject *) PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *) PyUnicode_FromObject(substr);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -1;
    }

    ADJUST_INDICES(start, end, haystack->length);
    count = stringlib_count(haystack->str + start, end - start,
                            needle->str, needle->length,
                            PY_SSIZE_T_MAX);

    /* Drop the references obtained from the two conversions. */
    Py_DECREF(needle);
    Py_DECREF(haystack);

    return count;
}

Martin v. Löwis's avatar
Martin v. Löwis committed
6528
/* Search for sub in str[start:end].
 *
 * direction > 0 searches with stringlib_find_slice(), any other value
 * with stringlib_rfind_slice(); the result of that call is returned.
 * Returns -2 if either argument cannot be converted with
 * PyUnicode_FromObject().
 */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    PyObject *haystack;
    PyObject *needle;
    Py_ssize_t pos;

    haystack = PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -2;

    needle = PyUnicode_FromObject(sub);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -2;
    }

    pos = (direction > 0)
        ? stringlib_find_slice(
              PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
              PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
              start, end)
        : stringlib_rfind_slice(
              PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
              PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
              start, end);

    Py_DECREF(haystack);
    Py_DECREF(needle);

    return pos;
}

6564
static
6565
int tailmatch(PyUnicodeObject *self,
Benjamin Peterson's avatar
Benjamin Peterson committed
6566 6567 6568 6569
              PyUnicodeObject *substring,
              Py_ssize_t start,
              Py_ssize_t end,
              int direction)
6570 6571 6572 6573
{
    if (substring->length == 0)
        return 1;

6574
    ADJUST_INDICES(start, end, self->length);
6575 6576
    end -= substring->length;
    if (end < start)
Benjamin Peterson's avatar
Benjamin Peterson committed
6577
        return 0;
6578 6579

    if (direction > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6580 6581
        if (Py_UNICODE_MATCH(self, end, substring))
            return 1;
6582 6583
    } else {
        if (Py_UNICODE_MATCH(self, start, substring))
Benjamin Peterson's avatar
Benjamin Peterson committed
6584
            return 1;
6585 6586 6587 6588 6589
    }

    return 0;
}

Martin v. Löwis's avatar
Martin v. Löwis committed
6590
/* Public wrapper around tailmatch(): converts both arguments with
 * PyUnicode_FromObject() and returns tailmatch()'s result, or -1 if
 * either conversion fails.
 */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    PyObject *text;
    PyObject *affix;
    Py_ssize_t matched;

    text = PyUnicode_FromObject(str);
    if (text == NULL)
        return -1;

    affix = PyUnicode_FromObject(substr);
    if (affix == NULL) {
        Py_DECREF(text);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)text,
                        (PyUnicodeObject *)affix,
                        start, end, direction);

    Py_DECREF(text);
    Py_DECREF(affix);
    return matched;
}

/* Apply fixfct filter to the Unicode object self and return a
   reference to the modified object */

6618
static
6619
PyObject *fixup(PyUnicodeObject *self,
Benjamin Peterson's avatar
Benjamin Peterson committed
6620
                int (*fixfct)(PyUnicodeObject *s))
6621 6622 6623 6624
{

    PyUnicodeObject *u;

6625
    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6626
    if (u == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
6627
        return NULL;
6628 6629 6630

    Py_UNICODE_COPY(u->str, self->str, self->length);

6631
    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6632 6633 6634 6635 6636 6637
        /* fixfct should return TRUE if it modified the buffer. If
           FALSE, return a reference to the original buffer instead
           (to save space, not time) */
        Py_INCREF(self);
        Py_DECREF(u);
        return (PyObject*) self;
6638 6639 6640 6641
    }
    return (PyObject*) u;
}

6642
static
6643 6644
int fixupper(PyUnicodeObject *self)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
6645
    Py_ssize_t len = self->length;
6646 6647
    Py_UNICODE *s = self->str;
    int status = 0;
6648

6649
    while (len-- > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6650
        register Py_UNICODE ch;
6651

Benjamin Peterson's avatar
Benjamin Peterson committed
6652 6653
        ch = Py_UNICODE_TOUPPER(*s);
        if (ch != *s) {
6654
            status = 1;
Benjamin Peterson's avatar
Benjamin Peterson committed
6655 6656
            *s = ch;
        }
6657 6658 6659 6660 6661 6662
        s++;
    }

    return status;
}

6663
static
6664 6665
int fixlower(PyUnicodeObject *self)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
6666
    Py_ssize_t len = self->length;
6667 6668
    Py_UNICODE *s = self->str;
    int status = 0;
6669

6670
    while (len-- > 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6671
        register Py_UNICODE ch;
6672

Benjamin Peterson's avatar
Benjamin Peterson committed
6673 6674
        ch = Py_UNICODE_TOLOWER(*s);
        if (ch != *s) {
6675
            status = 1;
Benjamin Peterson's avatar
Benjamin Peterson committed
6676 6677
            *s = ch;
        }
6678 6679 6680 6681 6682 6683
        s++;
    }

    return status;
}

6684
static
6685 6686
int fixswapcase(PyUnicodeObject *self)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
6687
    Py_ssize_t len = self->length;
6688 6689
    Py_UNICODE *s = self->str;
    int status = 0;
6690

6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704
    while (len-- > 0) {
        if (Py_UNICODE_ISUPPER(*s)) {
            *s = Py_UNICODE_TOLOWER(*s);
            status = 1;
        } else if (Py_UNICODE_ISLOWER(*s)) {
            *s = Py_UNICODE_TOUPPER(*s);
            status = 1;
        }
        s++;
    }

    return status;
}

6705
static
6706 6707
int fixcapitalize(PyUnicodeObject *self)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
6708
    Py_ssize_t len = self->length;
6709 6710
    Py_UNICODE *s = self->str;
    int status = 0;
6711

6712
    if (len == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
6713
        return 0;
6714
    if (!Py_UNICODE_ISUPPER(*s)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
6715 6716
        *s = Py_UNICODE_TOUPPER(*s);
        status = 1;
6717
    }
6718 6719
    s++;
    while (--len > 0) {
6720
        if (!Py_UNICODE_ISLOWER(*s)) {
6721 6722 6723 6724 6725 6726
            *s = Py_UNICODE_TOLOWER(*s);
            status = 1;
        }
        s++;
    }
    return status;
6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737
}

/* Title-case self's buffer in place.
 *
 * A character that follows a cased character (lower, upper or title
 * case) is mapped with Py_UNICODE_TOLOWER; any other character is mapped
 * with Py_UNICODE_TOTITLE.
 *
 * Returns 1 only if a character value actually changed, 0 otherwise.
 * The loop covers strings of every length, including one character and
 * the empty string.
 */
static
int fixtitle(PyUnicodeObject *self)
{
    Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    Py_UNICODE *e = p + PyUnicode_GET_SIZE(self);
    int previous_is_cased = 0;
    int status = 0;

    for (; p < e; p++) {
        const Py_UNICODE ch = *p;
        const Py_UNICODE mapped = previous_is_cased
            ? Py_UNICODE_TOLOWER(ch)
            : Py_UNICODE_TOTITLE(ch);

        if (mapped != ch) {
            *p = mapped;
            status = 1;
        }

        /* The decision for the next character is based on the original
           value of this one, not on the mapped value. */
        previous_is_cased = (Py_UNICODE_ISLOWER(ch) ||
                             Py_UNICODE_ISUPPER(ch) ||
                             Py_UNICODE_ISTITLE(ch));
    }
    return status;
}

6767 6768
/* Join the items of seq, inserting separator between consecutive items.
 *
 * separator may be NULL, in which case a single space is used.  Every
 * item of seq must be a str instance, otherwise TypeError is set.
 * Special cases: an empty sequence yields an empty string, and a
 * one-element sequence holding an exact unicode object yields that
 * object itself.
 *
 * Returns a new reference, or NULL with an exception set by this
 * function (TypeError, OverflowError) or left by a failing callee.
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    const Py_UNICODE blank = ' ';
    const Py_UNICODE *sep = &blank;
    Py_ssize_t seplen = 1;
    PyUnicodeObject *res = NULL; /* the result */
    Py_UNICODE *res_p;       /* next free slot in res's character buffer */
    PyObject *fseq;          /* PySequence_Fast(seq) */
    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
    PyObject **items;
    PyObject *item;
    Py_ssize_t sz, i;

    fseq = PySequence_Fast(seq, "");
    if (fseq == NULL) {
        return NULL;
    }

    /* NOTE: the following code can't call back into Python code,
     * so we are sure that fseq won't be mutated.
     */

    seqlen = PySequence_Fast_GET_SIZE(fseq);
    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        res = _PyUnicode_New(0);
        goto Done;
    }
    items = PySequence_Fast_ITEMS(fseq);
    /* If singleton sequence with an exact Unicode, return that. */
    if (seqlen == 1) {
        item = items[0];
        if (PyUnicode_CheckExact(item)) {
            Py_INCREF(item);
            res = (PyUnicodeObject *)item;
            goto Done;
        }
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            sep = &blank;
            seplen = 1;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            sep = PyUnicode_AS_UNICODE(separator);
            seplen = PyUnicode_GET_SIZE(separator);
        }
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
    for (i = 0; i < seqlen; i++) {
        Py_ssize_t needed;   /* units contributed by this item */

        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        needed = PyUnicode_GET_SIZE(item);
        /* Each bound is tested before the addition it protects, so the
           signed sums below can never overflow. */
        if (i != 0) {
            if (needed > PY_SSIZE_T_MAX - seplen)
                goto overflow;
            needed += seplen;
        }
        if (needed > PY_SSIZE_T_MAX - sz)
            goto overflow;
        sz += needed;
    }

    res = _PyUnicode_New(sz);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
    res_p = PyUnicode_AS_UNICODE(res);
    for (i = 0; i < seqlen; ++i) {
        Py_ssize_t itemlen;
        item = items[i];
        itemlen = PyUnicode_GET_SIZE(item);
        /* Copy item, and maybe the separator. */
        if (i) {
            Py_UNICODE_COPY(res_p, sep, seplen);
            res_p += seplen;
        }
        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
        res_p += itemlen;
    }

  Done:
    Py_DECREF(fseq);
    return (PyObject *)res;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "join() result is too long for a Python string");
    /* fall through to the common error exit */

  onError:
    Py_DECREF(fseq);
    Py_XDECREF(res);
    return NULL;
}

6880 6881
/* Return self with 'left' fill characters in front and 'right' behind.
 *
 * Negative pad counts are treated as 0.  If no padding is needed and
 * self is an exact unicode object, a new reference to self is returned.
 * Sets OverflowError and returns NULL if the padded length would exceed
 * PY_SSIZE_T_MAX; also returns NULL if _PyUnicode_New() fails.
 */
static
PyUnicodeObject *pad(PyUnicodeObject *self,
                     Py_ssize_t left,
                     Py_ssize_t right,
                     Py_UNICODE fill)
{
    PyUnicodeObject *padded;
    Py_UNICODE *dst;

    left = (left < 0) ? 0 : left;
    right = (right < 0) ? 0 : right;

    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return self;
    }

    if (left > PY_SSIZE_T_MAX - self->length ||
        right > PY_SSIZE_T_MAX - (left + self->length)) {
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
        return NULL;
    }

    padded = _PyUnicode_New(left + self->length + right);
    if (padded == NULL)
        return NULL;

    dst = padded->str;
    if (left) {
        Py_UNICODE_FILL(dst, fill, left);
    }
    Py_UNICODE_COPY(dst + left, self->str, self->length);
    if (right) {
        Py_UNICODE_FILL(dst + left + self->length, fill, right);
    }

    return padded;
}

6915
/* Split string into lines via stringlib_splitlines(); keepends is
 * forwarded to that call unchanged.  Returns its result, or NULL if
 * string cannot be converted with PyUnicode_FromObject().
 */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *text;
    PyObject *lines;

    text = PyUnicode_FromObject(string);
    if (text == NULL)
        return NULL;

    lines = stringlib_splitlines(
        (PyObject*) text, PyUnicode_AS_UNICODE(text),
        PyUnicode_GET_SIZE(text), keepends);

    Py_DECREF(text);
    return lines;
}

6931 6932
/* Split self, scanning with the forward stringlib splitters.
 *
 * A NULL substring selects stringlib_split_whitespace(), otherwise
 * stringlib_split() is used with substring as separator.  A negative
 * maxcount is replaced by PY_SSIZE_T_MAX.
 */
static
PyObject *split(PyUnicodeObject *self,
                PyUnicodeObject *substring,
                Py_ssize_t maxcount)
{
    const Py_ssize_t limit = (maxcount < 0) ? PY_SSIZE_T_MAX : maxcount;

    if (substring != NULL)
        return stringlib_split(
            (PyObject*) self,  self->str, self->length,
            substring->str, substring->length,
            limit
            );

    return stringlib_split_whitespace(
        (PyObject*) self,  self->str, self->length, limit
        );
}

6951 6952
/* Split self using the stringlib "rsplit" splitters.
 *
 * A NULL substring selects stringlib_rsplit_whitespace(), otherwise
 * stringlib_rsplit() is used with substring as separator.  A negative
 * maxcount is replaced by PY_SSIZE_T_MAX.
 */
static
PyObject *rsplit(PyUnicodeObject *self,
                 PyUnicodeObject *substring,
                 Py_ssize_t maxcount)
{
    const Py_ssize_t limit = (maxcount < 0) ? PY_SSIZE_T_MAX : maxcount;

    if (substring != NULL)
        return stringlib_rsplit(
            (PyObject*) self,  self->str, self->length,
            substring->str, substring->length,
            limit
            );

    return stringlib_rsplit_whitespace(
        (PyObject*) self,  self->str, self->length, limit
        );
}

6971
static
6972
PyObject *replace(PyUnicodeObject *self,
Benjamin Peterson's avatar
Benjamin Peterson committed
6973 6974 6975
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
6976 6977 6978 6979
{
    PyUnicodeObject *u;

    if (maxcount < 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
6980
        maxcount = PY_SSIZE_T_MAX;
6981 6982
    else if (maxcount == 0 || self->length == 0)
        goto nothing;
6983

6984
    if (str1->length == str2->length) {
6985
        Py_ssize_t i;
6986
        /* same length */
6987 6988
        if (str1->length == 0)
            goto nothing;
6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
7006
        } else {
7007 7008
            i = stringlib_find(
                self->str, self->length, str1->str, str1->length, 0
7009
                );
7010 7011 7012 7013 7014 7015
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029

            /* change everything in-place, starting with this one */
            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
            i += str1->length;

            while ( --maxcount > 0) {
                i = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (i == -1)
                    break;
                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                i += str1->length;
            }
7030
        }
7031
    } else {
7032

7033
        Py_ssize_t n, i, j;
7034
        Py_ssize_t product, new_size, delta;
7035 7036 7037
        Py_UNICODE *p;

        /* replace strings */
7038 7039
        n = stringlib_count(self->str, self->length, str1->str, str1->length,
                            maxcount);
7040 7041 7042 7043 7044 7045
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        if (delta == 0) {
            new_size = self->length;
7046
        } else {
7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067
            product = n * (str2->length - str1->length);
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;
        p = u->str;
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
7068 7069 7070 7071 7072 7073
                j = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (j == -1)
                    break;
                else if (j > i) {
7074 7075 7076 7077 7078 7079 7080 7081
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
7082
                }
7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
7096
            }
7097
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
7098 7099 7100
        }
    }
    return (PyObject *) u;
7101

Benjamin Peterson's avatar
Benjamin Peterson committed
7102
  nothing:
7103 7104 7105 7106 7107 7108
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
7109 7110 7111 7112
}

/* --- Unicode Object Methods --------------------------------------------- */

7113
PyDoc_STRVAR(title__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7114
             "S.title() -> str\n\
7115 7116
\n\
Return a titlecased version of S, i.e. words start with title case\n\
7117
characters, all remaining cased characters have lower case.");
7118 7119

static PyObject*
7120
unicode_title(PyUnicodeObject *self)
7121 7122 7123 7124
{
    return fixup(self, fixtitle);
}

7125
PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7126
             "S.capitalize() -> str\n\
7127 7128
\n\
Return a capitalized version of S, i.e. make the first character\n\
7129
have upper case and the rest lower case.");
7130 7131

static PyObject*
7132
unicode_capitalize(PyUnicodeObject *self)
7133 7134 7135 7136 7137
{
    return fixup(self, fixcapitalize);
}

#if 0
7138
PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7139
             "S.capwords() -> str\n\
7140 7141
\n\
Apply .capitalize() to all words in S and return the result with\n\
7142
normalized whitespace (all whitespace strings are replaced by ' ').");
7143 7144

static PyObject*
7145
unicode_capwords(PyUnicodeObject *self)
7146 7147 7148
{
    PyObject *list;
    PyObject *item;
Martin v. Löwis's avatar
Martin v. Löwis committed
7149
    Py_ssize_t i;
7150 7151 7152 7153 7154 7155 7156 7157 7158

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson's avatar
Benjamin Peterson committed
7159
                     fixcapitalize);
7160 7161 7162 7163 7164 7165 7166 7167 7168
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

Benjamin Peterson's avatar
Benjamin Peterson committed
7169
  onError:
7170 7171 7172 7173 7174
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif

7175 7176 7177 7178 7179
/* Argument converter.  Coerces to a single unicode character.
 *
 * obj is converted with PyUnicode_FromObject(); if the result is exactly
 * one character long, that character is stored through addr (treated as
 * a Py_UNICODE *) and 1 is returned.  Otherwise TypeError is set and 0
 * is returned.
 */
static int
convert_uc(PyObject *obj, void *addr)
{
    PyObject *converted;
    int ok = 0;

    converted = PyUnicode_FromObject(obj);
    if (converted == NULL) {
        PyErr_SetString(PyExc_TypeError,
                        "The fill character cannot be converted to Unicode");
        return 0;
    }

    if (PyUnicode_GET_SIZE(converted) == 1) {
        Py_UNICODE *target = (Py_UNICODE *)addr;
        Py_UNICODE *chars = PyUnicode_AS_UNICODE(converted);

        *target = chars[0];
        ok = 1;
    }
    else {
        PyErr_SetString(PyExc_TypeError,
                        "The fill character must be exactly one character long");
    }

    Py_DECREF(converted);
    return ok;
}

7202
PyDoc_STRVAR(center__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7203
             "S.center(width[, fillchar]) -> str\n\
7204
\n\
7205
Return S centered in a string of length width. Padding is\n\
7206
done using the specified fill character (default is a space)");
7207 7208 7209 7210

static PyObject *
unicode_center(PyUnicodeObject *self, PyObject *args)
{
Martin v. Löwis's avatar
Martin v. Löwis committed
7211 7212
    Py_ssize_t marg, left;
    Py_ssize_t width;
7213
    Py_UNICODE fillchar = ' ';
7214

7215
    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7216 7217
        return NULL;

7218
    if (self->length >= width && PyUnicode_CheckExact(self)) {
7219 7220 7221 7222 7223 7224 7225
        Py_INCREF(self);
        return (PyObject*) self;
    }

    marg = width - self->length;
    left = marg / 2 + (marg & width & 1);

7226
    return (PyObject*) pad(self, left, marg - left, fillchar);
7227 7228
}

7229 7230 7231 7232
#if 0

/* This code should go into some future Unicode collation support
   module. The basic comparison should compare ordinals on a naive
7233
   basis (this is what Java does and thus Jython too). */
7234

Marc-André Lemburg's avatar
Marc-André Lemburg committed
7235 7236 7237 7238
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

7239
/* Per-2048-code-unit adjustment table used by the UTF-16 code point
   order comparison: a unit is offset by utf16Fixup[unit >> 11].  The
   table is only ever read, hence const. */
static const short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

7247 7248 7249
/* Compare two unicode objects unit by unit after adjusting each unit
 * with utf16Fixup.  Returns -1, 0 or 1.  If one string is a prefix of
 * the other, the shorter one compares as smaller.
 */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;
    Py_ssize_t len1 = str1->length;
    Py_ssize_t len2 = str2->length;

    for (; len1 > 0 && len2 > 0; len1--, len2--) {
        Py_UNICODE c1 = *s1++;
        Py_UNICODE c2 = *s2++;

        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;
    }

    /* all common positions are equal: decide on the remaining lengths */
    return (len1 < len2) ? -1 : (len1 != len2);
}

7279 7280 7281 7282 7283
#else

/* Compare two unicode objects by the ordinals of their code units.
 * Returns -1, 0 or 1.  If one string is a prefix of the other, the
 * shorter one compares as smaller.
 */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    const Py_UNICODE *s1 = str1->str;
    const Py_UNICODE *s2 = str2->str;
    const Py_ssize_t len1 = str1->length;
    const Py_ssize_t len2 = str2->length;
    const Py_ssize_t common = (len1 < len2) ? len1 : len2;
    Py_ssize_t idx;

    for (idx = 0; idx < common; idx++) {
        const Py_UNICODE c1 = s1[idx];
        const Py_UNICODE c2 = s2[idx];

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;
    }

    /* the common prefix is identical: decide on the lengths */
    return (len1 < len2) ? -1 : (len1 != len2);
}

#endif

7309
/* Compare two objects that must both be unicode objects.

   Returns the result of unicode_compare() when both arguments pass
   PyUnicode_Check().  Otherwise sets TypeError and returns -1. */
int
PyUnicode_Compare(PyObject *left, PyObject *right)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't compare %.100s and %.100s",
                     left->ob_type->tp_name,
                     right->ob_type->tp_name);
        return -1;
    }

    return unicode_compare((PyUnicodeObject *)left,
                           (PyUnicodeObject *)right);
}

7322 7323 7324 7325 7326 7327 7328 7329 7330
/* Compare a unicode object with a NUL-terminated C string.

   uni must be a unicode object (checked with assert() only).
   Returns -1, 0 or 1 depending on whether uni sorts before, equal to or
   after str.

   The index is a Py_ssize_t so that it can be compared against the
   object's size without truncation, and each byte of str is read as an
   unsigned char so that a byte >= 0x80 is not turned into a negative
   value on platforms where plain char is signed. */
int
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
{
    Py_ssize_t i;
    Py_UNICODE *id;
    assert(PyUnicode_Check(uni));
    id = PyUnicode_AS_UNICODE(uni);
    /* Compare Unicode string and source character set string */
    for (i = 0; id[i] && str[i]; i++) {
        const unsigned char byte = (unsigned char)str[i];
        if (id[i] != byte)
            return ((int)id[i] < (int)byte) ? -1 : 1;
    }
    /* This check keeps Python strings that end in '\0' from comparing equal
       to C strings identical up to that point. */
    if (PyUnicode_GET_SIZE(uni) != i || id[i])
        return 1; /* uni is longer */
    if (str[i])
        return -1; /* str is longer */
    return 0;
}

7342

Benjamin Peterson's avatar
Benjamin Peterson committed
7343
#define TEST_COND(cond)                         \
7344
    ((cond) ? Py_True : Py_False)
7345

7346 7347 7348 7349 7350
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    int result;
7351

7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
        PyObject *v;
        if (((PyUnicodeObject *) left)->length !=
            ((PyUnicodeObject *) right)->length) {
            if (op == Py_EQ) {
                Py_INCREF(Py_False);
                return Py_False;
            }
            if (op == Py_NE) {
                Py_INCREF(Py_True);
                return Py_True;
            }
        }
        if (left == right)
            result = 0;
        else
            result = unicode_compare((PyUnicodeObject *)left,
                                     (PyUnicodeObject *)right);
7370

7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396
        /* Convert the return value to a Boolean */
        switch (op) {
        case Py_EQ:
            v = TEST_COND(result == 0);
            break;
        case Py_NE:
            v = TEST_COND(result != 0);
            break;
        case Py_LE:
            v = TEST_COND(result <= 0);
            break;
        case Py_GE:
            v = TEST_COND(result >= 0);
            break;
        case Py_LT:
            v = TEST_COND(result == -1);
            break;
        case Py_GT:
            v = TEST_COND(result == 1);
            break;
        default:
            PyErr_BadArgument();
            return NULL;
        }
        Py_INCREF(v);
        return v;
7397
    }
7398

7399 7400
    Py_INCREF(Py_NotImplemented);
    return Py_NotImplemented;
7401 7402
}

7403
/* Implementation of "element in container".

   Both arguments are coerced with PyUnicode_FromObject().  Returns the
   value reported by stringlib_contains_obj(), or -1 with an error set
   when either coercion fails. */
int
PyUnicode_Contains(PyObject *container, PyObject *element)
{
    PyObject *needle;
    PyObject *haystack;
    int found;

    /* Coerce the element first; a failure here gets a dedicated message. */
    needle = PyUnicode_FromObject(element);
    if (needle == NULL) {
        PyErr_Format(PyExc_TypeError,
                     "'in <string>' requires string as left operand, not %s",
                     element->ob_type->tp_name);
        return -1;
    }

    haystack = PyUnicode_FromObject(container);
    if (haystack == NULL) {
        Py_DECREF(needle);
        return -1;
    }

    found = stringlib_contains_obj(haystack, needle);

    Py_DECREF(haystack);
    Py_DECREF(needle);
    return found;
}

7432 7433 7434
/* Concat to string or Unicode object giving a new Unicode object.

   Both arguments are coerced with PyUnicode_FromObject().  Returns a new
   reference to the concatenation, or NULL with an error set when a
   coercion fails, the combined length does not fit in a Py_ssize_t
   (OverflowError), or the result cannot be allocated. */
PyObject *PyUnicode_Concat(PyObject *left,
                           PyObject *right)
{
    PyUnicodeObject *u = NULL, *v = NULL, *w;

    /* Coerce the two arguments */
    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (u == NULL)
        goto onError;
    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (v == NULL)
        goto onError;

    /* Shortcuts: concatenating with the empty singleton returns the other
       operand, handing our reference to it over to the caller. */
    if (v == unicode_empty) {
        Py_DECREF(v);
        return (PyObject *)u;
    }
    if (u == unicode_empty) {
        Py_DECREF(u);
        return (PyObject *)v;
    }

    /* Reject a combined length that would overflow Py_ssize_t before
       adding; the signed addition itself would be undefined behaviour. */
    if (u->length > PY_SSIZE_T_MAX - v->length) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        goto onError;
    }

    /* Concat the two Unicode strings */
    w = _PyUnicode_New(u->length + v->length);
    if (w == NULL)
        goto onError;
    Py_UNICODE_COPY(w->str, u->str, u->length);
    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);

    Py_DECREF(u);
    Py_DECREF(v);
    return (PyObject *)w;

  onError:
    Py_XDECREF(u);
    Py_XDECREF(v);
    return NULL;
}

7474 7475 7476
/* Replace *pleft with the concatenation of *pleft and right.

   The reference previously held in *pleft is always released.  *pleft is
   set to NULL when right is NULL or *pleft is not a unicode object, and
   to whatever PyUnicode_Concat() returns otherwise.  Nothing happens
   when *pleft is already NULL. */
void
PyUnicode_Append(PyObject **pleft, PyObject *right)
{
    PyObject *old = *pleft;
    PyObject *joined;

    if (old == NULL)
        return;

    if (right != NULL && PyUnicode_Check(old))
        joined = PyUnicode_Concat(old, right);
    else
        joined = NULL;

    Py_DECREF(old);
    *pleft = joined;
}

/* Same as PyUnicode_Append(), but additionally releases the caller's
   reference to right (which may be NULL). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);

    /* right is consumed regardless of whether the append succeeded. */
    Py_XDECREF(right);
}

7497
PyDoc_STRVAR(count__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7498
             "S.count(sub[, start[, end]]) -> int\n\
7499
\n\
7500
Return the number of non-overlapping occurrences of substring sub in\n\
7501
string S[start:end].  Optional arguments start and end are\n\
7502
interpreted as in slice notation.");
7503 7504 7505 7506 7507

/* S.count(sub[, start[, end]]): count occurrences of sub inside the
   slice of self delimited by start and end.

   Returns a new int object, or NULL when argument parsing fails. */
static PyObject *
unicode_count(PyUnicodeObject *self, PyObject *args)
{
    PyUnicodeObject *substring;
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    Py_ssize_t occurrences;

    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
                                            &start, &end)) {
        return NULL;
    }

    ADJUST_INDICES(start, end, self->length);

    occurrences = stringlib_count(self->str + start, end - start,
                                  substring->str, substring->length,
                                  PY_SSIZE_T_MAX);

    /* The parser handed us a reference to substring; drop it now that
       the search is done. */
    Py_DECREF(substring);

    return PyLong_FromSsize_t(occurrences);
}

7528
PyDoc_STRVAR(encode__doc__,
7529
             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
7530
\n\
7531 7532
Encode S using the codec registered for encoding. Default encoding\n\
is 'utf-8'. errors may be given to set a different error\n\
7533
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7534 7535 7536
a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
'xmlcharrefreplace' as well as any other name registered with\n\
codecs.register_error that can handle UnicodeEncodeErrors.");
7537 7538

static PyObject *
7539
unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7540
{
7541
    static char *kwlist[] = {"encoding", "errors", 0};
7542 7543
    char *encoding = NULL;
    char *errors = NULL;
7544

7545 7546
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
                                     kwlist, &encoding, &errors))
7547
        return NULL;
7548
    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7549 7550
}

7551
PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7552
             "S.expandtabs([tabsize]) -> str\n\
7553 7554
\n\
Return a copy of S where all tab characters are expanded using spaces.\n\
7555
If tabsize is not given, a tab size of 8 characters is assumed.");
7556 7557 7558 7559 7560 7561 7562

/* S.expandtabs([tabsize]): return a new string in which every tab is
   replaced by spaces up to the next multiple of tabsize, counting
   columns from the most recent \n or \r.  With tabsize <= 0 tabs are
   simply dropped.

   Works in two passes: the first computes the output length with
   overflow checks, the second fills the freshly allocated string.
   Returns NULL with OverflowError set if the result would be too long. */
static PyObject*
unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
{
    const Py_UNICODE *src;
    const Py_UNICODE *src_end;
    Py_UNICODE *dst;
    Py_UNICODE *dst_end;
    Py_ssize_t done;    /* chars up to and including most recent \n or \r */
    Py_ssize_t column;  /* chars since most recent \n or \r */
    Py_ssize_t pad;     /* spaces produced by the current tab */
    PyUnicodeObject *u;
    int tabsize = 8;

    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
        return NULL;

    /* First pass: determine size of output string */
    done = 0;
    column = 0;
    src_end = self->str + self->length; /* end of input */
    for (src = self->str; src < src_end; src++) {
        if (*src != '\t') {
            if (column > PY_SSIZE_T_MAX - 1)
                goto too_long;
            column++;
            if (*src == '\n' || *src == '\r') {
                if (done > PY_SSIZE_T_MAX - column)
                    goto too_long;
                done += column;
                column = 0;
            }
        }
        else if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (column > PY_SSIZE_T_MAX - pad)
                goto too_long;
            column += pad;
        }
    }

    if (done > PY_SSIZE_T_MAX - column)
        goto too_long;

    /* Second pass: create output string and fill it */
    u = _PyUnicode_New(done + column);
    if (u == NULL)
        return NULL;

    column = 0; /* same as in first pass */
    dst = u->str; /* next output char */
    dst_end = u->str + u->length; /* end of output */

    for (src = self->str; src < src_end; src++) {
        if (*src != '\t') {
            if (dst >= dst_end)
                goto release;
            *dst++ = *src;
            column++;
            if (*src == '\n' || *src == '\r')
                column = 0;
        }
        else if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            column += pad;
            while (pad--) {
                if (dst >= dst_end)
                    goto release;
                *dst++ = ' ';
            }
        }
    }

    return (PyObject*) u;

  release:
    Py_DECREF(u);
  too_long:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}

7638
PyDoc_STRVAR(find__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7639
             "S.find(sub[, start[, end]]) -> int\n\
7640 7641
\n\
Return the lowest index in S where substring sub is found,\n\
7642
such that sub is contained within S[start:end].  Optional\n\
7643 7644
arguments start and end are interpreted as in slice notation.\n\
\n\
7645
Return -1 on failure.");
7646 7647 7648 7649

/* S.find(sub[, start[, end]]): search self for sub with
   stringlib_find_slice() and return its result as a new int object.

   Returns NULL when argument parsing fails. */
static PyObject *
unicode_find(PyUnicodeObject *self, PyObject *args)
{
    PyUnicodeObject *substring;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t position;

    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
                                            &start, &end)) {
        return NULL;
    }

    position = stringlib_find_slice(PyUnicode_AS_UNICODE(self),
                                    PyUnicode_GET_SIZE(self),
                                    PyUnicode_AS_UNICODE(substring),
                                    PyUnicode_GET_SIZE(substring),
                                    start, end);

    /* Done with the reference obtained from the argument parser. */
    Py_DECREF(substring);

    return PyLong_FromSsize_t(position);
}

static PyObject *
Martin v. Löwis's avatar
Martin v. Löwis committed
7671
unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7672 7673 7674 7675 7676 7677 7678 7679 7680
{
    if (index < 0 || index >= self->length) {
        PyErr_SetString(PyExc_IndexError, "string index out of range");
        return NULL;
    }

    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
}

7681 7682
/* Believe it or not, this produces the same value for ASCII strings
   as string_hash(). */
7683
static Py_hash_t
7684
unicode_hash(PyUnicodeObject *self)
7685
{
7686 7687
    Py_ssize_t len;
    Py_UNICODE *p;
7688
    Py_hash_t x;
7689

7690
#ifdef Py_DEBUG
7691
    assert(_Py_HashSecret_Initialized);
7692
#endif
7693 7694
    if (self->hash != -1)
        return self->hash;
7695
    len = Py_SIZE(self);
7696 7697 7698 7699 7700 7701 7702 7703
    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        self->hash = 0;
        return 0;
    }
7704
    p = self->str;
7705 7706
    x = _Py_HashSecret.prefix;
    x ^= *p << 7;
7707
    while (--len >= 0)
7708
        x = (_PyHASH_MULTIPLIER*x) ^ *p++;
7709
    x ^= Py_SIZE(self);
7710
    x ^= _Py_HashSecret.suffix;
7711 7712 7713 7714
    if (x == -1)
        x = -2;
    self->hash = x;
    return x;
7715 7716
}

7717
PyDoc_STRVAR(index__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7718
             "S.index(sub[, start[, end]]) -> int\n\
7719
\n\
7720
Like S.find() but raise ValueError when the substring is not found.");
7721 7722 7723 7724

/* S.index(sub[, start[, end]]): like unicode_find(), but a negative
   search result raises ValueError instead of being returned.

   Returns a new int object, or NULL with an error set. */
static PyObject *
unicode_index(PyUnicodeObject *self, PyObject *args)
{
    PyUnicodeObject *substring;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t position;

    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
                                            &start, &end)) {
        return NULL;
    }

    position = stringlib_find_slice(PyUnicode_AS_UNICODE(self),
                                    PyUnicode_GET_SIZE(self),
                                    PyUnicode_AS_UNICODE(substring),
                                    PyUnicode_GET_SIZE(substring),
                                    start, end);

    /* Done with the reference obtained from the argument parser. */
    Py_DECREF(substring);

    if (position >= 0)
        return PyLong_FromSsize_t(position);

    PyErr_SetString(PyExc_ValueError, "substring not found");
    return NULL;
}

7750
PyDoc_STRVAR(islower__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7751
             "S.islower() -> bool\n\
7752
\n\
7753
Return True if all cased characters in S are lowercase and there is\n\
7754
at least one cased character in S, False otherwise.");
7755 7756

static PyObject*
7757
unicode_islower(PyUnicodeObject *self)
7758 7759 7760 7761 7762 7763 7764
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;
    int cased;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson's avatar
Benjamin Peterson committed
7765
        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7766

7767
    /* Special case for empty strings */
7768
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7769
        return PyBool_FromLong(0);
7770

7771 7772
    e = p + PyUnicode_GET_SIZE(self);
    cased = 0;
7773 7774
    while (p < e) {
        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7775

Benjamin Peterson's avatar
Benjamin Peterson committed
7776 7777 7778 7779
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
            return PyBool_FromLong(0);
        else if (!cased && Py_UNICODE_ISLOWER(ch))
            cased = 1;
7780
    }
7781
    return PyBool_FromLong(cased);
7782 7783
}

7784
PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7785
             "S.isupper() -> bool\n\
7786
\n\
7787
Return True if all cased characters in S are uppercase and there is\n\
7788
at least one cased character in S, False otherwise.");
7789 7790

static PyObject*
7791
unicode_isupper(PyUnicodeObject *self)
7792 7793 7794 7795 7796 7797 7798
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;
    int cased;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson's avatar
Benjamin Peterson committed
7799
        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7800

7801
    /* Special case for empty strings */
7802
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7803
        return PyBool_FromLong(0);
7804

7805 7806
    e = p + PyUnicode_GET_SIZE(self);
    cased = 0;
7807 7808
    while (p < e) {
        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7809

Benjamin Peterson's avatar
Benjamin Peterson committed
7810 7811 7812 7813
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
            return PyBool_FromLong(0);
        else if (!cased && Py_UNICODE_ISUPPER(ch))
            cased = 1;
7814
    }
7815
    return PyBool_FromLong(cased);
7816 7817
}

7818
PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7819
             "S.istitle() -> bool\n\
7820
\n\
7821 7822 7823 7824
Return True if S is a titlecased string and there is at least one\n\
character in S, i.e. upper- and titlecase characters may only\n\
follow uncased characters and lowercase characters only cased ones.\n\
Return False otherwise.");
7825 7826

static PyObject*
7827
unicode_istitle(PyUnicodeObject *self)
7828 7829 7830 7831 7832 7833 7834
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;
    int cased, previous_is_cased;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1)
Benjamin Peterson's avatar
Benjamin Peterson committed
7835 7836
        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
                               (Py_UNICODE_ISUPPER(*p) != 0));
7837

7838
    /* Special case for empty strings */
7839
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7840
        return PyBool_FromLong(0);
7841

7842 7843 7844
    e = p + PyUnicode_GET_SIZE(self);
    cased = 0;
    previous_is_cased = 0;
7845 7846
    while (p < e) {
        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
7847

Benjamin Peterson's avatar
Benjamin Peterson committed
7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            if (previous_is_cased)
                return PyBool_FromLong(0);
            previous_is_cased = 1;
            cased = 1;
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            if (!previous_is_cased)
                return PyBool_FromLong(0);
            previous_is_cased = 1;
            cased = 1;
        }
        else
            previous_is_cased = 0;
7862
    }
7863
    return PyBool_FromLong(cased);
7864 7865
}

7866
PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7867
             "S.isspace() -> bool\n\
7868
\n\
7869 7870
Return True if all characters in S are whitespace\n\
and there is at least one character in S, False otherwise.");
7871 7872

static PyObject*
7873
unicode_isspace(PyUnicodeObject *self)
7874 7875 7876 7877 7878 7879
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
7880 7881
        Py_UNICODE_ISSPACE(*p))
        return PyBool_FromLong(1);
7882

7883
    /* Special case for empty strings */
7884
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7885
        return PyBool_FromLong(0);
7886

7887
    e = p + PyUnicode_GET_SIZE(self);
7888 7889 7890
    while (p < e) {
        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
        if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson's avatar
Benjamin Peterson committed
7891
            return PyBool_FromLong(0);
7892
    }
7893
    return PyBool_FromLong(1);
7894 7895
}

7896
PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7897
             "S.isalpha() -> bool\n\
7898
\n\
7899
Return True if all characters in S are alphabetic\n\
7900
and there is at least one character in S, False otherwise.");
7901 7902

static PyObject*
7903
unicode_isalpha(PyUnicodeObject *self)
7904 7905 7906 7907 7908 7909
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
7910 7911
        Py_UNICODE_ISALPHA(*p))
        return PyBool_FromLong(1);
7912 7913

    /* Special case for empty strings */
7914
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7915
        return PyBool_FromLong(0);
7916 7917

    e = p + PyUnicode_GET_SIZE(self);
7918 7919
    while (p < e) {
        if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson's avatar
Benjamin Peterson committed
7920
            return PyBool_FromLong(0);
7921
    }
7922
    return PyBool_FromLong(1);
7923 7924
}

7925
PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7926
             "S.isalnum() -> bool\n\
7927
\n\
7928
Return True if all characters in S are alphanumeric\n\
7929
and there is at least one character in S, False otherwise.");
7930 7931

static PyObject*
7932
unicode_isalnum(PyUnicodeObject *self)
7933 7934 7935 7936 7937 7938
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
7939 7940
        Py_UNICODE_ISALNUM(*p))
        return PyBool_FromLong(1);
7941 7942

    /* Special case for empty strings */
7943
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7944
        return PyBool_FromLong(0);
7945 7946

    e = p + PyUnicode_GET_SIZE(self);
7947 7948 7949
    while (p < e) {
        const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
        if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson's avatar
Benjamin Peterson committed
7950
            return PyBool_FromLong(0);
7951
    }
7952
    return PyBool_FromLong(1);
7953 7954
}

7955
PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7956
             "S.isdecimal() -> bool\n\
7957
\n\
7958
Return True if there are only decimal characters in S,\n\
7959
False otherwise.");
7960 7961

static PyObject*
7962
unicode_isdecimal(PyUnicodeObject *self)
7963 7964 7965 7966 7967 7968
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
7969 7970
        Py_UNICODE_ISDECIMAL(*p))
        return PyBool_FromLong(1);
7971

7972
    /* Special case for empty strings */
7973
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
7974
        return PyBool_FromLong(0);
7975

7976
    e = p + PyUnicode_GET_SIZE(self);
7977 7978
    while (p < e) {
        if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson's avatar
Benjamin Peterson committed
7979
            return PyBool_FromLong(0);
7980
    }
7981
    return PyBool_FromLong(1);
7982 7983
}

7984
PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
7985
             "S.isdigit() -> bool\n\
7986
\n\
7987 7988
Return True if all characters in S are digits\n\
and there is at least one character in S, False otherwise.");
7989 7990

static PyObject*
7991
unicode_isdigit(PyUnicodeObject *self)
7992 7993 7994 7995 7996 7997
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
7998 7999
        Py_UNICODE_ISDIGIT(*p))
        return PyBool_FromLong(1);
8000

8001
    /* Special case for empty strings */
8002
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
8003
        return PyBool_FromLong(0);
8004

8005
    e = p + PyUnicode_GET_SIZE(self);
8006 8007
    while (p < e) {
        if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson's avatar
Benjamin Peterson committed
8008
            return PyBool_FromLong(0);
8009
    }
8010
    return PyBool_FromLong(1);
8011 8012
}

8013
PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8014
             "S.isnumeric() -> bool\n\
8015
\n\
8016
Return True if there are only numeric characters in S,\n\
8017
False otherwise.");
8018 8019

static PyObject*
8020
unicode_isnumeric(PyUnicodeObject *self)
8021 8022 8023 8024 8025 8026
{
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;

    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
Benjamin Peterson's avatar
Benjamin Peterson committed
8027 8028
        Py_UNICODE_ISNUMERIC(*p))
        return PyBool_FromLong(1);
8029

8030
    /* Special case for empty strings */
8031
    if (PyUnicode_GET_SIZE(self) == 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
8032
        return PyBool_FromLong(0);
8033

8034
    e = p + PyUnicode_GET_SIZE(self);
8035 8036
    while (p < e) {
        if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
Benjamin Peterson's avatar
Benjamin Peterson committed
8037
            return PyBool_FromLong(0);
8038
    }
8039
    return PyBool_FromLong(1);
8040 8041
}

8042 8043 8044
/* Return 1 when self spells a valid identifier and 0 otherwise.

   The empty string is not an identifier.  The first character must
   satisfy _PyUnicode_IsXidStart() or be an underscore; every following
   character must satisfy _PyUnicode_IsXidContinue(). */
int
PyUnicode_IsIdentifier(PyObject *self)
{
    const Py_UNICODE *cur = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
    const Py_UNICODE *stop;
    const Py_ssize_t size = PyUnicode_GET_SIZE(self);
    Py_UCS4 ch;

    /* Special case for empty strings */
    if (size == 0)
        return 0;

    /* PEP 3131 says that the first character must be in
       XID_Start and subsequent characters in XID_Continue,
       and for the ASCII range, the 2.x rules apply (i.e
       start with letters and underscore, continue with
       letters, digits, underscore). However, given the current
       definition of XID_Start and XID_Continue, it is sufficient
       to check just for these, except that _ must be allowed
       as starting an identifier.  */
    stop = cur + size;

    /* _Py_UNICODE_NEXT advances cur past the character it returns. */
    ch = _Py_UNICODE_NEXT(cur, stop);
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */)
        return 0;

    while (cur < stop) {
        ch = _Py_UNICODE_NEXT(cur, stop);
        if (!_PyUnicode_IsXidContinue(ch))
            return 0;
    }
    return 1;
}

PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8073
             "S.isidentifier() -> bool\n\
8074 8075 8076 8077 8078 8079 8080 8081 8082 8083
\n\
Return True if S is a valid identifier according\n\
to the language definition.");

/* S.isidentifier(): wrap the int result of PyUnicode_IsIdentifier() in
   a bool object. */
static PyObject*
unicode_isidentifier(PyObject *self)
{
    const int is_identifier = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(is_identifier);
}

Georg Brandl's avatar
Georg Brandl committed
8084
PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8085
             "S.isprintable() -> bool\n\
Georg Brandl's avatar
Georg Brandl committed
8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101
\n\
Return True if all characters in S are considered\n\
printable in repr() or S is empty, False otherwise.");

/* S.isprintable(): true when every character satisfies
   Py_UNICODE_ISPRINTABLE.  Unlike the other predicates, the empty string
   yields True because the loop below finds no offending character. */
static PyObject*
unicode_isprintable(PyObject *self)
{
    const Py_UNICODE *cur = PyUnicode_AS_UNICODE(self);
    const Py_UNICODE *stop;
    const Py_ssize_t size = PyUnicode_GET_SIZE(self);

    /* Shortcut for single character strings */
    if (size == 1 && Py_UNICODE_ISPRINTABLE(*cur))
        return PyBool_FromLong(1);

    stop = cur + size;
    while (cur < stop) {
        /* _Py_UNICODE_NEXT advances cur past the character it returns. */
        const Py_UCS4 ch = _Py_UNICODE_NEXT(cur, stop);

        if (!Py_UNICODE_ISPRINTABLE(ch))
            return PyBool_FromLong(0);
    }
    return PyBool_FromLong(1);
}

8110
PyDoc_STRVAR(join__doc__,
8111
             "S.join(iterable) -> str\n\
8112 8113
\n\
Return a string which is the concatenation of the strings in the\n\
8114
iterable.  The separator between elements is S.");
8115 8116

/* S.join(iterable): method wrapper that forwards to PyUnicode_Join()
   with self as the separator argument. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    PyObject *joined = PyUnicode_Join(self, data);

    return joined;
}

Martin v. Löwis's avatar
Martin v. Löwis committed
8122
static Py_ssize_t
8123 8124 8125 8126 8127
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}

8128
PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8129
             "S.ljust(width[, fillchar]) -> str\n\
8130
\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
8131
Return S left-justified in a Unicode string of length width. Padding is\n\
8132
done using the specified fill character (default is a space).");
8133 8134 8135 8136

static PyObject *
unicode_ljust(PyUnicodeObject *self, PyObject *args)
{
8137
    Py_ssize_t width;
8138 8139
    Py_UNICODE fillchar = ' ';

8140
    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8141 8142
        return NULL;

8143
    if (self->length >= width && PyUnicode_CheckExact(self)) {
8144 8145 8146 8147
        Py_INCREF(self);
        return (PyObject*) self;
    }

8148
    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8149 8150
}

8151
PyDoc_STRVAR(lower__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8152
             "S.lower() -> str\n\
8153
\n\
8154
Return a copy of the string S converted to lowercase.");
8155 8156

static PyObject*
8157
unicode_lower(PyUnicodeObject *self)
8158 8159 8160 8161
{
    return fixup(self, fixlower);
}

8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

#define STRIPNAME(i) (stripformat[i]+3)

/* externally visible for str.strip(unicode) */
/* Strip from self every leading and/or trailing character (depending on
   striptype) that occurs in sepobj.  Returns self itself (new reference)
   when nothing was removed and self is an exact str, otherwise a new
   string built from the remaining slice. */
PyObject *
_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
{
    Py_UNICODE *text = PyUnicode_AS_UNICODE(self);
    Py_ssize_t textlen = PyUnicode_GET_SIZE(self);
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(sepobj);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(sepobj);
    Py_ssize_t left = 0;
    Py_ssize_t right = textlen;

    /* The bloom mask is tested first by BLOOM_MEMBER before the full
       membership check against chars. */
    BLOOM_MASK mask = make_bloom_mask(chars, nchars);

    /* Advance past leading members unless only the right side is stripped. */
    if (striptype != RIGHTSTRIP) {
        while (left < textlen && BLOOM_MEMBER(mask, text[left], chars, nchars)) {
            left++;
        }
    }

    /* Retreat past trailing members, never crossing the left boundary. */
    if (striptype != LEFTSTRIP) {
        while (right > left &&
               BLOOM_MEMBER(mask, text[right - 1], chars, nchars)) {
            right--;
        }
    }

    if (left == 0 && right == textlen && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*)self;
    }
    return PyUnicode_FromUnicode(text + left, right - left);
}


/* Whitespace variant of stripping: remove leading and/or trailing
   characters for which Py_UNICODE_ISSPACE is true, according to
   striptype (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP). */
static PyObject *
do_strip(PyUnicodeObject *self, int striptype)
{
    Py_UNICODE *text = PyUnicode_AS_UNICODE(self);
    Py_ssize_t textlen = PyUnicode_GET_SIZE(self);
    Py_ssize_t left = 0;
    Py_ssize_t right = textlen;

    if (striptype != RIGHTSTRIP) {
        while (left < textlen && Py_UNICODE_ISSPACE(text[left])) {
            left++;
        }
    }

    /* Stop at the left boundary so an all-whitespace string yields "". */
    if (striptype != LEFTSTRIP) {
        while (right > left && Py_UNICODE_ISSPACE(text[right - 1])) {
            right--;
        }
    }

    /* Nothing stripped from an exact str: hand back the same object. */
    if (left == 0 && right == textlen && PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject*)self;
    }
    return PyUnicode_FromUnicode(text + left, right - left);
}


/* Shared argument handling for strip/lstrip/rstrip when called with
   arguments.  A missing or None argument strips whitespace; a str
   argument strips its characters; anything else raises TypeError. */
static PyObject *
do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
{
    PyObject *chars = NULL;

    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &chars))
        return NULL;

    if (chars == NULL || chars == Py_None)
        return do_strip(self, striptype);

    if (!PyUnicode_Check(chars)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, chars);
}


8260
PyDoc_STRVAR(strip__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8261
             "S.strip([chars]) -> str\n\
8262 8263 8264
\n\
Return a copy of the string S with leading and trailing\n\
whitespace removed.\n\
8265
If chars is given and not None, remove characters in chars instead.");
8266 8267 8268 8269

static PyObject *
unicode_strip(PyUnicodeObject *self, PyObject *args)
{
8270 8271 8272 8273
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, BOTHSTRIP); /* Common case */
    else
        return do_argstrip(self, BOTHSTRIP, args);
8274 8275 8276
}


8277
PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8278
             "S.lstrip([chars]) -> str\n\
8279 8280
\n\
Return a copy of the string S with leading whitespace removed.\n\
8281
If chars is given and not None, remove characters in chars instead.");
8282 8283 8284 8285

static PyObject *
unicode_lstrip(PyUnicodeObject *self, PyObject *args)
{
8286 8287 8288 8289
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, LEFTSTRIP); /* Common case */
    else
        return do_argstrip(self, LEFTSTRIP, args);
8290 8291 8292
}


8293
PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8294
             "S.rstrip([chars]) -> str\n\
8295
\n\
8296
Return a copy of the string S with trailing whitespace removed.\n\
8297
If chars is given and not None, remove characters in chars instead.");
8298 8299

static PyObject *
8300
unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8301
{
8302 8303 8304 8305
    if (PyTuple_GET_SIZE(args) == 0)
        return do_strip(self, RIGHTSTRIP); /* Common case */
    else
        return do_argstrip(self, RIGHTSTRIP, args);
8306 8307
}

8308

8309
static PyObject*
Martin v. Löwis's avatar
Martin v. Löwis committed
8310
unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8311 8312 8313
{
    PyUnicodeObject *u;
    Py_UNICODE *p;
Martin v. Löwis's avatar
Martin v. Löwis committed
8314
    Py_ssize_t nchars;
8315
    size_t nbytes;
8316

8317 8318 8319 8320
    if (len < 1) {
        Py_INCREF(unicode_empty);
        return (PyObject *)unicode_empty;
    }
8321

8322
    if (len == 1 && PyUnicode_CheckExact(str)) {
8323 8324 8325 8326
        /* no repeat, return original string */
        Py_INCREF(str);
        return (PyObject*) str;
    }
8327 8328 8329 8330 8331

    /* ensure # of chars needed doesn't overflow int and # of bytes
     * needed doesn't overflow size_t
     */
    nchars = len * str->length;
8332
    if (nchars / len != str->length) {
8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343
        PyErr_SetString(PyExc_OverflowError,
                        "repeated string is too long");
        return NULL;
    }
    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
        PyErr_SetString(PyExc_OverflowError,
                        "repeated string is too long");
        return NULL;
    }
    u = _PyUnicode_New(nchars);
8344 8345 8346 8347 8348
    if (!u)
        return NULL;

    p = u->str;

8349
    if (str->length == 1) {
8350 8351
        Py_UNICODE_FILL(p, str->str[0], len);
    } else {
8352 8353
        Py_ssize_t done = str->length; /* number of characters copied this far */
        Py_UNICODE_COPY(p, str->str, str->length);
Benjamin Peterson's avatar
Benjamin Peterson committed
8354
        while (done < nchars) {
Christian Heimes's avatar
Christian Heimes committed
8355
            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8356 8357
            Py_UNICODE_COPY(p+done, p, n);
            done += n;
Benjamin Peterson's avatar
Benjamin Peterson committed
8358
        }
8359 8360 8361 8362 8363 8364
    }

    return (PyObject*) u;
}

/* Public API: replace up to maxcount occurrences of subobj in obj with
   replobj.  All three arguments are first converted with
   PyUnicode_FromObject(); returns NULL if any conversion or the
   replacement itself fails. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *text = NULL;
    PyObject *old = NULL;
    PyObject *repl = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        goto done;
    old = PyUnicode_FromObject(subobj);
    if (old == NULL)
        goto done;
    repl = PyUnicode_FromObject(replobj);
    if (repl == NULL)
        goto done;

    result = replace((PyUnicodeObject *)text,
                     (PyUnicodeObject *)old,
                     (PyUnicodeObject *)repl,
                     maxcount);

  done:
    /* Release whichever converted objects were obtained. */
    Py_XDECREF(text);
    Py_XDECREF(old);
    Py_XDECREF(repl);
    return result;
}

8398
PyDoc_STRVAR(replace__doc__,
8399
             "S.replace(old, new[, count]) -> str\n\
8400 8401
\n\
Return a copy of S with all occurrences of substring\n\
Georg Brandl's avatar
Georg Brandl committed
8402 8403
old replaced by new.  If the optional argument count is\n\
given, only the first count occurrences are replaced.");
8404 8405 8406 8407 8408 8409

static PyObject*
unicode_replace(PyUnicodeObject *self, PyObject *args)
{
    PyUnicodeObject *str1;
    PyUnicodeObject *str2;
Martin v. Löwis's avatar
Martin v. Löwis committed
8410
    Py_ssize_t maxcount = -1;
8411 8412
    PyObject *result;

Martin v. Löwis's avatar
Martin v. Löwis committed
8413
    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8414 8415 8416
        return NULL;
    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
    if (str1 == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
8417
        return NULL;
8418
    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwald's avatar
Walter Dörwald committed
8419
    if (str2 == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
8420 8421
        Py_DECREF(str1);
        return NULL;
Walter Dörwald's avatar
Walter Dörwald committed
8422
    }
8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433

    result = replace(self, str1, str2, maxcount);

    Py_DECREF(str1);
    Py_DECREF(str2);
    return result;
}

/* repr(str): build a quoted representation of the string.

   Single quotes are used unless the string contains a single quote and no
   double quote.  The chosen quote and backslash are backslash-escaped;
   tab, newline and carriage return become \t, \n, \r; other non-printable
   characters become \xhh, \uxxxx or \U00xxxxxx; everything else is copied.

   Returns a new reference, or NULL with an exception set (OverflowError
   if the worst-case output size does not fit in Py_ssize_t). */
static
PyObject *unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_UNICODE *p;
    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);

    /* XXX(nnorwitz): rather than over-allocating, it would be
       better to choose a different scheme.  Perhaps scan the
       first N-chars of the string and allocate based on that size.
    */
    /* Initial allocation is based on the longest-possible unichr
       escape.

       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       unichr, so in this case it's the longest unichr escape. In
       narrow (UTF-16) builds this is five chars per source unichr
       since there are two unichrs in the surrogate pair, so in narrow
       (UTF-16) builds it's not the longest unichr escape.

       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       so in the narrow (UTF-16) build case it's the longest unichr
       escape.
    */
#ifdef Py_UNICODE_WIDE
    const Py_ssize_t expandsize = 10;
#else
    const Py_ssize_t expandsize = 6;
#endif

    /* Refuse sizes whose worst-case expansion (plus 2 quotes and 1 extra
       slot) would overflow Py_ssize_t; otherwise the buffer could be
       under-allocated and then overrun by the writes below. */
    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }

    repr = PyUnicode_FromUnicode(NULL,
                                 2 /* quotes */
                                 + expandsize*size
                                 + 1);
    if (repr == NULL)
        return NULL;

    p = PyUnicode_AS_UNICODE(repr);

    /* Add quote */
    *p++ = (findchar(s, size, '\'') &&
            !findchar(s, size, '"')) ? '"' : '\'';
    while (size-- > 0) {
        Py_UNICODE ch = *s++;

        /* Escape quotes and backslashes */
        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
            *p++ = '\\';
            *p++ = ch;
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (ch == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (ch == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            *p++ = '\\';
            *p++ = 'x';
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            *p++ = ch;
        }

        /* Non-ASCII characters */
        else {
            Py_UCS4 ucs = ch;

#ifndef Py_UNICODE_WIDE
            Py_UNICODE ch2 = 0;
            /* Get code point from surrogate pair */
            if (size > 0) {
                ch2 = *s;
                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
                    && ch2 <= 0xDFFF) {
                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
                        + 0x00010000;
                    /* The low surrogate is consumed together with ch. */
                    s++;
                    size--;
                }
            }
#endif
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
                /* Map 8-bit characters to '\xhh' */
                if (ucs <= 0xff) {
                    *p++ = '\\';
                    *p++ = 'x';
                    *p++ = hexdigits[(ch >> 4) & 0x000F];
                    *p++ = hexdigits[ch & 0x000F];
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ucs >= 0x10000) {
                    *p++ = '\\';
                    *p++ = 'U';
                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
                    *p++ = hexdigits[ucs & 0x0000000F];
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    *p++ = '\\';
                    *p++ = 'u';
                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
                    *p++ = hexdigits[ucs & 0x000F];
                }
            }
            /* Copy characters as-is */
            else {
                *p++ = ch;
#ifndef Py_UNICODE_WIDE
                if (ucs >= 0x10000)
                    *p++ = ch2;
#endif
            }
        }
    }
    /* Add quote */
    *p++ = PyUnicode_AS_UNICODE(repr)[0];

    *p = '\0';
    /* Trim the over-allocated buffer to what was actually written.  A
       failed resize leaves an exception set, so it must be reported
       instead of returning a result alongside a pending error. */
    if (PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr)) < 0) {
        Py_DECREF(repr);
        return NULL;
    }
    return repr;
}

8580
PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8581
             "S.rfind(sub[, start[, end]]) -> int\n\
8582 8583
\n\
Return the highest index in S where substring sub is found,\n\
8584
such that sub is contained within S[start:end].  Optional\n\
8585 8586
arguments start and end are interpreted as in slice notation.\n\
\n\
8587
Return -1 on failure.");
8588 8589 8590 8591

static PyObject *
unicode_rfind(PyUnicodeObject *self, PyObject *args)
{
8592
    PyUnicodeObject *substring;
8593 8594
    Py_ssize_t start;
    Py_ssize_t end;
8595
    Py_ssize_t result;
8596

8597 8598
    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
                                            &start, &end))
8599
        return NULL;
8600

8601 8602 8603 8604 8605
    result = stringlib_rfind_slice(
        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
        start, end
        );
8606 8607

    Py_DECREF(substring);
8608

8609
    return PyLong_FromSsize_t(result);
8610 8611
}

8612
PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8613
             "S.rindex(sub[, start[, end]]) -> int\n\
8614
\n\
8615
Like S.rfind() but raise ValueError when the substring is not found.");
8616 8617 8618 8619

static PyObject *
unicode_rindex(PyUnicodeObject *self, PyObject *args)
{
8620
    PyUnicodeObject *substring;
8621 8622
    Py_ssize_t start;
    Py_ssize_t end;
8623
    Py_ssize_t result;
8624

8625 8626
    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
                                            &start, &end))
8627
        return NULL;
8628

8629 8630 8631 8632 8633
    result = stringlib_rfind_slice(
        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
        start, end
        );
8634 8635

    Py_DECREF(substring);
8636

8637 8638 8639 8640
    if (result < 0) {
        PyErr_SetString(PyExc_ValueError, "substring not found");
        return NULL;
    }
8641
    return PyLong_FromSsize_t(result);
8642 8643
}

8644
PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8645
             "S.rjust(width[, fillchar]) -> str\n\
8646
\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
8647
Return S right-justified in a string of length width. Padding is\n\
8648
done using the specified fill character (default is a space).");
8649 8650 8651 8652

static PyObject *
unicode_rjust(PyUnicodeObject *self, PyObject *args)
{
8653
    Py_ssize_t width;
8654 8655
    Py_UNICODE fillchar = ' ';

8656
    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8657 8658
        return NULL;

8659
    if (self->length >= width && PyUnicode_CheckExact(self)) {
8660 8661 8662 8663
        Py_INCREF(self);
        return (PyObject*) self;
    }

8664
    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8665 8666 8667
}

/* Public API: split s on sep (NULL sep is passed through to split()
   as-is), performing at most maxsplit splits.  Both objects are
   converted with PyUnicode_FromObject() first. */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *result;

    str_obj = PyUnicode_FromObject(s);
    if (str_obj == NULL)
        return NULL;

    if (sep != NULL) {
        sep_obj = PyUnicode_FromObject(sep);
        if (sep_obj == NULL) {
            Py_DECREF(str_obj);
            return NULL;
        }
    }

    result = split((PyUnicodeObject *)str_obj,
                   (PyUnicodeObject *)sep_obj,
                   maxsplit);

    Py_DECREF(str_obj);
    /* sep_obj stays NULL when no separator was supplied. */
    Py_XDECREF(sep_obj);
    return result;
}

8691
PyDoc_STRVAR(split__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8692
             "S.split([sep[, maxsplit]]) -> list of strings\n\
8693 8694 8695
\n\
Return a list of the words in S, using sep as the\n\
delimiter string.  If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti's avatar
Alexandre Vassalotti committed
8696
splits are done. If sep is not specified or is None, any\n\
8697 8698
whitespace string is a separator and empty strings are\n\
removed from the result.");
8699 8700 8701 8702 8703

static PyObject*
unicode_split(PyUnicodeObject *self, PyObject *args)
{
    PyObject *substring = Py_None;
Martin v. Löwis's avatar
Martin v. Löwis committed
8704
    Py_ssize_t maxcount = -1;
8705

Martin v. Löwis's avatar
Martin v. Löwis committed
8706
    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8707 8708 8709
        return NULL;

    if (substring == Py_None)
Benjamin Peterson's avatar
Benjamin Peterson committed
8710
        return split(self, NULL, maxcount);
8711
    else if (PyUnicode_Check(substring))
Benjamin Peterson's avatar
Benjamin Peterson committed
8712
        return split(self, (PyUnicodeObject *)substring, maxcount);
8713
    else
Benjamin Peterson's avatar
Benjamin Peterson committed
8714
        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8715 8716
}

8717 8718 8719 8720 8721 8722 8723 8724 8725
/* Public API: partition str_in around sep_in using stringlib_partition().
   Both arguments are converted with PyUnicode_FromObject(); returns NULL
   if either conversion fails. */
PyObject *
PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
{
    PyObject *str_obj;
    PyObject *sep_obj;
    PyObject *out = NULL;

    str_obj = PyUnicode_FromObject(str_in);
    if (str_obj == NULL)
        return NULL;

    sep_obj = PyUnicode_FromObject(sep_in);
    if (sep_obj != NULL) {
        out = stringlib_partition(
            str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
            sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
            );
        Py_DECREF(sep_obj);
    }

    /* out is still NULL here if the separator conversion failed. */
    Py_DECREF(str_obj);
    return out;
}


/* Public API: partition str_in around sep_in using
   stringlib_rpartition().  Both arguments are converted with
   PyUnicode_FromObject(); returns NULL if either conversion fails. */
PyObject *
PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
{
    PyObject *str_obj;
    PyObject *sep_obj;
    PyObject *out = NULL;

    str_obj = PyUnicode_FromObject(str_in);
    if (str_obj == NULL)
        return NULL;

    sep_obj = PyUnicode_FromObject(sep_in);
    if (sep_obj != NULL) {
        out = stringlib_rpartition(
            str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
            sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
            );
        Py_DECREF(sep_obj);
    }

    /* out is still NULL here if the separator conversion failed. */
    Py_DECREF(str_obj);
    return out;
}

PyDoc_STRVAR(partition__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8773
             "S.partition(sep) -> (head, sep, tail)\n\
8774
\n\
8775
Search for the separator sep in S, and return the part before it,\n\
8776
the separator itself, and the part after it.  If the separator is not\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
8777
found, return S and two empty strings.");
8778 8779 8780 8781 8782 8783 8784 8785

static PyObject*
unicode_partition(PyUnicodeObject *self, PyObject *separator)
{
    return PyUnicode_Partition((PyObject *)self, separator);
}

PyDoc_STRVAR(rpartition__doc__,
8786
             "S.rpartition(sep) -> (head, sep, tail)\n\
8787
\n\
8788
Search for the separator sep in S, starting at the end of S, and return\n\
8789
the part before it, the separator itself, and the part after it.  If the\n\
Benjamin Peterson's avatar
Benjamin Peterson committed
8790
separator is not found, return two empty strings and S.");
8791 8792 8793 8794 8795 8796 8797

static PyObject*
unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
{
    return PyUnicode_RPartition((PyObject *)self, separator);
}

8798
/* Public API: like PyUnicode_Split() but delegates to rsplit().  A NULL
   sep is passed through as-is; both objects are converted with
   PyUnicode_FromObject() first. */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *result;

    str_obj = PyUnicode_FromObject(s);
    if (str_obj == NULL)
        return NULL;

    if (sep != NULL) {
        sep_obj = PyUnicode_FromObject(sep);
        if (sep_obj == NULL) {
            Py_DECREF(str_obj);
            return NULL;
        }
    }

    result = rsplit((PyUnicodeObject *)str_obj,
                    (PyUnicodeObject *)sep_obj,
                    maxsplit);

    Py_DECREF(str_obj);
    /* sep_obj stays NULL when no separator was supplied. */
    Py_XDECREF(sep_obj);
    return result;
}

PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8823
             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834
\n\
Return a list of the words in S, using sep as the\n\
delimiter string, starting at the end of the string and\n\
working to the front.  If maxsplit is given, at most maxsplit\n\
splits are done. If sep is not specified, any whitespace string\n\
is a separator.");

static PyObject*
unicode_rsplit(PyUnicodeObject *self, PyObject *args)
{
    PyObject *substring = Py_None;
Martin v. Löwis's avatar
Martin v. Löwis committed
8835
    Py_ssize_t maxcount = -1;
8836

Martin v. Löwis's avatar
Martin v. Löwis committed
8837
    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8838 8839 8840
        return NULL;

    if (substring == Py_None)
Benjamin Peterson's avatar
Benjamin Peterson committed
8841
        return rsplit(self, NULL, maxcount);
8842
    else if (PyUnicode_Check(substring))
Benjamin Peterson's avatar
Benjamin Peterson committed
8843
        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8844
    else
Benjamin Peterson's avatar
Benjamin Peterson committed
8845
        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8846 8847
}

8848
PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8849
             "S.splitlines([keepends]) -> list of strings\n\
8850 8851
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum's avatar
Guido van Rossum committed
8852
Line breaks are not included in the resulting list unless keepends\n\
8853
is given and true.");
8854 8855 8856 8857

static PyObject*
unicode_splitlines(PyUnicodeObject *self, PyObject *args)
{
Guido van Rossum's avatar
Guido van Rossum committed
8858
    int keepends = 0;
8859

Guido van Rossum's avatar
Guido van Rossum committed
8860
    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8861 8862
        return NULL;

Guido van Rossum's avatar
Guido van Rossum committed
8863
    return PyUnicode_Splitlines((PyObject *)self, keepends);
8864 8865 8866
}

static
8867
PyObject *unicode_str(PyObject *self)
8868
{
8869 8870 8871 8872 8873 8874 8875
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return self;
    } else
        /* Subtype -- return genuine unicode string with the same value. */
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
                                     PyUnicode_GET_SIZE(self));
8876 8877
}

8878
PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8879
             "S.swapcase() -> str\n\
8880 8881
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
8882
and vice versa.");
8883 8884

static PyObject*
8885
unicode_swapcase(PyUnicodeObject *self)
8886 8887 8888 8889
{
    return fixup(self, fixswapcase);
}

8890
PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8891
             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8892
\n\
8893 8894 8895
Return a translation table usable for str.translate().\n\
If there is only one argument, it must be a dictionary mapping Unicode\n\
ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8896
Character keys will be then converted to ordinals.\n\
8897 8898 8899 8900
If there are two arguments, they must be strings of equal length, and\n\
in the resulting dictionary, each character in x will be mapped to the\n\
character at the same position in y. If there is a third argument, it\n\
must be a string, whose characters will be mapped to None in the result.");
8901 8902

static PyObject*
8903
unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8904
{
8905 8906
    PyObject *x, *y = NULL, *z = NULL;
    PyObject *new = NULL, *key, *value;
8907
    Py_ssize_t i = 0;
8908
    int res;
8909

8910
    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8911
        return NULL;
8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929
    new = PyDict_New();
    if (!new)
        return NULL;
    if (y != NULL) {
        /* x must be a string too, of equal length */
        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        if (PyUnicode_GET_SIZE(x) != ylen) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }
        /* create entries for translating chars in x to those in y */
        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8930
            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8931 8932
            if (!key)
                goto err;
8933
            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8934 8935
            if (!value) {
                Py_DECREF(key);
8936
                goto err;
8937
            }
8938 8939 8940
            res = PyDict_SetItem(new, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
8941 8942
            if (res < 0)
                goto err;
8943 8944 8945 8946
        }
        /* create entries for deleting chars in z */
        if (z != NULL) {
            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8947
                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8948 8949 8950 8951 8952 8953 8954 8955 8956 8957
                if (!key)
                    goto err;
                res = PyDict_SetItem(new, key, Py_None);
                Py_DECREF(key);
                if (res < 0)
                    goto err;
            }
        }
    } else {
        /* x must be a dict */
8958
        if (!PyDict_CheckExact(x)) {
8959 8960
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
8961 8962
            goto err;
        }
8963 8964 8965 8966 8967 8968 8969 8970 8971 8972
        /* copy entries into the new dict, converting string keys to int keys */
        while (PyDict_Next(x, &i, &key, &value)) {
            if (PyUnicode_Check(key)) {
                /* convert string keys to integer keys */
                PyObject *newkey;
                if (PyUnicode_GET_SIZE(key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto err;
                }
8973
                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8974 8975 8976 8977 8978 8979
                if (!newkey)
                    goto err;
                res = PyDict_SetItem(new, newkey, value);
                Py_DECREF(newkey);
                if (res < 0)
                    goto err;
8980
            } else if (PyLong_Check(key)) {
8981 8982 8983 8984 8985 8986 8987 8988 8989
                /* just keep integer keys */
                if (PyDict_SetItem(new, key, value) < 0)
                    goto err;
            } else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto err;
            }
        }
8990
    }
8991
    return new;
8992
  err:
8993
    Py_DECREF(new);
8994
    return NULL;
8995 8996
}

8997
PyDoc_STRVAR(translate__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
8998
             "S.translate(table) -> str\n\
8999 9000 9001
\n\
Return a copy of the string S, where all characters have been mapped\n\
through the given translation table, which must be a mapping of\n\
9002
Unicode ordinals to Unicode ordinals, strings, or None.\n\
9003 9004 9005 9006 9007 9008 9009 9010 9011
Unmapped characters are left untouched. Characters mapped to None\n\
are deleted.");

/* str.translate(table): hand the string's character buffer and length to
   PyUnicode_TranslateCharmap() together with `table`, using "ignore" as
   the errors argument.  Returns whatever that call returns. */
static PyObject *
unicode_translate(PyUnicodeObject *self, PyObject *table)
{
    Py_UNICODE *chars = self->str;
    Py_ssize_t nchars = self->length;

    return PyUnicode_TranslateCharmap(chars, nchars, table, "ignore");
}

9012
PyDoc_STRVAR(upper__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9013
             "S.upper() -> str\n\
9014
\n\
9015
Return a copy of S converted to uppercase.");
9016 9017

static PyObject*
9018
unicode_upper(PyUnicodeObject *self)
9019 9020 9021 9022
{
    return fixup(self, fixupper);
}

9023
PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9024
             "S.zfill(width) -> str\n\
9025
\n\
9026 9027
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.");
9028 9029 9030 9031

/* str.zfill(width): left-pad self with '0' characters up to `width`.

   args is the argument tuple; it must hold a single index-sized integer.
   When self is already at least `width` long, self is returned (with a new
   reference) if it is an exact str, otherwise a str built from its buffer.
   If the character that follows the padding is '+' or '-', it is moved to
   the very front so the zeros come after the sign.

   Returns a new reference, or NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    Py_ssize_t width;
    Py_ssize_t nzeros;
    PyUnicodeObject *padded;
    Py_UNICODE lead;

    if (!PyArg_ParseTuple(args, "n:zfill", &width))
        return NULL;

    if (width <= self->length) {
        /* Nothing to pad. */
        if (!PyUnicode_CheckExact(self)) {
            return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
                                         PyUnicode_GET_SIZE(self));
        }
        Py_INCREF(self);
        return (PyObject *)self;
    }

    nzeros = width - self->length;
    padded = pad(self, nzeros, 0, '0');
    if (padded == NULL)
        return NULL;

    /* First character of the original text, now sitting after the zeros. */
    lead = padded->str[nzeros];
    if (lead == '+' || lead == '-') {
        /* swap the sign with the first padding zero */
        padded->str[0] = lead;
        padded->str[nzeros] = '0';
    }

    return (PyObject *)padded;
}

#if 0
static PyObject*
9069
unicode_freelistsize(PyUnicodeObject *self)
9070
{
Christian Heimes's avatar
Christian Heimes committed
9071
    return PyLong_FromLong(numfree);
9072
}
9073 9074 9075 9076 9077 9078 9079

/* Debugging helper (compiled out by the surrounding #if 0): pass self's
   character buffer and length to PyUnicode_TransformDecimalToASCII() and
   return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
9080 9081
#endif

9082
PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9083
             "S.startswith(prefix[, start[, end]]) -> bool\n\
9084
\n\
9085 9086
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
9087 9088
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
9089 9090 9091

/* str.startswith(prefix[, start[, end]]).

   The first argument is either a single object convertible with
   PyUnicode_FromObject() or a tuple of such objects.  Each candidate is
   tested with tailmatch(..., -1); for a tuple the first candidate that
   matches yields True and candidates after it are not examined.

   Returns a new reference to True/False, or NULL with an exception set. */
static PyObject *
unicode_startswith(PyUnicodeObject *self,
                   PyObject *args)
{
    PyObject *subobj;
    PyUnicodeObject *prefix;
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    Py_ssize_t pos, count;
    int matched;

    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
        return NULL;

    if (!PyTuple_Check(subobj)) {
        /* Single candidate. */
        prefix = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
        if (prefix == NULL) {
            /* Replace a TypeError with one that names the offending type. */
            if (PyErr_ExceptionMatches(PyExc_TypeError))
                PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
                             "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
            return NULL;
        }
        matched = tailmatch(self, prefix, start, end, -1);
        Py_DECREF(prefix);
        return PyBool_FromLong(matched);
    }

    /* Tuple of candidates: stop at the first one that matches. */
    count = PyTuple_GET_SIZE(subobj);
    for (pos = 0; pos < count; pos++) {
        prefix = (PyUnicodeObject *)PyUnicode_FromObject(
            PyTuple_GET_ITEM(subobj, pos));
        if (prefix == NULL)
            return NULL;
        matched = tailmatch(self, prefix, start, end, -1);
        Py_DECREF(prefix);
        if (matched) {
            Py_RETURN_TRUE;
        }
    }
    /* no candidate matched */
    Py_RETURN_FALSE;
}


9131
PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9132
             "S.endswith(suffix[, start[, end]]) -> bool\n\
9133
\n\
9134 9135
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
9136 9137
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
9138 9139 9140

/* str.endswith(suffix[, start[, end]]).

   The first argument is either a single object convertible with
   PyUnicode_FromObject() or a tuple of such objects.  Each candidate is
   tested with tailmatch(..., +1); for a tuple the first candidate that
   matches yields True and candidates after it are not examined.

   Returns a new reference to True/False, or NULL with an exception set. */
static PyObject *
unicode_endswith(PyUnicodeObject *self,
                 PyObject *args)
{
    PyObject *subobj;
    PyUnicodeObject *suffix;
    Py_ssize_t start = 0;
    Py_ssize_t end = PY_SSIZE_T_MAX;
    Py_ssize_t pos, count;
    int matched;

    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
        return NULL;

    if (!PyTuple_Check(subobj)) {
        /* Single candidate. */
        suffix = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
        if (suffix == NULL) {
            /* Replace a TypeError with one that names the offending type. */
            if (PyErr_ExceptionMatches(PyExc_TypeError))
                PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
                             "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
            return NULL;
        }
        matched = tailmatch(self, suffix, start, end, +1);
        Py_DECREF(suffix);
        return PyBool_FromLong(matched);
    }

    /* Tuple of candidates: stop at the first one that matches. */
    count = PyTuple_GET_SIZE(subobj);
    for (pos = 0; pos < count; pos++) {
        suffix = (PyUnicodeObject *)PyUnicode_FromObject(
            PyTuple_GET_ITEM(subobj, pos));
        if (suffix == NULL)
            return NULL;
        matched = tailmatch(self, suffix, start, end, +1);
        Py_DECREF(suffix);
        if (matched) {
            Py_RETURN_TRUE;
        }
    }
    Py_RETURN_FALSE;
}

9178 9179 9180
#include "stringlib/string_format.h"

PyDoc_STRVAR(format__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9181
             "S.format(*args, **kwargs) -> str\n\
9182
\n\
9183 9184
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
9185

9186 9187 9188
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
9189 9190
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
9191

9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204
/* str.__format__(format_spec): parse the single argument with the "U"
   format unit and forward self plus the spec's character buffer and
   length to _PyUnicode_FormatAdvanced().  Returns that call's result, or
   NULL if argument parsing fails. */
static PyObject *
unicode__format__(PyObject* self, PyObject* args)
{
    PyObject *format_spec;
    Py_UNICODE *spec_chars;
    Py_ssize_t spec_len;

    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
        return NULL;

    spec_chars = PyUnicode_AS_UNICODE(format_spec);
    spec_len = PyUnicode_GET_SIZE(format_spec);
    return _PyUnicode_FormatAdvanced(self, spec_chars, spec_len);
}

9205
PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9206
             "S.__format__(format_spec) -> str\n\
9207
\n\
9208
Return a formatted version of S as described by format_spec.");
9209

Georg Brandl's avatar
Georg Brandl committed
9210 9211 9212
/* str.__sizeof__(): size of the object header plus (length + 1)
   Py_UNICODE slots, returned as a Python int. */
static PyObject *
unicode__sizeof__(PyUnicodeObject *v)
{
    size_t header_bytes = sizeof(PyUnicodeObject);
    /* one slot more than the length -- presumably the trailing NUL */
    size_t buffer_bytes = sizeof(Py_UNICODE) * (v->length + 1);

    return PyLong_FromSsize_t(header_bytes + buffer_bytes);
}

PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson's avatar
Benjamin Peterson committed
9218
             "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandl's avatar
Georg Brandl committed
9219

9220 9221
/* str.__getnewargs__(): build a 1-tuple from the string's buffer and
   length using the "(u#)" format. */
static PyObject *
unicode_getnewargs(PyUnicodeObject *v)
{
    Py_UNICODE *chars = v->str;
    Py_ssize_t nchars = v->length;

    return Py_BuildValue("(u#)", chars, nchars);
}
9225

9226
static PyMethodDef unicode_methods[] = {
9227
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl's avatar
Georg Brandl committed
9265
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
9266
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
9267
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
9268
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
9269
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
9270 9271
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandl's avatar
Georg Brandl committed
9272
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
9273 9274 9275
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif
9276

9277
#if 0
9278
    /* These methods are just used for debugging the implementation. */
9279
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9280
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
9281
#endif
9282

9283
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9284 9285
    {NULL, NULL}
};
9286

9287 9288 9289
/* nb_remainder slot: `v % w`.  When v is a str the work is done by
   PyUnicode_Format(); otherwise a new reference to NotImplemented is
   returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v))
        return PyUnicode_Format(v, w);

    Py_INCREF(Py_NotImplemented);
    return Py_NotImplemented;
}
9296

9297
static PyNumberMethods unicode_as_number = {
9298 9299 9300 9301
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
9302 9303 9304
};

static PySequenceMethods unicode_as_sequence = {
9305 9306 9307 9308 9309 9310 9311 9312
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330
};

/* mp_subscript slot: self[item].

   item may be
     - an index object: negative values are offset by the string length and
       the lookup is delegated to unicode_getitem();
     - a slice object: an empty slice gives the empty string, a full
       step-1 slice of an exact str gives self itself, other step-1 slices
       are copied in one go, and extended slices (step != 1) are gathered
       character by character.
   Anything else raises TypeError.

   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_subscript(PyUnicodeObject* self, PyObject* item)
{
    if (PyIndex_Check(item)) {
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (i == -1 && PyErr_Occurred())
            return NULL;
        if (i < 0)
            i += PyUnicode_GET_SIZE(self);
        return unicode_getitem(self, i);
    } else if (PySlice_Check(item)) {
        Py_ssize_t start, stop, step, slicelength, cur, i;
        Py_UNICODE* source_buf;
        Py_UNICODE* result_buf;
        PyObject* result;

        if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
                                 &start, &stop, &step, &slicelength) < 0) {
            return NULL;
        }

        if (slicelength <= 0) {
            return PyUnicode_FromUnicode(NULL, 0);
        } else if (start == 0 && step == 1 && slicelength == self->length &&
                   PyUnicode_CheckExact(self)) {
            Py_INCREF(self);
            return (PyObject *)self;
        } else if (step == 1) {
            return PyUnicode_FromUnicode(self->str + start, slicelength);
        } else {
            /* Allocate the result up front and fill its buffer directly.
               A string obtained from PyUnicode_FromUnicode(NULL, n) may be
               written to by its creator, so no scratch buffer and no
               second copy are needed. */
            result = PyUnicode_FromUnicode(NULL, slicelength);
            if (result == NULL)
                return NULL;

            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
            result_buf = PyUnicode_AS_UNICODE(result);

            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
                result_buf[i] = source_buf[cur];
            }
            return result;
        }
    } else {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }
}

9366
static PyMappingMethods unicode_as_mapping = {
9367 9368 9369
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
9370 9371
};

9372

9373 9374
/* Helpers for PyUnicode_Format() */

9375
static PyObject *
9376
getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9377
{
9378 9379
    Py_ssize_t argidx = *p_argidx;
    if (argidx < arglen) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9380 9381 9382 9383 9384
        (*p_argidx)++;
        if (arglen < 0)
            return args;
        else
            return PyTuple_GetItem(args, argidx);
9385 9386
    }
    PyErr_SetString(PyExc_TypeError,
Benjamin Peterson's avatar
Benjamin Peterson committed
9387
                    "not enough arguments for format string");
9388 9389
    return NULL;
}
9390

9391
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9392

9393 9394 9395 9396 9397
static PyObject *
formatfloat(PyObject *v, int flags, int prec, int type)
{
    char *p;
    PyObject *result;
9398
    double x;
9399

9400 9401
    x = PyFloat_AsDouble(v);
    if (x == -1.0 && PyErr_Occurred())
9402 9403
        return NULL;

9404
    if (prec < 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
9405
        prec = 6;
9406 9407 9408

    p = PyOS_double_to_string(x, type, prec,
                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9409 9410 9411
    if (p == NULL)
        return NULL;
    result = PyUnicode_FromStringAndSize(p, strlen(p));
9412 9413
    PyMem_Free(p);
    return result;
9414 9415
}

9416 9417 9418
/* Format an integer for a %-style conversion.

   The work is done by _PyBytes_FormatLong(), which returns a temporary
   object and reports the formatted text through a (pointer, length) pair.
   That text is turned into a str and the temporary is released.

   Returns a new reference, or NULL if _PyBytes_FormatLong() fails or the
   str cannot be created. */
static PyObject *
formatlong(PyObject *val, int flags, int prec, int type)
{
    char *digits;
    int ndigits;
    PyObject *holder;   /* temporary object returned by the formatter */
    PyObject *text;

    holder = _PyBytes_FormatLong(val, flags, prec, type, &digits, &ndigits);
    if (holder == NULL)
        return NULL;

    text = PyUnicode_FromStringAndSize(digits, ndigits);
    Py_DECREF(holder);
    return text;
}

/* Produce the code unit(s) for a %c conversion.

   v is either a str holding exactly one character (on builds without
   Py_UNICODE_WIDE a two-unit surrogate pair is accepted as well) or an
   integer in range(0x110000).  The units are stored in buf and followed
   by a '\0' on every success path.  buf is presumed to have room for at
   least 3 elements; buflen is not consulted.

   Returns the number of units stored (1 or 2).  On failure returns -1
   with an exception set: OverflowError when the integer is out of range
   (including integers too large for a C long), TypeError otherwise. */
static int
formatchar(Py_UNICODE *buf,
           size_t buflen,
           PyObject *v)
{
    /* presume that the buffer is at least 3 characters long */
    if (PyUnicode_Check(v)) {
        if (PyUnicode_GET_SIZE(v) == 1) {
            buf[0] = PyUnicode_AS_UNICODE(v)[0];
            buf[1] = '\0';
            return 1;
        }
#ifndef Py_UNICODE_WIDE
        if (PyUnicode_GET_SIZE(v) == 2) {
            /* Decode a valid surrogate pair */
            int c0 = PyUnicode_AS_UNICODE(v)[0];
            int c1 = PyUnicode_AS_UNICODE(v)[1];
            if (0xD800 <= c0 && c0 <= 0xDBFF &&
                0xDC00 <= c1 && c1 <= 0xDFFF) {
                buf[0] = c0;
                buf[1] = c1;
                buf[2] = '\0';
                return 2;
            }
        }
#endif
        goto onError;
    }
    else {
        /* Integer input truncated to a character */
        long x;
        x = PyLong_AsLong(v);
        if (x == -1 && PyErr_Occurred()) {
            /* An integer that does not fit a C long is out of range, not
               of the wrong type: report it like any other out-of-range
               value instead of masking it with the TypeError below. */
            if (PyErr_ExceptionMatches(PyExc_OverflowError)) {
                PyErr_SetString(PyExc_OverflowError,
                                "%c arg not in range(0x110000)");
                return -1;
            }
            goto onError;
        }

        if (x < 0 || x > 0x10ffff) {
            PyErr_SetString(PyExc_OverflowError,
                            "%c arg not in range(0x110000)");
            return -1;
        }

#ifndef Py_UNICODE_WIDE
        if (x > 0xffff) {
            /* Split into a surrogate pair. */
            x -= 0x10000;
            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
            /* terminate like every other success path */
            buf[2] = '\0';
            return 2;
        }
#endif
        buf[0] = (Py_UNICODE) x;
        buf[1] = '\0';
        return 1;
    }

  onError:
    PyErr_SetString(PyExc_TypeError,
                    "%c requires int or char");
    return -1;
}

9492
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The replacement list is fully parenthesized so the macro always expands
   to a single operand (an unparenthesized cast breaks, for example,
   `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)10)
9496

9497
PyObject *PyUnicode_Format(PyObject *format,
Benjamin Peterson's avatar
Benjamin Peterson committed
9498
                           PyObject *args)
9499
{
9500 9501 9502 9503 9504 9505
    Py_UNICODE *fmt, *res;
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;
9506

9507
    if (format == NULL || args == NULL) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9508 9509
        PyErr_BadInternalCall();
        return NULL;
9510 9511 9512
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
9513
        return NULL;
9514 9515
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);
9516

9517 9518 9519
    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
Benjamin Peterson's avatar
Benjamin Peterson committed
9520
        goto onError;
9521
    res = PyUnicode_AS_UNICODE(result);
9522

9523
    if (PyTuple_Check(args)) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9524 9525
        arglen = PyTuple_Size(args);
        argidx = 0;
9526 9527
    }
    else {
Benjamin Peterson's avatar
Benjamin Peterson committed
9528 9529
        arglen = -1;
        argidx = -2;
9530
    }
9531
    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson's avatar
Benjamin Peterson committed
9532
        dict = args;
9533

9534
    while (--fmtcnt >= 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9535 9536 9537 9538 9539 9540 9541 9542 9543 9544
        if (*fmt != '%') {
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
9545
        }
Benjamin Peterson's avatar
Benjamin Peterson committed
9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v = NULL;
            PyObject *temp = NULL;
            Py_UNICODE *pbuf;
            Py_UNICODE sign;
            Py_ssize_t len;
9559
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
Benjamin Peterson's avatar
Benjamin Peterson committed
9560

9561
            fmt++;
Benjamin Peterson's avatar
Benjamin Peterson committed
9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589
            if (*fmt == '(') {
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
9590
#if 0
Benjamin Peterson's avatar
Benjamin Peterson committed
9591 9592 9593 9594 9595 9596 9597
                /* keys are converted to strings using UTF-8 and
                   then looked up since Python uses strings to hold
                   variables names etc. in its namespaces and we
                   wouldn't want to break common idioms. */
                key = PyUnicode_EncodeUTF8(keystart,
                                           keylen,
                                           NULL);
9598
#else
Benjamin Peterson's avatar
Benjamin Peterson committed
9599
                key = PyUnicode_FromUnicode(keystart, keylen);
9600
#endif
Benjamin Peterson's avatar
Benjamin Peterson committed
9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614
                if (key == NULL)
                    goto onError;
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                args_owned = 1;
                arglen = -1;
                argidx = -2;
9615 9616
            }
            while (--fmtcnt >= 0) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9617 9618 9619 9620 9621 9622 9623
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
9624
                break;
Benjamin Peterson's avatar
Benjamin Peterson committed
9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650
            }
            if (c == '*') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyLong_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                width = PyLong_AsLong(v);
                if (width == -1 && PyErr_Occurred())
                    goto onError;
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
9651
                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyLong_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    prec = PyLong_AsLong(v);
                    if (prec == -1 && PyErr_Occurred())
                        goto onError;
                    if (prec < 0)
                        prec = 0;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
9683
                        c = *fmt++;
Benjamin Peterson's avatar
Benjamin Peterson committed
9684 9685
                        if (c < '0' || c > '9')
                            break;
9686
                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            if (fmtcnt < 0) {
9702
                PyErr_SetString(PyExc_ValueError,
Benjamin Peterson's avatar
Benjamin Peterson committed
9703
                                "incomplete format");
9704 9705
                goto onError;
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
9706 9707 9708 9709
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
9710
            }
Benjamin Peterson's avatar
Benjamin Peterson committed
9711 9712 9713
            sign = 0;
            fill = ' ';
            switch (c) {
9714

Benjamin Peterson's avatar
Benjamin Peterson committed
9715 9716 9717 9718 9719 9720
            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;
9721

Benjamin Peterson's avatar
Benjamin Peterson committed
9722 9723 9724
            case 's':
            case 'r':
            case 'a':
9725
                if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson's avatar
Benjamin Peterson committed
9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    if (c == 's')
                        temp = PyObject_Str(v);
                    else if (c == 'r')
                        temp = PyObject_Repr(v);
                    else
                        temp = PyObject_ASCII(v);
                    if (temp == NULL)
                        goto onError;
9738 9739
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
Benjamin Peterson's avatar
Benjamin Peterson committed
9740 9741 9742 9743
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
9744
                        goto onError;
Benjamin Peterson's avatar
Benjamin Peterson committed
9745
                    }
9746
                }
Benjamin Peterson's avatar
Benjamin Peterson committed
9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (PyLong_Check(v)) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        iobj = PyNumber_Long(v);
                    }
                    if (iobj!=NULL) {
                        if (PyLong_Check(iobj)) {
                            isnumok = 1;
9773
                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson's avatar
Benjamin Peterson committed
9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
9785
                }
Benjamin Peterson's avatar
Benjamin Peterson committed
9786 9787 9788 9789 9790 9791 9792 9793 9794
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;
9795

Benjamin Peterson's avatar
Benjamin Peterson committed
9796 9797 9798 9799 9800 9801
            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
9802 9803
                temp = formatfloat(v, flags, prec, c);
                if (!temp)
Benjamin Peterson's avatar
Benjamin Peterson committed
9804
                    goto onError;
9805 9806
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
Benjamin Peterson's avatar
Benjamin Peterson committed
9807 9808 9809 9810
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;
9811

Benjamin Peterson's avatar
Benjamin Peterson committed
9812 9813 9814 9815 9816 9817
            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;
9818

Benjamin Peterson's avatar
Benjamin Peterson committed
9819 9820 9821 9822 9823
            default:
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
9824
                             (int)c,
Benjamin Peterson's avatar
Benjamin Peterson committed
9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            if (rescnt - (sign != 0) < width) {
                reslen -= rescnt;
                rescnt = width + fmtcnt + 100;
                reslen += rescnt;
                if (reslen < 0) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            if (sign) {
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
9905
                Py_XDECREF(temp);
Benjamin Peterson's avatar
Benjamin Peterson committed
9906 9907 9908 9909
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
9910 9911
    } /* until end */
    if (argidx < arglen && !dict) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9912 9913 9914
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
9915
    }
9916

9917
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
9918
        goto onError;
9919
    if (args_owned) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9920
        Py_DECREF(args);
9921 9922 9923
    }
    Py_DECREF(uformat);
    return (PyObject *)result;
9924

Benjamin Peterson's avatar
Benjamin Peterson committed
9925
  onError:
9926 9927 9928
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
Benjamin Peterson's avatar
Benjamin Peterson committed
9929
        Py_DECREF(args);
9930 9931 9932
    }
    return NULL;
}
9933

9934 9935
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9936

9937 9938 9939
/* tp_new implementation for str.

   Requests for a subtype are forwarded to unicode_subtype_new().  For the
   exact type, the optional "object", "encoding" and "errors" arguments are
   parsed and one of three constructors is chosen:
     - no object at all        -> a new empty string,
     - object only             -> PyObject_Str(object),
     - encoding and/or errors  -> PyUnicode_FromEncodedObject().
   Returns NULL when argument parsing fails; otherwise returns whatever the
   selected constructor returns. */
static PyObject *
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    static char *kwlist[] = {"object", "encoding", "errors", 0};
    PyObject *x = NULL;
    char *encoding = NULL;
    char *errors = NULL;

    if (type != &PyUnicode_Type) {
        return unicode_subtype_new(type, args, kwds);
    }

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", kwlist,
                                     &x, &encoding, &errors)) {
        return NULL;
    }

    /* str() called without an object. */
    if (x == NULL) {
        return (PyObject *)_PyUnicode_New(0);
    }

    /* An explicit encoding or error handler selects the decoding path. */
    if (encoding != NULL || errors != NULL) {
        return PyUnicode_FromEncodedObject(x, encoding, errors);
    }
    return PyObject_Str(x);
}

9958 9959
/* tp_new helper for subtypes of str.

   Builds an exact str from the arguments via unicode_new(), allocates an
   instance of `type` through its tp_alloc slot, and copies the character
   buffer (including the trailing terminator), the length and the cached
   hash into it.  The temporary exact str is released on every path.
   Returns NULL on failure; on buffer allocation failure the result of
   PyErr_NoMemory() is returned. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyUnicodeObject *base;
    PyUnicodeObject *result;
    Py_ssize_t length;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    base = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
    if (base == NULL) {
        return NULL;
    }
    assert(PyUnicode_Check(base));

    length = base->length;
    result = (PyUnicodeObject *)type->tp_alloc(type, length);
    if (result == NULL) {
        Py_DECREF(base);
        return NULL;
    }

    /* One extra slot for the terminating element. */
    result->str = (Py_UNICODE *)PyObject_MALLOC(
        sizeof(Py_UNICODE) * (length + 1));
    if (result->str == NULL) {
        /* Undo the allocation of the half-built instance. */
        _Py_ForgetReference((PyObject *)result);
        PyObject_Del(result);
        Py_DECREF(base);
        return PyErr_NoMemory();
    }

    Py_UNICODE_COPY(result->str, base->str, length + 1);
    result->length = length;
    result->hash = base->hash;
    Py_DECREF(base);
    return (PyObject *)result;
}
9987

9988
PyDoc_STRVAR(unicode_doc,
9989 9990
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
9991
\n\
Nick Coghlan's avatar
Nick Coghlan committed
9992 9993 9994 9995 9996 9997 9998
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
9999

10000
static PyObject *unicode_iter(PyObject *seq);
10001

10002 10003
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10004 10005 10006
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_size */
    0,                  /* tp_itemsize */
10007
    /* Slots */
10008 10009 10010 10011
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
10012
    0,                  /* tp_reserved */
10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson's avatar
Benjamin Peterson committed
10024
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
10044
};
10045

10046
/* Initialize the Unicode implementation */
10047

10048 10049 10050
void _PyUnicode_Init(void)
{
    int i;
10051

10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064
    /* XXX - move this array to unicodectype.c ? */
    Py_UNICODE linebreak[] = {
        0x000A, /* LINE FEED */
        0x000D, /* CARRIAGE RETURN */
        0x001C, /* FILE SEPARATOR */
        0x001D, /* GROUP SEPARATOR */
        0x001E, /* RECORD SEPARATOR */
        0x0085, /* NEXT LINE */
        0x2028, /* LINE SEPARATOR */
        0x2029, /* PARAGRAPH SEPARATOR */
    };

    /* Init the implementation */
Christian Heimes's avatar
Christian Heimes committed
10065 10066
    free_list = NULL;
    numfree = 0;
10067 10068
    unicode_empty = _PyUnicode_New(0);
    if (!unicode_empty)
Benjamin Peterson's avatar
Benjamin Peterson committed
10069
        return;
10070

10071
    for (i = 0; i < 256; i++)
Benjamin Peterson's avatar
Benjamin Peterson committed
10072
        unicode_latin1[i] = NULL;
10073
    if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson's avatar
Benjamin Peterson committed
10074
        Py_FatalError("Can't initialize 'unicode'");
10075

10076 10077 10078 10079
    /* initialize the linebreak bloom filter */
    bloom_linebreak = make_bloom_mask(
        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
        );
10080

10081
    PyType_Ready(&EncodingMapType);
10082 10083
}

10084 10085
/* Finalize the Unicode implementation */

Christian Heimes's avatar
Christian Heimes committed
10086 10087 10088 10089 10090 10091 10092
int
PyUnicode_ClearFreeList(void)
{
    int freelist_size = numfree;
    PyUnicodeObject *u;

    for (u = free_list; u != NULL;) {
Benjamin Peterson's avatar
Benjamin Peterson committed
10093 10094 10095 10096 10097 10098 10099
        PyUnicodeObject *v = u;
        u = *(PyUnicodeObject **)u;
        if (v->str)
            PyObject_DEL(v->str);
        Py_XDECREF(v->defenc);
        PyObject_Del(v);
        numfree--;
Christian Heimes's avatar
Christian Heimes committed
10100 10101 10102 10103 10104 10105
    }
    free_list = NULL;
    assert(numfree == 0);
    return freelist_size;
}

10106 10107
void
_PyUnicode_Fini(void)
10108
{
10109
    int i;
10110

10111 10112
    Py_XDECREF(unicode_empty);
    unicode_empty = NULL;
10113

10114
    for (i = 0; i < 256; i++) {
Benjamin Peterson's avatar
Benjamin Peterson committed
10115 10116 10117 10118
        if (unicode_latin1[i]) {
            Py_DECREF(unicode_latin1[i]);
            unicode_latin1[i] = NULL;
        }
10119
    }
Christian Heimes's avatar
Christian Heimes committed
10120
    (void)PyUnicode_ClearFreeList();
10121
}
10122

10123 10124 10125
/* Intern the string *p in place.

   If an equal string is already in the `interned` dict, *p is replaced by
   a new reference to that string and the reference previously held in *p
   is dropped.  Otherwise *p itself is inserted into the dict and marked
   SSTATE_INTERNED_MORTAL.

   Nothing happens for str subclasses, for strings that are already
   interned, or when the dict cannot be created or updated; any exception
   raised on those paths is cleared.  Passing NULL or a non-str object is
   a fatal error. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *existing;

    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");

    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do.  Already-interned strings
       need no work either. */
    if (!PyUnicode_CheckExact(s) || PyUnicode_CHECK_INTERNED(s))
        return;

    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }

    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    existing = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (existing) {
        Py_INCREF(existing);
        Py_DECREF(*p);
        *p = existing;
        return;
    }

    /* NOTE(review): recursion_critical is unconditionally reset to 0 below
       rather than to whatever value it held before -- confirm this is
       intended if callers can already be inside a critical section. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;

    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}

10171 10172 10173
/* Intern *p via PyUnicode_InternInPlace() and then, unless it is already
   marked immortal, mark it SSTATE_INTERNED_IMMORTAL and take one extra
   reference to it. */
void
PyUnicode_InternImmortal(PyObject **p)
{
    PyUnicode_InternInPlace(p);
    if (PyUnicode_CHECK_INTERNED(*p) == SSTATE_INTERNED_IMMORTAL)
        return;
    PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
    Py_INCREF(*p);
}
10180 10181

/* Build a str from the C string `cp` with PyUnicode_FromString() and pass
   it through PyUnicode_InternInPlace().  Returns NULL if the string could
   not be created. */
PyObject *
PyUnicode_InternFromString(const char *cp)
{
    PyObject *str = PyUnicode_FromString(cp);

    if (str != NULL)
        PyUnicode_InternInPlace(&str);
    return str;
}
10190

10191 10192
void _Py_ReleaseInternedUnicodeStrings(void)
{
10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212
    PyObject *keys;
    PyUnicodeObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson's avatar
Benjamin Peterson committed
10213
            n);
10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239
    for (i = 0; i < n; i++) {
        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
        switch (s->state) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += s->length;
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(s) += 2;
            mortal_size += s->length;
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        s->state = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_DECREF(interned);
    interned = NULL;
10240
}
10241

10242

10243 10244 10245
/********************* Unicode Iterator **************************/

typedef struct {
10246 10247 10248
    PyObject_HEAD
    Py_ssize_t it_index;
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
10249 10250 10251 10252 10253
} unicodeiterobject;

/* Destructor for str iterators: stop GC tracking, drop the reference to
   the iterated string (if still held) and free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}

/* GC traversal for str iterators: the iterated string is the only object
   reference the iterator holds. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}

/* tp_iternext for str iterators.

   Returns a new one-element string for the current position and advances
   the index.  When the index has reached the end, the reference to the
   string is dropped, it_seq is set to NULL and NULL is returned; later
   calls keep returning NULL.  If creating the element fails, NULL is
   returned and the index is left unchanged. */
static PyObject *
unicodeiter_next(unicodeiterobject *it)
{
    PyUnicodeObject *seq;
    PyObject *item;

    assert(it != NULL);
    seq = it->it_seq;
    if (seq == NULL)
        return NULL;
    assert(PyUnicode_Check(seq));

    if (it->it_index >= PyUnicode_GET_SIZE(seq)) {
        /* Exhausted: release the string and remember that we are done. */
        Py_DECREF(seq);
        it->it_seq = NULL;
        return NULL;
    }

    item = PyUnicode_FromUnicode(
        PyUnicode_AS_UNICODE(seq) + it->it_index, 1);
    if (item != NULL)
        ++it->it_index;
    return item;
}

/* __length_hint__ for str iterators: the number of elements not yet
   yielded, or 0 once the iterator has dropped its string. */
static PyObject *
unicodeiter_len(unicodeiterobject *it)
{
    Py_ssize_t remaining;

    if (it->it_seq)
        remaining = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
    else
        remaining = 0;
    return PyLong_FromSsize_t(remaining);
}

/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc,
             "Private method returning an estimate of len(list(it)).");

static PyMethodDef unicodeiter_methods[] = {
10303
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson's avatar
Benjamin Peterson committed
10304
     length_hint_doc},
10305
    {NULL,      NULL}       /* sentinel */
10306 10307 10308
};

PyTypeObject PyUnicodeIter_Type = {
10309 10310 10311 10312 10313 10314 10315 10316 10317
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
10318
    0,                  /* tp_reserved */
10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,
10339 10340 10341 10342 10343
};

/* Create a new str_iterator positioned at index 0 of `seq`.

   Returns a new reference to the iterator, or NULL on failure:
   - if `seq` is not a str object, PyErr_BadInternalCall() is raised;
   - if PyObject_GC_New() cannot allocate the iterator, NULL is passed
     through unchanged.

   The iterator takes its own reference to `seq` (Py_INCREF). */
static PyObject *
unicode_iter(PyObject *seq)
{
    unicodeiterobject *it;

    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
    if (it == NULL)
        return NULL;
    it->it_index = 0;
    Py_INCREF(seq);
    it->it_seq = (PyUnicodeObject *)seq;
    /* Start GC tracking only after every field has been filled in. */
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}

10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386
/* Return the number of Py_UNICODE elements in `u` before the first zero
   element (the terminator itself is not counted).

   `u` must point to a zero-terminated buffer.  The count is kept in a
   size_t, matching the return type, so it cannot overflow or be truncated
   the way an `int` counter could on very long buffers. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}

/* Copy the zero-terminated buffer `s2`, terminator included, into `s1`
   and return `s1`.  No bounds are checked: the destination must be large
   enough for everything that is copied. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    const Py_UNICODE *src = s2;

    for (;;) {
        Py_UNICODE ch = *src++;
        *dst++ = ch;
        if (!ch)
            break;      /* the terminator has just been stored */
    }
    return s1;
}

/* Copy at most `n` Py_UNICODE elements from `s2` into `s1` and return `s1`.

   Copying stops after the terminating zero element of `s2` has been
   stored, or after `n` elements have been written, whichever comes first.
   As with C strncpy(), the destination is NOT zero-terminated when `s2`
   holds `n` or more non-zero elements.  Unlike C strncpy(), the rest of
   the destination is not zero-padded.

   The budget is tested before each store, so no more than `n` elements
   are ever written and nothing is written when `n` is 0. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    while (n-- > 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}

10387 10388 10389 10390 10391 10392 10393 10394 10395
/* Append the zero-terminated buffer `s2` to the end of the zero-terminated
   buffer `s1` and return `s1`.  The copy starts at s1's terminator, which
   is located with Py_UNICODE_strlen(); no bounds are checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE_strcpy(s1 + Py_UNICODE_strlen(s1), s2);
    return s1;
}

10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409
/* Compare two zero-terminated Py_UNICODE buffers element by element.

   Returns -1 if `s1` orders before `s2`, +1 if it orders after, and 0 if
   both are equal.  A buffer that ends first orders before the longer one. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    const Py_UNICODE *p = s1;
    const Py_UNICODE *q = s2;

    /* Skip the common prefix. */
    while (*p && *q && *p == *q) {
        p++;
        q++;
    }

    if (!*p) {
        /* s1 is exhausted: equal if s2 is too, otherwise s1 is shorter. */
        return *q ? -1 : 0;
    }
    if (!*q) {
        /* s2 ended first. */
        return 1;
    }
    /* Both still have an element, and those elements differ. */
    return (*p < *q) ? -1 : +1;
}

10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426
/* Compare at most `n` elements of two zero-terminated Py_UNICODE buffers.

   Returns -1 or +1 at the first position where the elements differ, and 0
   if the buffers agree up to a shared terminator or over all `n` elements
   (including when `n` is 0). */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            return 0;   /* both ended at the same position */
    }
    return 0;
}

10427 10428 10429 10430 10431 10432 10433 10434 10435 10436
/* Return a pointer to the first element equal to `c` in the zero-terminated
   buffer `s`, or NULL if there is none.  The scan stops at the terminator
   without examining it, so searching for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}

10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449
/* Return a pointer to the last element equal to `c` in the zero-terminated
   buffer `s`, or NULL if there is none.  The search walks backwards from
   the element just before the terminator, so the terminator itself is
   never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i = Py_UNICODE_strlen(s);

    while (i > 0) {
        i--;
        if (s[i] == c)
            return (Py_UNICODE*)(s + i);
    }
    return NULL;
}

10450
Py_UNICODE*
10451
PyUnicode_AsUnicodeCopy(PyObject *object)
10452 10453 10454 10455
{
    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
    Py_UNICODE *copy;
    Py_ssize_t size;
10456

10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470
    /* Ensure we won't overflow the size. */
    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
    size *= sizeof(Py_UNICODE);
    copy = PyMem_Malloc(size);
    if (copy == NULL) {
        PyErr_NoMemory();
        return NULL;
    }
    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
    return copy;
10471 10472
}

10473 10474
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
10475

10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502
/* Functions exported by the _string module.  Both entries are registered
   with METH_O, i.e. they are called with a single object argument. */
static PyMethodDef _string_methods[] = {
    {
        "formatter_field_name_split",
        (PyCFunction) formatter_field_name_split,
        METH_O,
        PyDoc_STR("split the argument as a field name")
    },
    {
        "formatter_parser",
        (PyCFunction) formatter_parser,
        METH_O,
        PyDoc_STR("parse the argument as a format string")
    },
    {NULL, NULL, 0, NULL}   /* sentinel */
};

/* Module definition for "_string".  Only the name, docstring, size (0) and
   method table are filled in; the four trailing slots are left NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL, NULL, NULL, NULL              /* remaining slots unused */
};

PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}


10503 10504 10505
#ifdef __cplusplus
}
#endif