python_unicode.pxd 17.5 KB
Newer Older
1
cdef extern from *:
2
    ctypedef unsigned int Py_UNICODE
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95

    # Return true if the object o is a Unicode object or an instance
    # of a Unicode subtype. Changed in version 2.2: Allowed subtypes
    # to be accepted.
    bint PyUnicode_Check(object o)

    # Return true if the object o is a Unicode object, but not an
    # instance of a subtype. New in version 2.2.
    bint PyUnicode_CheckExact(object o)

    # Return the size of the object. o has to be a PyUnicodeObject
    # (not checked).
    Py_ssize_t PyUnicode_GET_SIZE(object o)

    # Return the size of the object's internal buffer in bytes. o has
    # to be a PyUnicodeObject (not checked).
    Py_ssize_t PyUnicode_GET_DATA_SIZE(object o)

    # Return a pointer to the internal Py_UNICODE buffer of the
    # object. o has to be a PyUnicodeObject (not checked).
    Py_UNICODE* PyUnicode_AS_UNICODE(object o)

    # Return a pointer to the internal buffer of the object. o has to
    # be a PyUnicodeObject (not checked).
    char* PyUnicode_AS_DATA(object o)

    # Return 1 or 0 depending on whether ch is a whitespace character.
    bint Py_UNICODE_ISSPACE(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a lowercase character.
    bint Py_UNICODE_ISLOWER(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an uppercase character.
    bint Py_UNICODE_ISUPPER(Py_UNICODE ch)
    
    # Return 1 or 0 depending on whether ch is a titlecase character. 
    bint Py_UNICODE_ISTITLE(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a linebreak character. 
    bint Py_UNICODE_ISLINEBREAK(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a decimal character. 
    bint Py_UNICODE_ISDECIMAL(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a digit character. 
    bint Py_UNICODE_ISDIGIT(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is a numeric character. 
    bint Py_UNICODE_ISNUMERIC(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an alphabetic character. 
    bint Py_UNICODE_ISALPHA(Py_UNICODE ch)

    # Return 1 or 0 depending on whether ch is an alphanumeric character. 
    bint Py_UNICODE_ISALNUM(Py_UNICODE ch)

    # Return the character ch converted to lower case. 
    Py_UNICODE Py_UNICODE_TOLOWER(Py_UNICODE ch)

    # Return the character ch converted to upper case. 
    Py_UNICODE Py_UNICODE_TOUPPER(Py_UNICODE ch)

    # Return the character ch converted to title case. 
    Py_UNICODE Py_UNICODE_TOTITLE(Py_UNICODE ch)

    # Return the character ch converted to a decimal positive
    # integer. Return -1 if this is not possible. This macro does not
    # raise exceptions.
    int Py_UNICODE_TODECIMAL(Py_UNICODE ch)

    # Return the character ch converted to a single digit
    # integer. Return -1 if this is not possible. This macro does not
    # raise exceptions.
    int Py_UNICODE_TODIGIT(Py_UNICODE ch)

    # Return the character ch converted to a double. Return -1.0 if
    # this is not possible. This macro does not raise exceptions.
    double Py_UNICODE_TONUMERIC(Py_UNICODE ch)

    # To create Unicode objects and access their basic sequence
    # properties, use these APIs:

    # Create a Unicode Object from the Py_UNICODE buffer u of the
    # given size. u may be NULL which causes the contents to be
    # undefined. It is the user's responsibility to fill in the needed
    # data. The buffer is copied into the new object. If the buffer is
    # not NULL, the return value might be a shared object. Therefore,
    # modification of the resulting Unicode object is only allowed
    # when u is NULL.
    object PyUnicode_FromUnicode(Py_UNICODE *u, Py_ssize_t size)

    # Return a read-only pointer to the Unicode object's internal
    # Py_UNICODE buffer, NULL if unicode is not a Unicode object.
Stefan Behnel's avatar
Stefan Behnel committed
96
    Py_UNICODE* PyUnicode_AsUnicode(object o) except NULL
97 98

    # Return the length of the Unicode object. 
Stefan Behnel's avatar
Stefan Behnel committed
99
    Py_ssize_t PyUnicode_GetSize(object o) except -1
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385

    # Coerce an encoded object obj to an Unicode object and return a
    # reference with incremented refcount.
    # String and other char buffer compatible objects are decoded
    # according to the given encoding and using the error handling
    # defined by errors. Both can be NULL to have the interface use
    # the default values (see the next section for details).
    # All other objects, including Unicode objects, cause a TypeError
    # to be set.
    object PyUnicode_FromEncodedObject(object o, char *encoding, char *errors)

    # Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict")
    # which is used throughout the interpreter whenever coercion to
    # Unicode is needed.
    object PyUnicode_FromObject(object obj)

    # If the platform supports wchar_t and provides a header file
    # wchar.h, Python can interface directly to this type using the
    # following functions. Support is optimized if Python's own
    # Py_UNICODE type is identical to the system's wchar_t.

    #ctypedef int wchar_t

    # Create a Unicode object from the wchar_t buffer w of the given
    # size. Return NULL on failure.
    #PyObject* PyUnicode_FromWideChar(wchar_t *w, Py_ssize_t size)

    #Py_ssize_t PyUnicode_AsWideChar(object o, wchar_t *w, Py_ssize_t size)

# Codecs

    # Create a Unicode object by decoding size bytes of the encoded
    # string s. encoding and errors have the same meaning as the
    # parameters of the same name in the unicode() builtin
    # function. The codec to be used is looked up using the Python
    # codec registry. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_Decode(char *s, Py_ssize_t size, char *encoding, char *errors)

    # Encode the Py_UNICODE buffer of the given size and return a
    # Python string object. encoding and errors have the same meaning
    # as the parameters of the same name in the Unicode encode()
    # method. The codec to be used is looked up using the Python codec
    # registry. Return NULL if an exception was raised by the codec.
    object PyUnicode_Encode(Py_UNICODE *s, Py_ssize_t size,
                            char *encoding, char *errors)

    # Encode a Unicode object and return the result as Python string
    # object. encoding and errors have the same meaning as the
    # parameters of the same name in the Unicode encode() method. The
    # codec to be used is looked up using the Python codec
    # registry. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsEncodedString(object unicode, char *encoding, char *errors)

# These are the UTF-8 codec APIs:

    # Create a Unicode object by decoding size bytes of the UTF-8
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeUTF8(char *s, Py_ssize_t size, char *errors)

    # If consumed is NULL, behave like PyUnicode_DecodeUTF8(). If
    # consumed is not NULL, trailing incomplete UTF-8 byte sequences
    # will not be treated as an error. Those bytes will not be decoded
    # and the number of bytes that have been decoded will be stored in
    # consumed. New in version 2.4.
    object PyUnicode_DecodeUTF8Stateful(char *s, Py_ssize_t size, char *errors, Py_ssize_t *consumed)

    # Encode the Py_UNICODE buffer of the given size using UTF-8 and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeUTF8(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using UTF-8 and return the result as Python string object. Error handling is ``strict''. Return NULL if an exception was raised by the codec. 
    object PyUnicode_AsUTF8String(object unicode)

# These are the UTF-16 codec APIs:

    # Decode length bytes from a UTF-16 encoded buffer string and
    # return the corresponding Unicode object. errors (if non-NULL)
    # defines the error handling. It defaults to ``strict''.
    # 
    # If byteorder is non-NULL, the decoder starts decoding using the
    # given byte order:
    #
    #   *byteorder == -1: little endian
    #   *byteorder == 0:  native order
    #   *byteorder == 1:  big endian
    #
    # and then switches if the first two bytes of the input data are a
    # byte order mark (BOM) and the specified byte order is native
    # order. This BOM is not copied into the resulting Unicode
    # string. After completion, *byteorder is set to the current byte
    # order at the.
    #
    # If byteorder is NULL, the codec starts in native order mode.
    object PyUnicode_DecodeUTF16(char *s, Py_ssize_t size, char *errors, int *byteorder)

    # If consumed is NULL, behave like PyUnicode_DecodeUTF16(). If
    # consumed is not NULL, PyUnicode_DecodeUTF16Stateful() will not
    # treat trailing incomplete UTF-16 byte sequences (such as an odd
    # number of bytes or a split surrogate pair) as an error. Those
    # bytes will not be decoded and the number of bytes that have been
    # decoded will be stored in consumed. New in version 2.4.
    object PyUnicode_DecodeUTF16Stateful(char *s, Py_ssize_t size, char *errors, int *byteorder, Py_ssize_t *consumed)

    # Return a Python string object holding the UTF-16 encoded value
    # of the Unicode data in s. If byteorder is not 0, output is
    # written according to the following byte order:
    #
    #   byteorder == -1: little endian
    #   byteorder == 0:  native byte order (writes a BOM mark)
    #   byteorder == 1:  big endian
    #
    # If byteorder is 0, the output string will always start with the
    # Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark
    # is prepended.
    #
    # If Py_UNICODE_WIDE is defined, a single Py_UNICODE value may get
    # represented as a surrogate pair. If it is not defined, each
    # Py_UNICODE values is interpreted as an UCS-2 character.
    object PyUnicode_EncodeUTF16(Py_UNICODE *s, Py_ssize_t size, char *errors, int byteorder)

    # Return a Python string using the UTF-16 encoding in native byte
    # order. The string always starts with a BOM mark. Error handling
    # is ``strict''. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_AsUTF16String(object unicode)

# These are the ``Unicode Escape'' codec APIs:

    # Create a Unicode object by decoding size bytes of the
    # Unicode-Escape encoded string s. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_DecodeUnicodeEscape(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using
    # Unicode-Escape and return a Python string object. Return NULL if
    # an exception was raised by the codec.
    object PyUnicode_EncodeUnicodeEscape(Py_UNICODE *s, Py_ssize_t size)

    # Encode a Unicode objects using Unicode-Escape and return the
    # result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsUnicodeEscapeString(object unicode)

# These are the ``Raw Unicode Escape'' codec APIs:

    # Create a Unicode object by decoding size bytes of the
    # Raw-Unicode-Escape encoded string s. Return NULL if an exception
    # was raised by the codec.
    object PyUnicode_DecodeRawUnicodeEscape(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using
    # Raw-Unicode-Escape and return a Python string object. Return
    # NULL if an exception was raised by the codec.
    object PyUnicode_EncodeRawUnicodeEscape(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using Raw-Unicode-Escape and return the
    # result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsRawUnicodeEscapeString(object unicode)

# These are the Latin-1 codec APIs: Latin-1 corresponds to the first 256 Unicode ordinals and only these are accepted by the codecs during encoding.

    # Create a Unicode object by decoding size bytes of the Latin-1
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeLatin1(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using Latin-1 and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeLatin1(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using Latin-1 and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsLatin1String(object unicode)

# These are the ASCII codec APIs. Only 7-bit ASCII data is
# accepted. All other codes generate errors.

    # Create a Unicode object by decoding size bytes of the ASCII
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeASCII(char *s, Py_ssize_t size, char *errors)

    # Encode the Py_UNICODE buffer of the given size using ASCII and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeASCII(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using ASCII and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsASCIIString(object o)

# These are the mapping codec APIs:
#
# This codec is special in that it can be used to implement many
# different codecs (and this is in fact what was done to obtain most
# of the standard codecs included in the encodings package). The codec
# uses mapping to encode and decode characters.
#
# Decoding mappings must map single string characters to single
# Unicode characters, integers (which are then interpreted as Unicode
# ordinals) or None (meaning "undefined mapping" and causing an
# error).
#
# Encoding mappings must map single Unicode characters to single
# string characters, integers (which are then interpreted as Latin-1
# ordinals) or None (meaning "undefined mapping" and causing an
# error).
#
# The mapping objects provided must only support the __getitem__
# mapping interface.
#
# If a character lookup fails with a LookupError, the character is
# copied as-is meaning that its ordinal value will be interpreted as
# Unicode or Latin-1 ordinal resp. Because of this, mappings only need
# to contain those mappings which map characters to different code
# points.

    # Create a Unicode object by decoding size bytes of the encoded
    # string s using the given mapping object. Return NULL if an
    # exception was raised by the codec. If mapping is NULL latin-1
    # decoding will be done. Else it can be a dictionary mapping byte
    # or a unicode string, which is treated as a lookup table. Byte
    # values greater that the length of the string and U+FFFE
    # "characters" are treated as "undefined mapping". Changed in
    # version 2.4: Allowed unicode string as mapping argument.
    object PyUnicode_DecodeCharmap(char *s, Py_ssize_t size, object mapping, char *errors)

    # Encode the Py_UNICODE buffer of the given size using the given
    # mapping object and return a Python string object. Return NULL if
    # an exception was raised by the codec.
    object PyUnicode_EncodeCharmap(Py_UNICODE *s, Py_ssize_t size, object mapping, char *errors)

    # Encode a Unicode objects using the given mapping object and
    # return the result as Python string object. Error handling is
    # ``strict''. Return NULL if an exception was raised by the codec.
    object PyUnicode_AsCharmapString(object o, object mapping)

# The following codec API is special in that maps Unicode to Unicode.

    # Translate a Py_UNICODE buffer of the given length by applying a
    # character mapping table to it and return the resulting Unicode
    # object. Return NULL when an exception was raised by the codec.
    #
    # The mapping table must map Unicode ordinal integers to Unicode
    # ordinal integers or None (causing deletion of the character).
    #
    # Mapping tables need only provide the __getitem__() interface;
    # dictionaries and sequences work well. Unmapped character
    # ordinals (ones which cause a LookupError) are left untouched and
    # are copied as-is.
    object PyUnicode_TranslateCharmap(Py_UNICODE *s, Py_ssize_t size,
                                      object table, char *errors)

# These are the MBCS codec APIs. They are currently only available on
# Windows and use the Win32 MBCS converters to implement the
# conversions. Note that MBCS (or DBCS) is a class of encodings, not
# just one. The target encoding is defined by the user settings on the
# machine running the codec.

    # Create a Unicode object by decoding size bytes of the MBCS
    # encoded string s. Return NULL if an exception was raised by the
    # codec.
    object PyUnicode_DecodeMBCS(char *s, Py_ssize_t size, char *errors)

    # If consumed is NULL, behave like PyUnicode_DecodeMBCS(). If
    # consumed is not NULL, PyUnicode_DecodeMBCSStateful() will not
    # decode trailing lead byte and the number of bytes that have been
    # decoded will be stored in consumed. New in version 2.5.
    object PyUnicode_DecodeMBCSStateful(char *s, int size, char *errors, int *consumed)

    # Encode the Py_UNICODE buffer of the given size using MBCS and
    # return a Python string object. Return NULL if an exception was
    # raised by the codec.
    object PyUnicode_EncodeMBCS(Py_UNICODE *s, Py_ssize_t size, char *errors)

    # Encode a Unicode objects using MBCS and return the result as
    # Python string object. Error handling is ``strict''. Return NULL
    # if an exception was raised by the codec.
    object PyUnicode_AsMBCSString(object o)