Commit cd69a8ad authored by Kirill Smelkov's avatar Kirill Smelkov

unicode/utf8: Start of the package (stub)

We will soon need to use the error rune codepoint from both golang_str.pyx
and strconv.pyx, so that definition needs to move into a shared place.
unicode/utf8 fits best, so start that package and move the
constant there.
parent bd662e01
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# cython: binding=False # cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8 # cython: c_string_type=str, c_string_encoding=utf8
# distutils: language = c++ # distutils: language = c++
# distutils: depends = libgolang.h os/signal.h _golang_str.pyx # distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx
# #
# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
It is included from _golang.pyx . It is included from _golang.pyx .
""" """
from golang.unicode cimport utf8
from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
from cpython cimport PyUnicode_DecodeUTF8 from cpython cimport PyUnicode_DecodeUTF8
from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
...@@ -1873,8 +1875,7 @@ cdef extern from "Python.h": ...@@ -1873,8 +1875,7 @@ cdef extern from "Python.h":
from six import unichr # py2: unichr py3: chr from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,)) from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
cdef int _rune_error = 0xFFFD # unicode replacement character _py_rune_error = utf8.RuneError
_py_rune_error = _rune_error
cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2 cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4 assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
...@@ -1886,7 +1887,7 @@ def _py_utf8_decode_rune(const byte[::1] s): ...@@ -1886,7 +1887,7 @@ def _py_utf8_decode_rune(const byte[::1] s):
return _utf8_decode_rune(s) return _utf8_decode_rune(s)
cdef (rune, int) _utf8_decode_rune(const byte[::1] s): cdef (rune, int) _utf8_decode_rune(const byte[::1] s):
if len(s) == 0: if len(s) == 0:
return _rune_error, 0 return utf8.RuneError, 0
cdef int l = min(len(s), 4) # max size of an UTF-8 encoded character cdef int l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0: while l > 0:
...@@ -1913,7 +1914,7 @@ cdef (rune, int) _utf8_decode_rune(const byte[::1] s): ...@@ -1913,7 +1914,7 @@ cdef (rune, int) _utf8_decode_rune(const byte[::1] s):
continue continue
# invalid UTF-8 # invalid UTF-8
return _rune_error, 1 return utf8.RuneError, 1
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3. # _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
...@@ -1932,7 +1933,7 @@ def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode ...@@ -1932,7 +1933,7 @@ def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode
while len(s) > 0: while len(s) > 0:
r, width = _utf8_decode_rune(s) r, width = _utf8_decode_rune(s)
if r == _rune_error and width == 1: if r == utf8.RuneError and width == 1:
b = s[0] b = s[0]
assert 0x80 <= b <= 0xff, b assert 0x80 <= b <= 0xff, b
emit(unichr(0xdc00 + b)) emit(unichr(0xdc00 + b))
......
...@@ -226,6 +226,7 @@ def _with_build_defaults(name, kw): # -> (pygo, kw') ...@@ -226,6 +226,7 @@ def _with_build_defaults(name, kw): # -> (pygo, kw')
'os.h', 'os.h',
'os/signal.h', 'os/signal.h',
'pyx/runtime.h', 'pyx/runtime.h',
'unicode/utf8.h',
'_testing.h', '_testing.h',
'_compat/windows/strings.h', '_compat/windows/strings.h',
'_compat/windows/unistd.h', '_compat/windows/unistd.h',
...@@ -274,6 +275,8 @@ def Extension(name, sources, **kw): ...@@ -274,6 +275,8 @@ def Extension(name, sources, **kw):
'os/signal.pxd', 'os/signal.pxd',
'os/_signal.pxd', 'os/_signal.pxd',
'pyx/runtime.pxd', 'pyx/runtime.pxd',
'unicode/utf8.pxd',
'unicode/_utf8.pxd',
]]) ]])
kw['depends'] = dependv kw['depends'] = dependv
......
# cython: language_level=2
# Copyright (C) 2023 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Package utf8 mirrors Go package utf8.
See https://golang.org/pkg/unicode/utf8 for Go utf8 package documentation.
"""
from golang cimport rune
# Declarations below are taken from the C++ header golang/unicode/utf8.h,
# namespace golang::unicode::utf8, and are all marked nogil.
cdef extern from "golang/unicode/utf8.h" namespace "golang::unicode::utf8" nogil:
# RuneError is the "error" rune; utf8.h defines it as 0xFFFD, the unicode
# replacement character.
rune RuneError
#ifndef _NXD_LIBGOLANG_UNICODE_UTF8_H
#define _NXD_LIBGOLANG_UNICODE_UTF8_H
// Copyright (C) 2023 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package utf8 mirrors Go package utf8.
#include <golang/libgolang.h>
// golang::unicode::utf8::
namespace golang {
namespace unicode {
namespace utf8 {
// RuneError is the "error" rune: U+FFFD, the unicode replacement character.
// It mirrors RuneError from Go's unicode/utf8 package.
constexpr rune RuneError = 0xFFFD; // unicode replacement character
}}} // golang::unicode::utf8::
#endif // _NXD_LIBGOLANG_UNICODE_UTF8_H
# cython: language_level=2
# Copyright (C) 2023 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Package utf8 mirrors Go package utf8.
See _utf8.pxd for package documentation.
"""
# redirect cimport: golang.unicode.utf8 -> golang.unicode._utf8 (see __init__.pxd for rationale)
from golang.unicode._utf8 cimport *
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment