Commit cd69a8ad authored by Kirill Smelkov's avatar Kirill Smelkov

unicode/utf8: Start of the package (stub)

We will soon need to use the error rune codepoint from both _golang_str.pyx
and strconv.pyx, so we need to move that definition into a shared place.
What fits best is unicode/utf8, so start that package and move the
constant there.
parent bd662e01
......@@ -3,7 +3,7 @@
# cython: binding=False
# cython: c_string_type=str, c_string_encoding=utf8
# distutils: language = c++
# distutils: depends = libgolang.h os/signal.h _golang_str.pyx
# distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx
#
# Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
......
......@@ -22,6 +22,8 @@
It is included from _golang.pyx .
"""
from golang.unicode cimport utf8
from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode
from cpython cimport PyUnicode_DecodeUTF8
from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc
......@@ -1873,8 +1875,7 @@ cdef extern from "Python.h":
from six import unichr # py2: unichr py3: chr
from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,))
cdef int _rune_error = 0xFFFD # unicode replacement character
_py_rune_error = _rune_error
_py_rune_error = utf8.RuneError
cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2
assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4
......@@ -1886,7 +1887,7 @@ def _py_utf8_decode_rune(const byte[::1] s):
return _utf8_decode_rune(s)
cdef (rune, int) _utf8_decode_rune(const byte[::1] s):
if len(s) == 0:
return _rune_error, 0
return utf8.RuneError, 0
cdef int l = min(len(s), 4) # max size of an UTF-8 encoded character
while l > 0:
......@@ -1913,7 +1914,7 @@ cdef (rune, int) _utf8_decode_rune(const byte[::1] s):
continue
# invalid UTF-8
return _rune_error, 1
return utf8.RuneError, 1
# _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3.
......@@ -1932,7 +1933,7 @@ def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode
while len(s) > 0:
r, width = _utf8_decode_rune(s)
if r == _rune_error and width == 1:
if r == utf8.RuneError and width == 1:
b = s[0]
assert 0x80 <= b <= 0xff, b
emit(unichr(0xdc00 + b))
......
......@@ -226,6 +226,7 @@ def _with_build_defaults(name, kw): # -> (pygo, kw')
'os.h',
'os/signal.h',
'pyx/runtime.h',
'unicode/utf8.h',
'_testing.h',
'_compat/windows/strings.h',
'_compat/windows/unistd.h',
......@@ -274,6 +275,8 @@ def Extension(name, sources, **kw):
'os/signal.pxd',
'os/_signal.pxd',
'pyx/runtime.pxd',
'unicode/utf8.pxd',
'unicode/_utf8.pxd',
]])
kw['depends'] = dependv
......
# cython: language_level=2
# Copyright (C) 2023 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Package utf8 mirrors Go package utf8.
See https://golang.org/pkg/unicode/utf8 for Go utf8 package documentation.
"""
from golang cimport rune
# Expose RuneError to Cython code. This is a declaration only: the value itself
# is defined in the C++ header golang/unicode/utf8.h, inside namespace
# golang::unicode::utf8.
cdef extern from "golang/unicode/utf8.h" namespace "golang::unicode::utf8" nogil:
rune RuneError  # the error rune; the header defines it as 0xFFFD
#ifndef _NXD_LIBGOLANG_UNICODE_UTF8_H
#define _NXD_LIBGOLANG_UNICODE_UTF8_H
// Copyright (C) 2023 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package utf8 mirrors Go package utf8.
#include <golang/libgolang.h>
// golang::unicode::utf8::
namespace golang {
namespace unicode {
namespace utf8 {
// RuneError is the "error" rune that UTF-8 decoding returns for empty or
// invalid input.
constexpr rune RuneError = 0xFFFD; // unicode replacement character
}}} // golang::unicode::utf8::
#endif // _NXD_LIBGOLANG_UNICODE_UTF8_H
# cython: language_level=2
# Copyright (C) 2023 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""Package utf8 mirrors Go package utf8.
See _utf8.pxd for package documentation.
"""
# redirect cimport: golang.unicode.utf8 -> golang.unicode._utf8 (see __init__.pxd for rationale)
from golang.unicode._utf8 cimport *
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment