Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
2810dd7b
Commit
2810dd7b
authored
Nov 04, 2018
by
Max Bélanger
Committed by
Benjamin Peterson
Nov 04, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)
parent
5d236caf
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
160 additions
and
22 deletions
+160
-22
Doc/library/unicodedata.rst
Doc/library/unicodedata.rst
+7
-0
Doc/whatsnew/3.8.rst
Doc/whatsnew/3.8.rst
+7
-0
Lib/test/test_normalization.py
Lib/test/test_normalization.py
+10
-1
Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
...ore and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
+2
-0
Modules/clinic/unicodedata.c.h
Modules/clinic/unicodedata.c.h
+36
-4
Modules/unicodedata.c
Modules/unicodedata.c
+98
-17
No files found.
Doc/library/unicodedata.rst
View file @
2810dd7b
...
...
@@ -133,6 +133,13 @@ following functions:
a human reader, if one has combining characters and the other
doesn't, they may not compare equal.
.. function:: is_normalized(form, unistr)
Return whether the Unicode string *unistr* is in the normal form *form*. Valid
values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
.. versionadded:: 3.8
In addition, the module exposes the following constant:
...
...
Doc/whatsnew/3.8.rst
View file @
2810dd7b
...
...
@@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
in the :class:`tkinter.Canvas` class.
(Contributed by Juliette Monsel in :issue:`23831`.)
unicodedata
-----------
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
is in a specific normal form. (Contributed by Max Bélanger and David Euresti in
:issue:`32285`.)
venv
----
...
...
Lib/test/test_normalization.py
View file @
2810dd7b
...
...
@@ -3,7 +3,7 @@ import unittest
from
http.client
import
HTTPException
import
sys
from
unicodedata
import
normalize
,
unidata_version
from
unicodedata
import
normalize
,
is_normalized
,
unidata_version
TESTDATAFILE
=
"NormalizationTest.txt"
TESTDATAURL
=
"http://www.pythontest.net/unicode/"
+
unidata_version
+
"/"
+
TESTDATAFILE
...
...
@@ -88,6 +88,15 @@ class NormalizationTest(unittest.TestCase):
NFKD
(
c3
)
==
NFKD
(
c4
)
==
NFKD
(
c5
),
line
)
self
.
assertTrue
(
is_normalized
(
"NFC"
,
c2
))
self
.
assertTrue
(
is_normalized
(
"NFC"
,
c4
))
self
.
assertTrue
(
is_normalized
(
"NFD"
,
c3
))
self
.
assertTrue
(
is_normalized
(
"NFD"
,
c5
))
self
.
assertTrue
(
is_normalized
(
"NFKC"
,
c4
))
self
.
assertTrue
(
is_normalized
(
"NFKD"
,
c5
))
# Record part 1 data
if
part
==
"@Part1"
:
part1_data
[
c1
]
=
1
...
...
Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst
0 → 100644
View file @
2810dd7b
Add the function unicodedata.is_normalized, which checks whether a string is
in a specific normal form.
Modules/clinic/unicodedata.c.h
View file @
2810dd7b
...
...
@@ -284,6 +284,38 @@ exit:
return
return_value
;
}
PyDoc_STRVAR
(
unicodedata_UCD_is_normalized__doc__
,
"is_normalized($self, form, unistr, /)
\n
"
"--
\n
"
"
\n
"
"Return whether the Unicode string unistr is in the normal form
\'
form
\'
.
\n
"
"
\n
"
"Valid values for form are
\'
NFC
\'
,
\'
NFKC
\'
,
\'
NFD
\'
, and
\'
NFKD
\'
."
);
#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \
{"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__},
static
PyObject
*
unicodedata_UCD_is_normalized_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
);
static
PyObject
*
unicodedata_UCD_is_normalized
(
PyObject
*
self
,
PyObject
*
const
*
args
,
Py_ssize_t
nargs
)
{
PyObject
*
return_value
=
NULL
;
PyObject
*
form
;
PyObject
*
input
;
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"UU:is_normalized"
,
&
form
,
&
input
))
{
goto
exit
;
}
return_value
=
unicodedata_UCD_is_normalized_impl
(
self
,
form
,
input
);
exit:
return
return_value
;
}
PyDoc_STRVAR
(
unicodedata_UCD_normalize__doc__
,
"normalize($self, form, unistr, /)
\n
"
"--
\n
"
...
...
@@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
{"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
static
PyObject
*
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
const
char
*
form
,
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
);
static
PyObject
*
unicodedata_UCD_normalize
(
PyObject
*
self
,
PyObject
*
const
*
args
,
Py_ssize_t
nargs
)
{
PyObject
*
return_value
=
NULL
;
const
char
*
form
;
PyObject
*
form
;
PyObject
*
input
;
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"
s
U:normalize"
,
if
(
!
_PyArg_ParseStack
(
args
,
nargs
,
"
U
U:normalize"
,
&
form
,
&
input
))
{
goto
exit
;
}
...
...
@@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
exit:
return
return_value
;
}
/*[clinic end generated code: output=
dc899bff0ecd14c1
input=a9049054013a1b77]*/
/*[clinic end generated code: output=
2c5fbf597c18f6b8
input=a9049054013a1b77]*/
Modules/unicodedata.c
View file @
2810dd7b
...
...
@@ -19,6 +19,11 @@
#include "ucnhash.h"
#include "structmember.h"
_Py_IDENTIFIER
(
NFC
);
_Py_IDENTIFIER
(
NFD
);
_Py_IDENTIFIER
(
NFKC
);
_Py_IDENTIFIER
(
NFKD
);
/*[clinic input]
module unicodedata
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
...
...
@@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return
result
;
}
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
static
int
typedef
enum
{
YES
,
NO
,
MAYBE
}
NormalMode
;
/* Return YES if the input is certainly normalized, NO if it is certainly not (or if quickchecks are disabled), and MAYBE if only a full normalization can tell. */
static
NormalMode
is_normalized
(
PyObject
*
self
,
PyObject
*
input
,
int
nfc
,
int
k
)
{
Py_ssize_t
i
,
len
;
...
...
@@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
/* An older version of the database is requested, quickchecks must be
disabled. */
if
(
self
&&
UCD_Check
(
self
))
return
0
;
return
NO
;
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
as described in http://unicode.org/reports/tr15/#Annex8. */
...
...
@@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
unsigned
char
quickcheck
=
record
->
normalization_quick_check
;
if
(
quickcheck
&
quickcheck_mask
)
return
0
;
/* this string might need normalization */
return
MAYBE
;
/* this string might need normalization */
if
(
combining
&&
prev_combining
>
combining
)
return
0
;
/* non-canonical sort order, not normalized */
return
NO
;
/* non-canonical sort order, not normalized */
prev_combining
=
combining
;
}
return
1
;
/* certainly normalized */
return
YES
;
/* certainly normalized */
}
/*[clinic input]
unicodedata.UCD.is_normalized
self: self
form: unicode
unistr as input: unicode
/
Return whether the Unicode string unistr is in the normal form 'form'.
Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/
static
PyObject
*
unicodedata_UCD_is_normalized_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
)
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
{
if
(
PyUnicode_READY
(
input
)
==
-
1
)
{
return
NULL
;
}
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
/* special case empty input strings. */
Py_RETURN_TRUE
;
}
PyObject
*
result
;
int
nfc
=
0
;
int
k
=
0
;
NormalMode
m
;
PyObject
*
cmp
;
int
match
=
0
;
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFC
))
{
nfc
=
1
;
}
else
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKC
))
{
nfc
=
1
;
k
=
1
;
}
else
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFD
))
{
/* matches default values for `nfc` and `k` */
}
else
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKD
))
{
k
=
1
;
}
else
{
PyErr_SetString
(
PyExc_ValueError
,
"invalid normalization form"
);
return
NULL
;
}
m
=
is_normalized
(
self
,
input
,
nfc
,
k
);
if
(
m
==
MAYBE
)
{
cmp
=
(
nfc
?
nfc_nfkc
:
nfd_nfkd
)(
self
,
input
,
k
);
if
(
cmp
==
NULL
)
{
return
NULL
;
}
match
=
PyUnicode_Compare
(
input
,
cmp
);
Py_DECREF
(
cmp
);
result
=
(
match
==
0
)
?
Py_True
:
Py_False
;
}
else
{
result
=
(
m
==
YES
)
?
Py_True
:
Py_False
;
}
Py_INCREF
(
result
);
return
result
;
}
/*[clinic input]
unicodedata.UCD.normalize
self: self
form:
str
form:
unicode
unistr as input: unicode
/
...
...
@@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
[clinic start generated code]*/
static
PyObject
*
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
const
char
*
form
,
unicodedata_UCD_normalize_impl
(
PyObject
*
self
,
PyObject
*
form
,
PyObject
*
input
)
/*[clinic end generated code: output=
62d1f8870027efdc input=1744c55f4ab79bf0
]*/
/*[clinic end generated code: output=
05ca4385a2ad6983 input=3a5206c0ad2833fb
]*/
{
if
(
PyUnicode_GET_LENGTH
(
input
)
==
0
)
{
/* Special case empty input strings, since resizing
...
...
@@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
return
input
;
}
if
(
strcmp
(
form
,
"NFC"
)
==
0
)
{
if
(
is_normalized
(
self
,
input
,
1
,
0
))
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFC
)
)
{
if
(
is_normalized
(
self
,
input
,
1
,
0
)
==
YES
)
{
Py_INCREF
(
input
);
return
input
;
}
return
nfc_nfkc
(
self
,
input
,
0
);
}
if
(
strcmp
(
form
,
"NFKC"
)
==
0
)
{
if
(
is_normalized
(
self
,
input
,
1
,
1
))
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKC
)
)
{
if
(
is_normalized
(
self
,
input
,
1
,
1
)
==
YES
)
{
Py_INCREF
(
input
);
return
input
;
}
return
nfc_nfkc
(
self
,
input
,
1
);
}
if
(
strcmp
(
form
,
"NFD"
)
==
0
)
{
if
(
is_normalized
(
self
,
input
,
0
,
0
))
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFD
)
)
{
if
(
is_normalized
(
self
,
input
,
0
,
0
)
==
YES
)
{
Py_INCREF
(
input
);
return
input
;
}
return
nfd_nfkd
(
self
,
input
,
0
);
}
if
(
strcmp
(
form
,
"NFKD"
)
==
0
)
{
if
(
is_normalized
(
self
,
input
,
0
,
1
))
{
if
(
_PyUnicode_EqualToASCIIId
(
form
,
&
PyId_NFKD
)
)
{
if
(
is_normalized
(
self
,
input
,
0
,
1
)
==
YES
)
{
Py_INCREF
(
input
);
return
input
;
}
...
...
@@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
UNICODEDATA_UCD_NAME_METHODDEF
UNICODEDATA_UCD_LOOKUP_METHODDEF
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
UNICODEDATA_UCD_NORMALIZE_METHODDEF
{
NULL
,
NULL
}
/* sentinel */
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment