Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
c60e6f77
Commit
c60e6f77
authored
Sep 20, 2001
by
Marc-André Lemburg
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Patch #435971: UTF-7 codec by Brian Quinlan.
parent
26e3b681
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
392 additions
and
1 deletion
+392
-1
Include/unicodeobject.h
Include/unicodeobject.h
+18
-0
Lib/encodings/aliases.py
Lib/encodings/aliases.py
+4
-0
Lib/test/test_unicode.py
Lib/test/test_unicode.py
+28
-1
Modules/_codecsmodule.c
Modules/_codecsmodule.c
+42
-0
Objects/unicodeobject.c
Objects/unicodeobject.c
+300
-0
No files found.
Include/unicodeobject.h
View file @
c60e6f77
...
...
@@ -607,6 +607,24 @@ extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
const
char
*
errors
/* error handling */
);
/* --- UTF-7 Codecs ------------------------------------------------------- */
/* Decode a UTF-7 encoded byte buffer (see RFC2152) into a Python object.
   `errors` selects the error handling; the accepted values are interpreted
   by the implementation, not by this declaration. */
extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF7(
    const char *string,         /* UTF-7 encoded string */
    int length,                 /* size of string */
    const char *errors          /* error handling */
    );
/* Encode a buffer of Py_UNICODE characters as UTF-7 (see RFC2152) and
   return the result as a Python object.  The two integer flags widen the
   set of characters that are base64-encoded instead of written directly. */
extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF7(
    const Py_UNICODE *data,     /* Unicode char buffer */
    int length,                 /* number of Py_UNICODE chars to encode */
    int encodeSetO,             /* force the encoder to encode characters in
                                   Set O, as described in RFC2152 */
    int encodeWhiteSpace,       /* force the encoder to encode space, tab,
                                   carriage return and linefeed characters */
    const char *errors          /* error handling */
    );
/* --- UTF-8 Codecs ------------------------------------------------------- */
extern
DL_IMPORT
(
PyObject
*
)
PyUnicode_DecodeUTF8
(
...
...
Lib/encodings/aliases.py
View file @
c60e6f77
...
...
@@ -14,6 +14,10 @@ aliases = {
'latin'
:
'latin_1'
,
'latin1'
:
'latin_1'
,
# UTF-7
'utf7'
:
'utf_7'
,
'u7'
:
'utf_7'
,
# UTF-8
'utf'
:
'utf_8'
,
'utf8'
:
'utf_8'
,
...
...
Lib/test/test_unicode.py
View file @
c60e6f77
...
...
@@ -377,6 +377,32 @@ print 'done.'
# Test builtin codecs
print
'Testing builtin codecs...'
,
# UTF-7 specific encoding tests:
utfTests
=
[(
u'A
\
u2262
\
u0391
.'
,
'A+ImIDkQ.'
),
# RFC2152 example
(
u'Hi Mom -
\
u263a
-!'
,
'Hi Mom -+Jjo--!'
),
# RFC2152 example
(
u'
\
u65E5
\
u672C
\
u8A9E
'
,
'+ZeVnLIqe-'
),
# RFC2152 example
(
u'Item 3 is
\
u00a3
1.'
,
'Item 3 is +AKM-1.'
),
# RFC2152 example
(
u'+'
,
'+-'
),
(
u'+-'
,
'+--'
),
(
u'+?'
,
'+-?'
),
(
u'
\
?
'
, '
+
AFw
?
'),
(u'
+
?
', '
+-
?
'),
(ur'
\\
?
', '
+
AFwAXA
?
'),
(ur'
\\\
?
', '
+
AFwAXABc
?
'),
(ur'
++--
', '
+-+---
')]
for x,y in utfTests:
verify( x.encode('
utf
-
7
') == y )
try:
unicode('
+
3
ADYAA
-
', '
utf
-
7
') # surrogates not supported
except UnicodeError:
pass
else:
raise TestFailed, "unicode('
+
3
ADYAA
-
', '
utf
-
7
') failed to raise an exception"
verify(unicode('
+
3
ADYAA
-
', '
utf
-
7
', '
replace
') == u'
\
ufffd
')
# UTF-8 specific encoding tests:
verify(u'
\
u20ac
'.encode('
utf
-
8
') ==
\
''.join((chr(0xe2), chr(0x82), chr(0xac))) )
...
...
@@ -439,6 +465,7 @@ verify(unicode('Andr\202 x','ascii','ignore') == u"Andr x")
verify(unicode('
Andr
\
202
x
','
ascii
','
replace
') == u'
Andr
\
uFFFD
x
')
verify(u'
hello
'.encode('
ascii
') == '
hello
')
verify(u'
hello
'.encode('
utf
-
7
') == '
hello
')
verify(u'
hello
'.encode('
utf
-
8
') == '
hello
')
verify(u'
hello
'.encode('
utf8
') == '
hello
')
verify(u'
hello
'.encode('
utf
-
16
-
le
') == '
h
\
000
e
\
000
l
\
000
l
\
000
o
\
000
')
...
...
@@ -447,7 +474,7 @@ verify(u'hello'.encode('latin-1') == 'hello')
# Roundtrip safety for BMP (just the first 1024 chars)
u = u''.join(map(unichr, range(1024)))
for
encoding
in
(
'utf-8'
,
'utf-16'
,
'utf-16-le'
,
'utf-16-be'
,
for encoding in ('
utf
-
7
', '
utf
-
8
', '
utf
-
16
', '
utf
-
16
-
le
', '
utf
-
16
-
be
',
'
raw_unicode_escape
', '
unicode_escape
', '
unicode_internal
'):
verify(unicode(u.encode(encoding),encoding) == u)
...
...
Modules/_codecsmodule.c
View file @
c60e6f77
...
...
@@ -123,6 +123,22 @@ unicode_internal_decode(PyObject *self,
}
}
/* _codecs.utf_7_decode(data[, errors])

   Parses a read-only character buffer plus an optional error-handling
   string (which may be None), decodes the buffer with
   PyUnicode_DecodeUTF7() and hands the decoded object together with the
   input length to codec_tuple().  Returns NULL if argument parsing fails;
   otherwise returns whatever codec_tuple() returns. */
static PyObject *
utf_7_decode(PyObject *self, PyObject *args)
{
    const char *errors = NULL;
    const char *buffer;
    int length;
    PyObject *decoded;

    if (PyArg_ParseTuple(args, "t#|z:utf_7_decode",
                         &buffer, &length, &errors) == 0)
        return NULL;

    /* The decoder result is passed on unchecked, exactly as before;
       presumably codec_tuple() copes with a NULL object -- it is defined
       outside this view. */
    decoded = PyUnicode_DecodeUTF7(buffer, length, errors);
    return codec_tuple(decoded, length);
}
static
PyObject
*
utf_8_decode
(
PyObject
*
self
,
PyObject
*
args
)
...
...
@@ -381,6 +397,30 @@ unicode_internal_encode(PyObject *self,
}
}
/* _codecs.utf_7_encode(obj[, errors])

   Coerces `obj` to a Unicode object, encodes it with
   PyUnicode_EncodeUTF7() (both optional-encoding flags switched off) and
   hands the encoded object together with the number of input characters
   to codec_tuple().  Returns NULL if argument parsing or the coercion
   fails. */
static PyObject *
utf_7_encode(PyObject *self, PyObject *args)
{
    const char *errors = NULL;
    PyObject *str;
    PyObject *encoded;
    PyObject *v;

    if (PyArg_ParseTuple(args, "O|z:utf_7_encode", &str, &errors) == 0)
        return NULL;

    /* From here on `str` is the reference returned by
       PyUnicode_FromObject(); it is released before returning. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;

    encoded = PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
                                   PyUnicode_GET_SIZE(str),
                                   0,       /* encodeSetO */
                                   0,       /* encodeWhiteSpace */
                                   errors);
    v = codec_tuple(encoded, PyUnicode_GET_SIZE(str));

    Py_DECREF(str);
    return v;
}
static
PyObject
*
utf_8_encode
(
PyObject
*
self
,
PyObject
*
args
)
...
...
@@ -632,6 +672,8 @@ static PyMethodDef _codecs_functions[] = {
#ifdef Py_USING_UNICODE
{
"utf_8_encode"
,
utf_8_encode
,
1
},
{
"utf_8_decode"
,
utf_8_decode
,
1
},
{
"utf_7_encode"
,
utf_7_encode
,
1
},
{
"utf_7_decode"
,
utf_7_decode
,
1
},
{
"utf_16_encode"
,
utf_16_encode
,
1
},
{
"utf_16_le_encode"
,
utf_16_le_encode
,
1
},
{
"utf_16_be_encode"
,
utf_16_be_encode
,
1
},
...
...
Objects/unicodeobject.c
View file @
c60e6f77
...
...
@@ -635,6 +635,306 @@ int PyUnicode_SetDefaultEncoding(const char *encoding)
return
-
1
;
}
/* --- UTF-7 Codec -------------------------------------------------------- */
/* see RFC2152 for details */
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   indexed by character value.  It indicates whether a character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00-0x0F: control characters; TAB, LF and CR count as whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: SP ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P-Z, then [ backslash ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1
};
/* Helper macros for the UTF-7 codec.  They are #undef'ed again after the
   encoder.  All of them may evaluate their arguments more than once, so
   only pass side-effect free expressions. */

/* True if character c has to be base64-encoded: anything above 127, any
   character classified 1 in utf7_special[], whitespace (class 2) when
   encodeWS is set and RFC2152 Set O (class 3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Written with explicit ASCII
   ranges instead of isalnum(): c is a Py_UNICODE that can lie outside the
   range <ctype.h> accepts, and the answer must not depend on the locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Value (0..63) of base64 digit c; only meaningful when B64CHAR(c) holds.
   Assumes ASCII. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit one base64 digit to *out for every complete group of six bits held
   in ch and reduce bits accordingly; fewer than six bits stay behind. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit one Py_UNICODE to *out for every complete group of 16 bits held in
   ch and reduce bits accordingly.

   A code unit in the range 0xDC00..0xDFFF cannot be represented as a pair
   in a 16-bit character: it sets the surrogate flag, stores a message in
   `errmsg` and jumps to the label `utf7Error`, both of which must exist in
   the expanding function.  While the flag is set, the next 16-bit unit is
   dropped without further checks (an error has already been reported) and
   the flag is cleared again. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
/* Apply the error handling policy `errors` to a UTF-7 decoding problem
   described by `details`.

   NULL or "strict": raise UnicodeError and return -1.
   "ignore":         return 0 without touching the output.
   "replace":        if `dest` is not NULL, store
                     Py_UNICODE_REPLACEMENT_CHARACTER at *dest and advance
                     *dest by one character; return 0.
   anything else:    raise ValueError and return -1. */
static
int utf7_decoding_error(Py_UNICODE **dest,
                        const char *errors,
                        const char *details)
{
    /* No policy at all is treated like "strict". */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-7 decoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (strcmp(errors, "replace") != 0) {
        PyErr_Format(PyExc_ValueError,
                     "UTF-7 decoding error; unknown error handling code: %.400s",
                     errors);
        return -1;
    }

    /* "replace": write the replacement character and step past it. */
    if (dest != NULL)
        *(*dest)++ = Py_UNICODE_REPLACEMENT_CHARACTER;
    return 0;
}
/* Decode the UTF-7 (RFC2152) byte string s of length size into a new
   Unicode object.

   errors is the error handling policy passed on to utf7_decoding_error()
   (NULL, "strict", "ignore" or "replace").

   The result is allocated with room for `size` characters and resized to
   the number of characters actually written before it is returned.
   Returns NULL if allocation fails, if the error handler reports failure
   or if the final resize fails. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               int size,
                               const char *errors)
{
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;                /* inside a '+...' base64 sequence? */
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bit accumulator for base64 data */
    int surrogate = 0;              /* flag maintained by DECODE() */

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        /* Read the byte as unsigned.  Where plain char is signed, bytes
           >= 0x80 would otherwise be sign-extended into huge Py_UNICODE
           values and then be handed to the base64 character test. */
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) {
            /* Anything that is not a base64 digit ends the shift sequence.
               Non-ASCII bytes are rejected before B64CHAR() sees them and
               are reported as special characters further down. */
            if ((ch == '-') || (ch > 127) || !B64CHAR(ch)) {
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */
                DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */
                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here to indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    /* The terminating '-' is absorbed.  If another '-'
                       follows, a '-' is emitted now and the shift state is
                       re-entered so that the next iteration consumes that
                       second '-' through this same branch. */
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch, 0, 0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else {
                    *p++ = ch;
                }
            } else {
                /* Another base64 digit: collect its six bits and emit any
                   complete 16-bit units. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */
                DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if (ch == '+') {
            s++;
            if (s < e && *s == '-') {
                /* "+-" is the escape for a literal '+' */
                s++;
                *p++ = '+';
            } else {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch, 0, 0)) {
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;

    utf7Error:
        /* Reached via goto only; decoding resumes with the next input byte
           unless the error handler reports failure. */
        if (utf7_decoding_error(&p, errors, errmsg))
            goto onError;
    }

    if (inShift) {
        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
            goto onError;
    }

    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
/* Encode size Py_UNICODE characters starting at s as UTF-7 (RFC2152) and
   return the result as a new string object.

   encodeSetO       - also base64-encode the characters classified as
                      RFC2152 Set O in utf7_special[]
   encodeWhiteSpace - also base64-encode the characters classified as
                      whitespace in utf7_special[]
   errors           - accepted for interface symmetry; not used here

   Returns NULL if the output string cannot be allocated or resized. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               int size,
                               int encodeSetO,
                               int encodeWhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;                /* inside a '+...' base64 sequence? */
    int i = 0;
    unsigned int bitsleft = 0;      /* number of pending bits in charsleft */
    unsigned long charsleft = 0;    /* bit accumulator for base64 data */
    char *out;
    char *start;

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (; i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* open a shift sequence with this character */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */
                ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            } else {
                *out++ = (char) ch;
            }
        } else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* flush the pending bits, zero-padded to a full digit */
                *out++ = B64(charsleft << (6 - bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */
                ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */
                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i + 1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
                            /* stay in the shift sequence */
                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }
                    }
                    else {
                        /* end of input: close the sequence explicitly */
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }

    /* Input ended inside a shift sequence with bits still pending. */
    if (bitsleft) {
        *out++ = B64(charsleft << (6 - bitsleft));
        *out++ = '-';
    }

    /* When _PyString_Resize() fails it has already released the string and
       set v to NULL, so v must not be DECREF'ed again here. */
    if (_PyString_Resize(&v, out - start))
        return NULL;
    return v;
}
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
/* --- UTF-8 Codec -------------------------------------------------------- */
static
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment