Commit 7012673d authored by Marc-André Lemburg

Extending the encoding name normalization to handle more non-alphanumeric characters.
parent 399a6890
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
Standard Python encoding modules are stored in this package Standard Python encoding modules are stored in this package
directory. directory.
Codec modules must have names corresponding to standard lower-case Codec modules must have names corresponding to normalized encoding
encoding names with hyphens mapped to underscores, e.g. 'utf-8' is names as defined in the normalize_encoding() function below, e.g.
implemented by the module 'utf_8.py'. 'utf-8' must be implemented by the module 'utf_8.py'.
Each codec module must export the following interface: Each codec module must export the following interface:
...@@ -18,9 +18,8 @@ ...@@ -18,9 +18,8 @@
* getaliases() -> sequence of encoding name strings to use as aliases * getaliases() -> sequence of encoding name strings to use as aliases
Alias names returned by getaliases() must be standard encoding Alias names returned by getaliases() must be normalized encoding
names as defined above (lower-case, hyphens converted to names as defined by normalize_encoding().
underscores).
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
...@@ -28,16 +27,29 @@ Written by Marc-Andre Lemburg (mal@lemburg.com). ...@@ -28,16 +27,29 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
"""#" """#"
import codecs,exceptions import codecs, exceptions, re
_cache = {} _cache = {}
_unknown = '--unknown--' _unknown = '--unknown--'
_import_tail = ['*'] _import_tail = ['*']
_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
class CodecRegistryError(exceptions.LookupError, class CodecRegistryError(exceptions.LookupError,
exceptions.SystemError): exceptions.SystemError):
pass pass
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: every run of one or more
        non-alphanumeric characters, except the dot used for Python
        package names, is collapsed and replaced with a single
        underscore, e.g. ' -;#' becomes '_' and 'iso--8859 - 1'
        becomes 'iso_8859_1'.

        Letter case is left untouched. A run at the very start or
        end of the name also yields one underscore. The empty string
        is returned unchanged.

    """
    # The trailing '+' is what makes a whole run of separators map to
    # ONE underscore; splitting on single characters and joining with
    # '_' would emit one underscore per separator character instead.
    return re.sub(r'[^a-zA-Z0-9.]+', '_', encoding)
def search_function(encoding): def search_function(encoding):
# Cache lookup # Cache lookup
...@@ -51,7 +63,7 @@ def search_function(encoding): ...@@ -51,7 +63,7 @@ def search_function(encoding):
# encoding in the aliases mapping and retry the import using the # encoding in the aliases mapping and retry the import using the
# default import module lookup scheme with the alias name. # default import module lookup scheme with the alias name.
# #
modname = encoding.replace('-', '_') modname = normalize_encoding(encoding)
try: try:
mod = __import__('encodings.' + modname, mod = __import__('encodings.' + modname,
globals(), locals(), _import_tail) globals(), locals(), _import_tail)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment