Commit 78e2f06c authored by Martin v. Löwis

Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.

parent 53d93adc
...@@ -16,7 +16,10 @@ from sre_constants import * ...@@ -16,7 +16,10 @@ from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch" assert _sre.MAGIC == MAGIC, "SRE module mismatch"
MAXCODE = 65535 if _sre.CODESIZE == 2:
MAXCODE = 65535
else:
MAXCODE = 0xFFFFFFFFL
def _compile(code, pattern, flags): def _compile(code, pattern, flags):
# internal: compile a (sub)pattern # internal: compile a (sub)pattern
...@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup): ...@@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
# XXX: could append to charmap tail # XXX: could append to charmap tail
return charset # cannot compress return charset # cannot compress
except IndexError: except IndexError:
if sys.maxunicode != 65535:
# XXX: big charsets don't work in UCS-4 builds
return charset
# character set contains unicode characters # character set contains unicode characters
return _optimize_unicode(charset, fixup) return _optimize_unicode(charset, fixup)
# compress character map # compress character map
...@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup): ...@@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
def _mk_bitmap(bits): def _mk_bitmap(bits):
data = [] data = []
m = 1; v = 0 if _sre.CODESIZE == 2:
start = (1, 0)
else:
start = (1L, 0L)
m, v = start
for c in bits: for c in bits:
if c: if c:
v = v + m v = v + m
m = m << 1 m = m << 1
if m > MAXCODE: if m > MAXCODE:
data.append(v) data.append(v)
m = 1; v = 0 m, v = start
return data return data
# To represent a big charset, first a bitmap of all characters in the # To represent a big charset, first a bitmap of all characters in the
...@@ -258,21 +262,38 @@ def _mk_bitmap(bits): ...@@ -258,21 +262,38 @@ def _mk_bitmap(bits):
# less significant byte is a bit index in the chunk (just like the # less significant byte is a bit index in the chunk (just like the
# CHARSET matching). # CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
def _optimize_unicode(charset, fixup): def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536 charmap = [0]*65536
negate = 0 negate = 0
for op, av in charset: try:
if op is NEGATE: for op, av in charset:
negate = 1 if op is NEGATE:
elif op is LITERAL: negate = 1
charmap[fixup(av)] = 1 elif op is LITERAL:
elif op is RANGE: charmap[fixup(av)] = 1
for i in range(fixup(av[0]), fixup(av[1])+1): elif op is RANGE:
charmap[i] = 1 for i in range(fixup(av[0]), fixup(av[1])+1):
elif op is CATEGORY: charmap[i] = 1
# XXX: could expand category elif op is CATEGORY:
return charset # cannot compress # XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters
return charset
if negate: if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
return charset
for i in range(65536): for i in range(65536):
charmap[i] = not charmap[i] charmap[i] = not charmap[i]
comps = {} comps = {}
...@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup): ...@@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
block = block + 1 block = block + 1
data = data + _mk_bitmap(chunk) data = data + _mk_bitmap(chunk)
header = [block] header = [block]
assert MAXCODE == 65535 if MAXCODE == 65535:
for i in range(128): code = 'H'
if sys.byteorder == 'big': else:
header.append(256*mapping[2*i]+mapping[2*i+1]) code = 'L'
else: # Convert block indices to byte array of 256 bytes
header.append(mapping[2*i]+256*mapping[2*i+1]) mapping = array.array('b', mapping).tostring()
# Convert byte array to word array
header = header + array.array(code, mapping).tolist()
data[0:0] = header data[0:0] = header
return [(BIGCHARSET, data)] return [(BIGCHARSET, data)]
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# update when constants are added or removed # update when constants are added or removed
MAGIC = 20010701 MAGIC = 20030419
# max code word in this release # max code word in this release
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
* 2001-10-24 fl added finditer primitive (for 2.2 only) * 2001-10-24 fl added finditer primitive (for 2.2 only)
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
* 2002-11-09 fl fixed empty sub/subn return type * 2002-11-09 fl fixed empty sub/subn return type
* 2003-04-18 mvl fully support 4-byte codes
* *
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
* *
...@@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -510,10 +511,18 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
break; break;
case SRE_OP_CHARSET: case SRE_OP_CHARSET:
/* <CHARSET> <bitmap> (16 bits per code word) */ if (sizeof(SRE_CODE) == 2) {
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15)))) /* <CHARSET> <bitmap> (16 bits per code word) */
return ok; if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
set += 16; return ok;
set += 16;
}
else {
/* <CHARSET> <bitmap> (32 bits per code word) */
if (ch < 256 && (set[ch >> 5] & (1 << (ch & 31))))
return ok;
set += 8;
}
break; break;
case SRE_OP_BIGCHARSET: case SRE_OP_BIGCHARSET:
...@@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -521,11 +530,25 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
{ {
int count, block; int count, block;
count = *(set++); count = *(set++);
block = ((unsigned char*)set)[ch >> 8];
set += 128; if (sizeof(SRE_CODE) == 2) {
if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15))) block = ((unsigned char*)set)[ch >> 8];
return ok; set += 128;
set += count*16; if (set[block*16 + ((ch & 255)>>4)] & (1 << (ch & 15)))
return ok;
set += count*16;
}
else {
if (ch < 65536)
block = ((unsigned char*)set)[ch >> 8];
else
block = -1;
set += 64;
if (block >=0 &&
(set[block*8 + ((ch & 255)>>5)] & (1 << (ch & 31))))
return ok;
set += count*8;
}
break; break;
} }
...@@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args) ...@@ -1371,7 +1394,10 @@ _compile(PyObject* self_, PyObject* args)
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
PyObject *o = PyList_GET_ITEM(code, i); PyObject *o = PyList_GET_ITEM(code, i);
self->code[i] = (SRE_CODE) PyInt_AsLong(o); if (PyInt_Check(o))
self->code[i] = (SRE_CODE) PyInt_AsLong(o);
else
self->code[i] = (SRE_CODE) PyLong_AsUnsignedLong(o);
} }
if (PyErr_Occurred()) { if (PyErr_Occurred()) {
...@@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void) ...@@ -3045,6 +3071,12 @@ PyMODINIT_FUNC init_sre(void)
Py_DECREF(x); Py_DECREF(x);
} }
x = PyInt_FromLong(sizeof(SRE_CODE));
if (x) {
PyDict_SetItemString(d, "CODESIZE", x);
Py_DECREF(x);
}
x = PyString_FromString(copyright); x = PyString_FromString(copyright);
if (x) { if (x) {
PyDict_SetItemString(d, "copyright", x); PyDict_SetItemString(d, "copyright", x);
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution. * See the _sre.c file for information on usage and redistribution.
*/ */
#define SRE_MAGIC 20010701 #define SRE_MAGIC 20030419
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment