Commit a897aeee authored by Serhiy Storchaka, committed by GitHub

bpo-32072: Fix issues with binary plists. (#4455)

* Fixed saving bytearrays.
* Identical objects will be saved only once.
* Equal references will be loaded as identical objects.
* Added support for saving and loading recursive data structures.
parent b4d1e1f7
...@@ -525,6 +525,8 @@ class InvalidFileException (ValueError): ...@@ -525,6 +525,8 @@ class InvalidFileException (ValueError):
_BINARY_FORMAT = {1: 'B', 2: 'H', 4: 'L', 8: 'Q'} _BINARY_FORMAT = {1: 'B', 2: 'H', 4: 'L', 8: 'Q'}
_undefined = object()
class _BinaryPlistParser: class _BinaryPlistParser:
""" """
Read or write a binary plist file, following the description of the binary Read or write a binary plist file, following the description of the binary
...@@ -555,7 +557,8 @@ class _BinaryPlistParser: ...@@ -555,7 +557,8 @@ class _BinaryPlistParser:
) = struct.unpack('>6xBBQQQ', trailer) ) = struct.unpack('>6xBBQQQ', trailer)
self._fp.seek(offset_table_offset) self._fp.seek(offset_table_offset)
self._object_offsets = self._read_ints(num_objects, offset_size) self._object_offsets = self._read_ints(num_objects, offset_size)
return self._read_object(self._object_offsets[top_object]) self._objects = [_undefined] * num_objects
return self._read_object(top_object)
except (OSError, IndexError, struct.error, OverflowError, except (OSError, IndexError, struct.error, OverflowError,
UnicodeDecodeError): UnicodeDecodeError):
...@@ -584,62 +587,68 @@ class _BinaryPlistParser: ...@@ -584,62 +587,68 @@ class _BinaryPlistParser:
def _read_refs(self, n): def _read_refs(self, n):
return self._read_ints(n, self._ref_size) return self._read_ints(n, self._ref_size)
def _read_object(self, offset): def _read_object(self, ref):
""" """
read the object at offset. read the object by reference.
May recursively read sub-objects (content of an array/dict/set) May recursively read sub-objects (content of an array/dict/set)
""" """
result = self._objects[ref]
if result is not _undefined:
return result
offset = self._object_offsets[ref]
self._fp.seek(offset) self._fp.seek(offset)
token = self._fp.read(1)[0] token = self._fp.read(1)[0]
tokenH, tokenL = token & 0xF0, token & 0x0F tokenH, tokenL = token & 0xF0, token & 0x0F
if token == 0x00: if token == 0x00:
return None result = None
elif token == 0x08: elif token == 0x08:
return False result = False
elif token == 0x09: elif token == 0x09:
return True result = True
# The referenced source code also mentions URL (0x0c, 0x0d) and # The referenced source code also mentions URL (0x0c, 0x0d) and
# UUID (0x0e), but neither can be generated using the Cocoa libraries. # UUID (0x0e), but neither can be generated using the Cocoa libraries.
elif token == 0x0f: elif token == 0x0f:
return b'' result = b''
elif tokenH == 0x10: # int elif tokenH == 0x10: # int
return int.from_bytes(self._fp.read(1 << tokenL), result = int.from_bytes(self._fp.read(1 << tokenL),
'big', signed=tokenL >= 3) 'big', signed=tokenL >= 3)
elif token == 0x22: # real elif token == 0x22: # real
return struct.unpack('>f', self._fp.read(4))[0] result = struct.unpack('>f', self._fp.read(4))[0]
elif token == 0x23: # real elif token == 0x23: # real
return struct.unpack('>d', self._fp.read(8))[0] result = struct.unpack('>d', self._fp.read(8))[0]
elif token == 0x33: # date elif token == 0x33: # date
f = struct.unpack('>d', self._fp.read(8))[0] f = struct.unpack('>d', self._fp.read(8))[0]
# timestamp 0 of binary plists corresponds to 1/1/2001 # timestamp 0 of binary plists corresponds to 1/1/2001
# (year of Mac OS X 10.0), instead of 1/1/1970. # (year of Mac OS X 10.0), instead of 1/1/1970.
return datetime.datetime(2001, 1, 1) + datetime.timedelta(seconds=f) result = (datetime.datetime(2001, 1, 1) +
datetime.timedelta(seconds=f))
elif tokenH == 0x40: # data elif tokenH == 0x40: # data
s = self._get_size(tokenL) s = self._get_size(tokenL)
if self._use_builtin_types: if self._use_builtin_types:
return self._fp.read(s) result = self._fp.read(s)
else: else:
return Data(self._fp.read(s)) result = Data(self._fp.read(s))
elif tokenH == 0x50: # ascii string elif tokenH == 0x50: # ascii string
s = self._get_size(tokenL) s = self._get_size(tokenL)
result = self._fp.read(s).decode('ascii') result = self._fp.read(s).decode('ascii')
return result result = result
elif tokenH == 0x60: # unicode string elif tokenH == 0x60: # unicode string
s = self._get_size(tokenL) s = self._get_size(tokenL)
return self._fp.read(s * 2).decode('utf-16be') result = self._fp.read(s * 2).decode('utf-16be')
# tokenH == 0x80 is documented as 'UID' and appears to be used for # tokenH == 0x80 is documented as 'UID' and appears to be used for
# keyed-archiving, not in plists. # keyed-archiving, not in plists.
...@@ -647,8 +656,9 @@ class _BinaryPlistParser: ...@@ -647,8 +656,9 @@ class _BinaryPlistParser:
elif tokenH == 0xA0: # array elif tokenH == 0xA0: # array
s = self._get_size(tokenL) s = self._get_size(tokenL)
obj_refs = self._read_refs(s) obj_refs = self._read_refs(s)
return [self._read_object(self._object_offsets[x]) result = []
for x in obj_refs] self._objects[ref] = result
result.extend(self._read_object(x) for x in obj_refs)
# tokenH == 0xB0 is documented as 'ordset', but is not actually # tokenH == 0xB0 is documented as 'ordset', but is not actually
# implemented in the Apple reference code. # implemented in the Apple reference code.
...@@ -661,13 +671,16 @@ class _BinaryPlistParser: ...@@ -661,13 +671,16 @@ class _BinaryPlistParser:
key_refs = self._read_refs(s) key_refs = self._read_refs(s)
obj_refs = self._read_refs(s) obj_refs = self._read_refs(s)
result = self._dict_type() result = self._dict_type()
self._objects[ref] = result
for k, o in zip(key_refs, obj_refs): for k, o in zip(key_refs, obj_refs):
result[self._read_object(self._object_offsets[k]) result[self._read_object(k)] = self._read_object(o)
] = self._read_object(self._object_offsets[o])
return result
else:
raise InvalidFileException() raise InvalidFileException()
self._objects[ref] = result
return result
def _count_to_size(count): def _count_to_size(count):
if count < 1 << 8: if count < 1 << 8:
return 1 return 1
...@@ -681,6 +694,8 @@ def _count_to_size(count): ...@@ -681,6 +694,8 @@ def _count_to_size(count):
else: else:
return 8 return 8
_scalars = (str, int, float, datetime.datetime, bytes)
class _BinaryPlistWriter (object): class _BinaryPlistWriter (object):
def __init__(self, fp, sort_keys, skipkeys): def __init__(self, fp, sort_keys, skipkeys):
self._fp = fp self._fp = fp
...@@ -736,8 +751,7 @@ class _BinaryPlistWriter (object): ...@@ -736,8 +751,7 @@ class _BinaryPlistWriter (object):
# First check if the object is in the object table, not used for # First check if the object is in the object table, not used for
# containers to ensure that two subcontainers with the same contents # containers to ensure that two subcontainers with the same contents
# will be serialized as distinct values. # will be serialized as distinct values.
if isinstance(value, ( if isinstance(value, _scalars):
str, int, float, datetime.datetime, bytes, bytearray)):
if (type(value), value) in self._objtable: if (type(value), value) in self._objtable:
return return
...@@ -745,15 +759,17 @@ class _BinaryPlistWriter (object): ...@@ -745,15 +759,17 @@ class _BinaryPlistWriter (object):
if (type(value.data), value.data) in self._objtable: if (type(value.data), value.data) in self._objtable:
return return
elif id(value) in self._objidtable:
return
# Add to objectreference map # Add to objectreference map
refnum = len(self._objlist) refnum = len(self._objlist)
self._objlist.append(value) self._objlist.append(value)
try: if isinstance(value, _scalars):
if isinstance(value, Data): self._objtable[(type(value), value)] = refnum
elif isinstance(value, Data):
self._objtable[(type(value.data), value.data)] = refnum self._objtable[(type(value.data), value.data)] = refnum
else: else:
self._objtable[(type(value), value)] = refnum
except TypeError:
self._objidtable[id(value)] = refnum self._objidtable[id(value)] = refnum
# And finally recurse into containers # And finally recurse into containers
...@@ -780,12 +796,11 @@ class _BinaryPlistWriter (object): ...@@ -780,12 +796,11 @@ class _BinaryPlistWriter (object):
self._flatten(o) self._flatten(o)
def _getrefnum(self, value): def _getrefnum(self, value):
try: if isinstance(value, _scalars):
if isinstance(value, Data): return self._objtable[(type(value), value)]
elif isinstance(value, Data):
return self._objtable[(type(value.data), value.data)] return self._objtable[(type(value.data), value.data)]
else: else:
return self._objtable[(type(value), value)]
except TypeError:
return self._objidtable[id(value)] return self._objidtable[id(value)]
def _write_size(self, token, size): def _write_size(self, token, size):
......
...@@ -169,6 +169,17 @@ class TestPlistlib(unittest.TestCase): ...@@ -169,6 +169,17 @@ class TestPlistlib(unittest.TestCase):
self.assertRaises(OverflowError, plistlib.dumps, self.assertRaises(OverflowError, plistlib.dumps,
pl, fmt=fmt) pl, fmt=fmt)
def test_bytearray(self):
# A bytearray must be dumped the same way as the equal bytes object, for
# each format in ALL_FORMATS, and must load back as bytes.
for pl in (b'<binary gunk>', b"<lots of binary gunk>\0\1\2\3" * 10):
for fmt in ALL_FORMATS:
with self.subTest(pl=pl, fmt=fmt):
data = plistlib.dumps(bytearray(pl), fmt=fmt)
pl2 = plistlib.loads(data)
self.assertIsInstance(pl2, bytes)
self.assertEqual(pl2, pl)
# Re-dumping the loaded bytes must reproduce the output produced
# from the original bytearray.
data2 = plistlib.dumps(pl2, fmt=fmt)
self.assertEqual(data, data2)
def test_bytes(self): def test_bytes(self):
pl = self._create() pl = self._create()
data = plistlib.dumps(pl) data = plistlib.dumps(pl)
...@@ -431,6 +442,9 @@ class TestPlistlib(unittest.TestCase): ...@@ -431,6 +442,9 @@ class TestPlistlib(unittest.TestCase):
pl2 = plistlib.loads(data) pl2 = plistlib.loads(data)
self.assertEqual(dict(pl), dict(pl2)) self.assertEqual(dict(pl), dict(pl2))
class TestBinaryPlistlib(unittest.TestCase):
def test_nonstandard_refs_size(self): def test_nonstandard_refs_size(self):
# Issue #21538: Refs and offsets are 24-bit integers # Issue #21538: Refs and offsets are 24-bit integers
data = (b'bplist00' data = (b'bplist00'
...@@ -443,6 +457,47 @@ class TestPlistlib(unittest.TestCase): ...@@ -443,6 +457,47 @@ class TestPlistlib(unittest.TestCase):
b'\x00\x00\x00\x00\x00\x00\x00\x13') b'\x00\x00\x00\x00\x00\x00\x00\x13')
self.assertEqual(plistlib.loads(data), {'a': 'b'}) self.assertEqual(plistlib.loads(data), {'a': 'b'})
def test_dump_duplicates(self):
# Test effectiveness of saving duplicated objects
# Covers singletons, scalars, Data/bytearray wrappers and containers.
for x in (None, False, True, 12345, 123.45, 'abcde', b'abcde',
datetime.datetime(2004, 10, 26, 10, 33, 33),
plistlib.Data(b'abcde'), bytearray(b'abcde'),
[12, 345], (12, 345), {'12': 345}):
with self.subTest(x=x):
data = plistlib.dumps([x]*1000, fmt=plistlib.FMT_BINARY)
# 1000 references to one object must stay under 1100 bytes in total,
# which only holds if the object itself is stored once.
self.assertLess(len(data), 1100, repr(data))
def test_identity(self):
# Two references to the same object in a binary plist must load back as
# one shared object, not as two equal copies.
for x in (None, False, True, 12345, 123.45, 'abcde', b'abcde',
datetime.datetime(2004, 10, 26, 10, 33, 33),
plistlib.Data(b'abcde'), bytearray(b'abcde'),
[12, 345], (12, 345), {'12': 345}):
with self.subTest(x=x):
data = plistlib.dumps([x]*2, fmt=plistlib.FMT_BINARY)
a, b = plistlib.loads(data)
# The loaded value is compared against the list form of a tuple input.
if isinstance(x, tuple):
x = list(x)
self.assertEqual(a, x)
self.assertEqual(b, x)
self.assertIs(a, b)
def test_cycles(self):
# Self-referential containers must round-trip through the binary format
# with the cycle preserved (checked by identity, not equality).
# recursive list
a = []
a.append(a)
b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
self.assertIs(b[0], b)
# recursive tuple
# (the cycle goes through the list held inside the tuple)
a = ([],)
a[0].append(a)
b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
self.assertIs(b[0][0], b)
# recursive dict
a = {}
a['x'] = a
b = plistlib.loads(plistlib.dumps(a, fmt=plistlib.FMT_BINARY))
self.assertIs(b['x'], b)
def test_large_timestamp(self): def test_large_timestamp(self):
# Issue #26709: 32-bit timestamp out of range # Issue #26709: 32-bit timestamp out of range
for ts in -2**31-1, 2**31: for ts in -2**31-1, 2**31:
......
Fixed issues with binary plists:
* Fixed saving bytearrays.
* Identical objects will be saved only once.
* Equal references will be loaded as identical objects.
* Added support for saving and loading recursive data structures.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment