Skip to content

Commit c7c6ece

Browse files
committed
Determinism
1 parent d2f916c commit c7c6ece

File tree

4 files changed

+169
-16
lines changed

4 files changed

+169
-16
lines changed

README.md

+10
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,16 @@ as keyword arguments to several ``dill`` functions:
117117
remote system which does not have the original file on disk. Options are
118118
*HANDLE_FMODE* for just the handle, *CONTENTS_FMODE* for the file content
119119
and *FILE_FMODE* for content and handle.
120+
* with *deterministic=True*, dill will try to make pickles more likely to
121+
be the same if an object is pickled multiple times. Currently, here is
122+
the feature set:
123+
* `set` and `frozenset` will be sorted before being pickled.
124+
* Subclasses of `set` and `frozenset` will not be affected (and will remain nondeterministic) because they can implement their own `__reduce__` functions which don't have to follow the conventions of `set`'s pickling procedure.
125+
* If the elements are incomparable (e.g. `complex`), they will be sorted by their hash instead. This will not create a natural order of elements that is easy to understand, but if the `__hash__` function of the class doesn't depend on `id`, it will be deterministic.
126+
* If using the faster cPickle based pickler outlined in [#485](https://github.com/uqfoundation/dill/issues/485), this feature may be disabled.
127+
* `dict` and subclasses will remain pickled in insertion order.
128+
* Entries in each function's global variable dictionary will be sorted by name. The dictionary as a whole, however, will be ordered in visitation order by function and will not be sorted in alphabetical order. This means that the globals dictionaries will be deterministic provided that the visitation order of functions is deterministic.
129+
* This feature is guaranteed.
120130
* with *ignore=False*, objects reconstructed with types defined in the
121131
top-level script environment use the existing type in the environment
122132
rather than a possibly different reconstructed type.

dill/_dill.py

+99-16
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ def get_file_type(*args, **kwargs):
273273
singletontypes = []
274274

275275
from collections import OrderedDict
276+
from itertools import islice
276277

277278
import inspect
278279

@@ -302,6 +303,8 @@ def __reduce_ex__(self, protocol):
302303
from . import _shims
303304
from ._shims import Reduce, Getattr
304305

306+
from pickle import EMPTY_SET, MARK, ADDITEMS, POP_MARK, FROZENSET, POP
307+
305308
### File modes
306309
#: Pickles the file handle, preserving mode. The position of the unpickled
307310
#: object is as for a new file handle.
@@ -323,7 +326,7 @@ def copy(obj, *args, **kwds):
323326
ignore = kwds.pop('ignore', Unpickler.settings['ignore'])
324327
return loads(dumps(obj, *args, **kwds), ignore=ignore)
325328

326-
def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None):
329+
def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None):
327330
"""
328331
Pickle an object to a file.
329332
@@ -332,11 +335,11 @@ def dump(obj, file, protocol=None, byref=None, fmode=None, recurse=None, **kwds)
332335
from .settings import settings
333336
protocol = settings['protocol'] if protocol is None else int(protocol)
334337
_kwds = kwds.copy()
335-
_kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse))
338+
_kwds.update(dict(byref=byref, fmode=fmode, recurse=recurse, deterministic=deterministic))
336339
Pickler(file, protocol, **_kwds).dump(obj)
337340
return
338341

339-
def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, strictio=None):
342+
def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, deterministic=None, **kwds):#, strictio=None):
340343
"""
341344
Pickle an object to a string.
342345
@@ -361,7 +364,7 @@ def dumps(obj, protocol=None, byref=None, fmode=None, recurse=None, **kwds):#, s
361364
Default values for keyword arguments can be set in :mod:`dill.settings`.
362365
"""
363366
file = StringIO()
364-
dump(obj, file, protocol, byref, fmode, recurse, **kwds)#, strictio)
367+
dump(obj, file, protocol, byref, fmode, recurse, deterministic, **kwds)#, strictio)
365368
return file.getvalue()
366369

367370
def load(file, ignore=None, **kwds):
@@ -563,13 +566,15 @@ def __init__(self, *args, **kwds):
563566
#_strictio = kwds.pop('strictio', None)
564567
_fmode = kwds.pop('fmode', None)
565568
_recurse = kwds.pop('recurse', None)
569+
_deterministic = kwds.pop('deterministic', None)
566570
StockPickler.__init__(self, *args, **kwds)
567571
self._main = _main_module
568572
self._diff_cache = {}
569573
self._byref = settings['byref'] if _byref is None else _byref
570574
self._strictio = False #_strictio
571575
self._fmode = settings['fmode'] if _fmode is None else _fmode
572576
self._recurse = settings['recurse'] if _recurse is None else _recurse
577+
self._deterministic = settings['deterministic'] if _deterministic is None else _deterministic
573578
from collections import OrderedDict
574579
self._postproc = OrderedDict()
575580

@@ -622,6 +627,91 @@ def save_numpy_array(pickler, obj):
622627
dump.__doc__ = StockPickler.dump.__doc__
623628
pass
624629

630+
# https://github.com/python/cpython/blob/54b5e4da8a4c6ae527ab238fcd6b9ba0a3ed0fc7/Lib/pickle.py#L1009-L1054
631+
# This code MUST be updated if Python changes their implementation.
632+
def save_set(self, obj):
    """Pickle a built-in ``set``, optionally in a canonical element order.

    When the pickler's ``_deterministic`` attribute is truthy, the elements
    are sorted before being written so that equal sets are more likely to
    produce identical pickles. If sorting raises (e.g. the members are not
    mutually comparable), a ``PicklingWarning`` is issued and the elements
    are ordered by ``hash`` instead.

    For protocols below 4 the set is saved through ``save_reduce`` as
    ``set(list_of_members)``. Otherwise ``EMPTY_SET`` is written, the object
    is memoized, and the members follow in ``MARK ... ADDITEMS`` batches of
    at most ``self._BATCHSIZE`` items.
    """
    # Determinism code: choose the iteration order of the members. The
    # result is not truly deterministic, but it is more stable than would
    # otherwise be possible without sorting. Some objects use the memory
    # location as the hash, which will result in nondeterministic ordering
    # regardless. NOTE: ``sorted`` is stable, so members whose hashes are
    # equal keep their original iteration order in the fallback.
    if getattr(self, '_deterministic', False):
        try:
            members = sorted(obj)
        except Exception as e:
            w = PicklingWarning("Cannot canonize a set with incomparable members")
            w.__cause__ = e
            warnings.warn(w)
            members = sorted(obj, key=hash)
    else:
        # No copy is needed here: the set itself is iterated below.
        members = obj
    # End determinism code

    save = self.save
    write = self.write

    if self.proto < 4:
        # Older protocols have no set opcodes; a list is only materialized
        # on this path, and only if sorting has not already produced one.
        as_list = members if isinstance(members, list) else list(members)
        self.save_reduce(set, (as_list,), obj=obj)
        return

    write(EMPTY_SET)
    self.memoize(obj)

    it = iter(members)
    while True:
        batch = list(islice(it, self._BATCHSIZE))
        n = len(batch)
        if n > 0:
            write(MARK)
            for item in batch:
                save(item)
            write(ADDITEMS)
        # A short (or empty) batch means the iterator is exhausted.
        if n < self._BATCHSIZE:
            return
675+
dispatch[set] = save_set
676+
677+
def save_frozenset(self, obj):
    """Pickle a built-in ``frozenset``, optionally in a canonical order.

    When the pickler's ``_deterministic`` attribute is truthy, the elements
    are sorted before being written; if sorting raises, a
    ``PicklingWarning`` is issued and the elements are ordered by ``hash``
    instead (members with equal hashes keep their iteration order, because
    ``sorted`` is stable).

    For protocols below 4 the object is saved through ``save_reduce`` as
    ``frozenset(list_of_members)``. Otherwise the members are written after
    a ``MARK`` and collected with ``FROZENSET``, after which the object is
    memoized.
    """
    # Determinism code: choose the iteration order of the members.
    if getattr(self, '_deterministic', False):
        try:
            members = sorted(obj)
        except Exception as e:
            w = PicklingWarning("Cannot canonize a frozenset with incomparable members")
            w.__cause__ = e
            warnings.warn(w)
            members = sorted(obj, key=hash)
    else:
        # No copy is needed here: the frozenset itself is iterated below.
        members = obj
    # End determinism code

    save = self.save
    write = self.write

    if self.proto < 4:
        # Older protocols have no frozenset opcode; a list is only
        # materialized on this path, and only if sorting has not already
        # produced one.
        as_list = members if isinstance(members, list) else list(members)
        self.save_reduce(frozenset, (as_list,), obj=obj)
        return

    write(MARK)
    for item in members:
        save(item)

    if id(obj) in self.memo:
        # If the object is already in the memo, this means it is
        # recursive. In this case, throw away everything we put on the
        # stack, and fetch the object back from the memo.
        write(POP_MARK + self.get(self.memo[id(obj)][0]))
        return

    write(FROZENSET)
    self.memoize(obj)
713+
dispatch[frozenset] = save_frozenset
714+
625715
class Unpickler(StockUnpickler):
626716
"""python's Unpickler extended to interpreter sessions and more types"""
627717
from .settings import settings
@@ -1158,10 +1248,7 @@ def _save_with_postproc(pickler, reduction, is_pickler_dill=None, obj=Getattr.NO
11581248
else:
11591249
pickler.save_reduce(*reduction)
11601250
# pop None created by calling preprocessing step off stack
1161-
if PY3:
1162-
pickler.write(bytes('0', 'UTF-8'))
1163-
else:
1164-
pickler.write('0')
1251+
pickler.write(POP)
11651252

11661253
#@register(CodeType)
11671254
#def save_code(pickler, obj):
@@ -1582,10 +1669,7 @@ def save_cell(pickler, obj):
15821669
# The result of this function call will be None
15831670
pickler.save_reduce(_shims._delattr, (obj, 'cell_contents'))
15841671
# pop None created by calling _delattr off stack
1585-
if PY3:
1586-
pickler.write(bytes('0', 'UTF-8'))
1587-
else:
1588-
pickler.write('0')
1672+
pickler.write(POP)
15891673
log.info("# Ce3")
15901674
return
15911675
if is_dill(pickler, child=True):
@@ -1930,6 +2014,8 @@ def save_function(pickler, obj):
19302014
# In the case that the globals are copied, we need to ensure that
19312015
# the globals dictionary is updated when all objects in the
19322016
# dictionary are already created.
2017+
if getattr(pickler, '_deterministic', False):
2018+
globs_copy = dict(sorted(globs_copy.items()))
19332019
if PY3:
19342020
glob_ids = {id(g) for g in globs_copy.values()}
19352021
else:
@@ -1992,10 +2078,7 @@ def save_function(pickler, obj):
19922078
# Change the value of the cell
19932079
pickler.save_reduce(*possible_postproc)
19942080
# pop None created by calling preprocessing step off stack
1995-
if PY3:
1996-
pickler.write(bytes('0', 'UTF-8'))
1997-
else:
1998-
pickler.write('0')
2081+
pickler.write(POP)
19992082

20002083
log.info("# F1")
20012084
else:

dill/settings.py

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
'fmode' : 0, #HANDLE_FMODE
2323
'recurse' : False,
2424
'ignore' : False,
25+
'deterministic' : False,
2526
}
2627

2728
del DEFAULT_PROTOCOL

tests/test_deterministic.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import collections
2+
import dill
3+
import warnings
4+
5+
b = 5
6+
a = 0
7+
c = 7
8+
9+
def test_determinism():
    """Check that ``deterministic=True`` yields stable pickles.

    Two things are exercised: a function pickled with ``recurse=True`` whose
    globals dictionary is supplied in different insertion orders, and two
    equal sets of incomparable (complex) members whose iteration order
    differs.
    """
    def f():
        global a, b, c
        return a + b + c

    # Same contents, different insertion order: plain pickles must differ.
    d1 = {'a': 0, 'c': 7, 'b': 5, '__name__': __name__, '__builtins__': __builtins__}
    d2 = {'a': 0, 'b': 5, 'c': 7, '__name__': __name__, '__builtins__': __builtins__}
    assert dill.dumps(d1) != dill.dumps(d2)

    F1 = dill.dumps(f, recurse=True)
    F1D = dill.dumps(f, recurse=True, deterministic=True)

    # Rebuild f on top of d1, keeping its qualified name and module.
    qual = f.__qualname__
    f = dill._dill.FunctionType(f.__code__, d1, f.__name__, f.__defaults__, f.__closure__)
    f.__qualname__ = qual
    f.__module__ = '__main__'

    assert f.__globals__ is d1

    F2 = dill.dumps(f, recurse=True)
    F2D = dill.dumps(f, recurse=True, deterministic=True)

    # Rebuild f once more on top of d2.
    f = dill._dill.FunctionType(f.__code__, d2, f.__name__, f.__defaults__, f.__closure__)
    f.__qualname__ = qual
    f.__module__ = '__main__'

    assert f.__globals__ is d2

    F3 = dill.dumps(f, recurse=True)
    F3D = dill.dumps(f, recurse=True, deterministic=True)

    # TODO: actually create a test to verify that the globals are sorted. The
    # globalvars function gets the globals dictionary from the module, not the
    # function itself, so they will all have the same global namespace.
    # assert F2 != F3
    # assert F1 != F1D
    assert F1D == F2D
    assert F2D == F3D

    # Distinct local names are used here so the sets do not shadow the
    # module-level ``a`` and ``b`` that ``f`` reads through ``global``.
    set_one = {2-1j, 2+1j, 1+2j, 1-2j}
    set_two = set_one.copy()
    set_two.add(-2)
    set_two.remove(-2)
    # Precondition: the two equal sets iterate in different orders.
    assert list(set_one) != list(set_two)
    # Complex members cannot be ordered, so a pickling warning is expected
    # here; silence it locally so the test also passes when warnings are
    # treated as errors.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", dill.PickleWarning)
        assert dill.dumps(set_one, deterministic=True) == dill.dumps(set_two, deterministic=True)
54+
55+
if __name__ == '__main__':
56+
if dill._dill.PY3:
57+
with warnings.catch_warnings():
58+
warnings.simplefilter("ignore", dill.PickleWarning)
59+
test_determinism()

0 commit comments

Comments
 (0)