-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdefined_serializers.py
170 lines (132 loc) · 4.49 KB
/
defined_serializers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import json
import pickle
import sqlite3
import hashlib
import datetime
try:
import numpy as np
except ImportError:
np = None
class DefinedTypes:
    """Canonical type names a record schema may declare for its values.

    The string values double as the wire names used in schema dicts, so
    they must stay stable.
    """

    number = "number"
    string = "string"
    boolean = "boolean"
    datetime = "datetime"
    compressed_string = "compressed_string"
    blob = "blob"
    other = "other"
    json = "json"
    normalized_embedding = "normalized_embedding"


# Maps each schema type to the SQLite column type used in CREATE TABLE.
schema_property_to_column_type = {
    DefinedTypes.boolean: "INTEGER",
    DefinedTypes.string: "TEXT",
    DefinedTypes.number: "NUMBER",
    # datetimes are stored as POSIX timestamps (see serialize_record)
    DefinedTypes.datetime: "NUMBER",
    DefinedTypes.compressed_string: "BLOB",
    DefinedTypes.blob: "BLOB",
    DefinedTypes.other: "BLOB",
    # Was "JSON": SQLite does not recognize that type name, so the column
    # got NUMERIC affinity, which coerces json.dumps output like "5" or
    # "1.5" into numbers and makes json.loads fail on read. JSON documents
    # are text — store them with TEXT affinity.
    DefinedTypes.json: "TEXT",
    DefinedTypes.normalized_embedding: "BLOB",
}
def hash_bytes(data):
    """Return the hex-encoded SHA-256 digest of *data* (a bytes-like object)."""
    return hashlib.sha256(data).hexdigest()
def serialize_record(schema, record, compressor, _id=None, _updated_at=None):
    """Convert a record dict into SQLite-storable column values.

    Only keys present in both ``schema`` and ``record`` are serialized.
    ``compressor`` is either ``False`` (no compression) or an object with a
    ``compress(bytes) -> bytes`` method. When ``_id`` is given, ``id`` and
    ``updated_at`` columns are included in the output.

    For ``blob`` and ``other`` types, companion ``__size_<key>`` and
    ``__hash_<key>`` columns record the uncompressed payload's length and
    SHA-256 hex digest. Raises ``ValueError`` for a ``normalized_embedding``
    value that is not a 1-D float32 array.
    """
    out = {} if _id is None else {"id": _id, "updated_at": _updated_at}
    for key, kind in schema.items():
        if key not in record:
            continue
        value = record[key]
        if kind == "boolean":
            # store as 0/1 so the INTEGER column stays comparable
            out[key] = int(value) if value is not None else None
        elif kind == "string" or kind == "number":
            out[key] = value
        elif kind == "datetime":
            # persisted as a POSIX timestamp (float)
            out[key] = value.timestamp() if value is not None else None
        elif kind == "compressed_string":
            if value is None:
                out[key] = None
            else:
                encoded = value.encode()
                out[key] = compressor.compress(encoded) if compressor is not False else encoded
        elif kind == "blob":
            # size/hash are taken over the raw (uncompressed) payload
            out[f"__size_{key}"] = len(value) if value is not None else None
            out[f"__hash_{key}"] = hash_bytes(value) if value is not None else None
            if value is None:
                out[key] = None
            else:
                out[key] = compressor.compress(value) if compressor is not False else value
        elif kind == "other":
            # arbitrary Python objects are pickled first; size/hash cover
            # the pickled (still uncompressed) bytes
            payload = None if value is None else pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL)
            out[f"__size_{key}"] = len(payload) if payload is not None else None
            out[f"__hash_{key}"] = hash_bytes(payload) if payload is not None else None
            if payload is None:
                out[key] = None
            else:
                out[key] = compressor.compress(payload) if compressor is not False else payload
        elif kind == "json":
            out[key] = json.dumps(value) if value is not None else None
        elif kind == "normalized_embedding":
            if value is not None:
                try:
                    if value.ndim == 1 and value.dtype == np.float32:
                        value = value.tobytes()
                    else:
                        raise ValueError("Invalid embedding")
                except Exception:
                    raise ValueError("Invalid embedding")
            out[key] = value
    return out
def deserialize_record(schema, record, decompressor):
    """Invert ``serialize_record``: turn stored column values back into Python objects.

    Every key in ``record`` must be present in ``schema`` (a missing key
    raises ``KeyError``). ``decompressor`` is either ``False`` or an object
    with a ``decompress(bytes) -> bytes`` method, and must match the
    compressor used when the record was written. ``None`` column values map
    back to ``None`` for every type.
    """
    result = {}
    for key, raw in record.items():
        kind = schema[key]
        if kind == "boolean":
            result[key] = bool(raw) if raw is not None else None
        elif kind == "string" or kind == "number":
            result[key] = raw
        elif kind == "datetime":
            # stored as a POSIX timestamp
            result[key] = datetime.datetime.fromtimestamp(raw) if raw is not None else None
        elif kind == "compressed_string":
            if raw is None:
                result[key] = None
            elif decompressor is not False:
                result[key] = decompressor.decompress(raw).decode()
            else:
                result[key] = raw.decode()
        elif kind == "blob":
            if raw is None:
                result[key] = None
            elif decompressor is not False:
                result[key] = decompressor.decompress(raw)
            else:
                result[key] = raw
        elif kind == "other":
            if raw is None:
                result[key] = None
            else:
                payload = decompressor.decompress(raw) if decompressor is not False else raw
                # NOTE: pickle.loads is only safe on trusted, self-written data
                result[key] = pickle.loads(payload)
        elif kind == "json":
            result[key] = json.loads(raw) if raw is not None else None
        elif kind == "normalized_embedding":
            result[key] = np.frombuffer(raw, dtype=np.float32) if raw is not None else None
    return result