Skip to content

Commit 9953adb

Browse files
Filter nested properties based on metadata (#130)
* Support filtering of nested fields: update the filter_data_by_metadata function to allow filtering of nested fields, e.g. if property `address` has `selected` set to True but property `address.street` has `selected` set to False, only the street is excluded. Data is processed recursively. * Update transform.py: make formatting a little clearer * Update transform.py: fix array type breadcrumb name * Update transform.py: breadcrumb path documentation * Update transform.py: change based on tests - the field must be removed from the data object, not just have its value set to None. * Update transform.py: line length :) * Add tests for filtering nested fields * Make pylint happy * Simplify one line Co-authored-by: Chris Goddard <[email protected]>
1 parent 6c6c773 commit 9953adb

File tree

3 files changed

+68
-13
lines changed

3 files changed

+68
-13
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@ install: check_prereqs
88
python3 -m pip install -e '.[dev]'
99

1010
test: install
11-
pylint singer -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
11+
pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
1212
nosetests --with-doctest -v

singer/transform.py

+24-12
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,16 @@ def unix_seconds_to_datetime(value):
3636
return strftime(datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc))
3737

3838

39+
def breadcrumb_path(breadcrumb):
    """
    Transform breadcrumb into familiar object dot-notation

    A breadcrumb is a sequence of strings in which the keyword
    ``'properties'`` is followed by a field name and the keyword ``'items'``
    marks the elements of the preceding array field, e.g.::

        ('properties', 'addrs', 'items', 'properties', 'amount')
            -> 'addrs[].amount'

    The breadcrumb is walked token by token instead of doing substring
    replacement on the joined string, so field names that merely contain or
    equal a keyword (``'my_properties'``, a field called ``'items'``) are
    rendered unchanged.

    :param breadcrumb: iterable of strings describing the path to a field.
    :return: the dot-notation path as a string ('' for an empty breadcrumb).
    """
    parts = []
    tokens = iter(breadcrumb)
    for token in tokens:
        if token == 'properties':
            # The token after the keyword is the field name, whatever it is
            # called. A dangling trailing keyword is kept literally.
            parts.append(next(tokens, token))
        elif token == 'items' and parts:
            # Array elements are shown as a suffix on the owning field.
            parts[-1] += '[]'
        else:
            # Unknown token, or 'items' with no owning field (root array):
            # keep it verbatim, as the joined string always did.
            parts.append(token)
    return ".".join(parts)
47+
48+
3949
class SchemaMismatch(Exception):
4050
def __init__(self, errors):
4151
if not errors:
@@ -46,7 +56,7 @@ def __init__(self, errors):
4656
msg = "Errors during transform\n\t{}".format("\n\t".join(estrs))
4757
msg += "\n\n\nErrors during transform: [{}]".format(", ".join(estrs))
4858

49-
super(SchemaMismatch, self).__init__(msg)
59+
super().__init__(msg)
5060

5161
class SchemaKey:
5262
ref = "$ref"
@@ -110,25 +120,27 @@ def __enter__(self):
110120
def __exit__(self, *args):
111121
self.log_warning()
112122

113-
def filter_data_by_metadata(self, data, metadata):
123+
def filter_data_by_metadata(self, data, metadata, parent=()):
114124
if isinstance(data, dict) and metadata:
115125
for field_name in list(data.keys()):
116-
selected = singer.metadata.get(metadata, ('properties', field_name), 'selected')
117-
inclusion = singer.metadata.get(metadata, ('properties', field_name), 'inclusion')
126+
breadcrumb = parent + ('properties', field_name)
127+
selected = singer.metadata.get(metadata, breadcrumb, 'selected')
128+
inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion')
118129
if inclusion == 'automatic':
119130
continue
120131

121-
if selected is False:
132+
if (selected is False) or (inclusion == 'unsupported'):
122133
data.pop(field_name, None)
123134
# Track that a field was filtered because the customer
124-
# didn't select it.
125-
self.filtered.add(field_name)
135+
# didn't select it or the tap declared it as unsupported.
136+
self.filtered.add(breadcrumb_path(breadcrumb))
137+
else:
138+
data[field_name] = self.filter_data_by_metadata(
139+
data[field_name], metadata, breadcrumb)
126140

127-
if inclusion == 'unsupported':
128-
data.pop(field_name, None)
129-
# Track that the field was filtered because the tap
130-
# declared it as unsupported.
131-
self.filtered.add(field_name)
141+
if isinstance(data, list) and metadata:
142+
breadcrumb = parent + ('items',)
143+
data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data]
132144

133145
return data
134146

tests/test_transform.py

+43
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,49 @@ def test_drops_fields_which_are_unsupported(self):
295295
dict_value = {"name": "chicken"}
296296
self.assertEqual({}, transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))
297297

298+
def test_drops_nested_object_fields_which_are_unselected(self):
    """A deselected property inside a selected object is dropped from the output."""
    address_properties = {"addr1": {"type": "string"},
                          "city": {"type": "string"},
                          "state": {"type": "string"},
                          'amount': {'type': 'integer'}}
    schema = {"type": "object",
              "properties": {"addr": {"type": "object",
                                      "properties": address_properties}}}
    # The parent object is selected; only its `amount` child is deselected.
    metadata = {
        ('properties', 'addr'): {"selected": True},
        ('properties', 'addr', 'properties', 'amount'): {"selected": False},
    }
    record = {
        'addr': {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'},
    }
    expected = {
        'addr': {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'},
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)
316+
317+
def test_drops_nested_array_fields_which_are_unselected(self):
    """A deselected property of an array's item objects is dropped from every element."""
    item_schema = {"type": "object",
                   "properties": {"addr1": {"type": "string"},
                                  "city": {"type": "string"},
                                  "state": {"type": "string"},
                                  'amount': {'type': 'integer'}}}
    schema = {"type": "object",
              "properties": {"addrs": {"type": "array",
                                       "items": item_schema}}}
    # The array itself is selected; `amount` on its items is deselected.
    metadata = {
        ('properties', 'addrs'): {"selected": True},
        ('properties', 'addrs', 'items', 'properties', 'amount'): {"selected": False},
    }
    record = {
        'addrs': [
            {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'},
            {'addr1': 'address_2', 'city': 'city_2', 'state': 'state_2', 'amount': '456'},
        ],
    }
    expected = {
        'addrs': [
            {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'},
            {'addr1': 'address_2', 'city': 'city_2', 'state': 'state_2'},
        ],
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)
340+
298341
class TestResolveSchemaReferences(unittest.TestCase):
299342
def test_internal_refs_resolve(self):
300343
schema = {"type": "object",

0 commit comments

Comments
 (0)