Skip to content

Commit

Permalink
SV hierarchy: Extract SV PVs. (#294)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Apr 2, 2024
1 parent 4e7c51d commit 57056ea
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 0 deletions.
32 changes: 32 additions & 0 deletions simple/stats/schema_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Well-known predicates, node types and defaults used when reading SV triples.
PREDICATE_TYPE_OF = "typeOf"
PREDICATE_POPULATION_TYPE = "populationType"
TYPE_STATISTICAL_VARIABLE = "StatisticalVariable"
# Population type assumed for an SV that does not declare one.
DEFAULT_POPULATION_TYPE = "Thing"

# Properties that must be ignored when building the SV hierarchy.
# Ref[1]: https://source.corp.google.com/piper///depot/google3/datacommons/import/mcf_vocab.cc;l=142;rcl=621036089
# Every property listed in [1] is included here, plus "includedIn", which is
# specific to Custom DC.
SV_HIERARCHY_PROPS_BLOCKLIST: set[str] = {
    "dcid",
    "typeOf",
    "isPublic",
    "provenance",
    "resMCFFile",
    "keyString",
    "populationType",
    "constraintProperties",
    "name",
    "label",
    "alternateName",
    "description",
    "descriptionUrl",
    "memberOf",
    "utteranceTemplate",
    "source",
    "footnote",
    "isNormalizable",
    "denominatorForNormalization",
    "measuredProperty",
    "measurementMethod",
    "measurementDenominator",
    "measurementQualifier",
    "scalingFactor",
    "unit",
    "statType",
    "censusACSTableId",
    # Custom DC specific property (not part of [1]).
    "includedIn",
}
74 changes: 74 additions & 0 deletions simple/stats/stat_var_hierarchy_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging

from stats import schema_constants
from stats.data import Triple


class StatVarHierarchyGenerator:
    """Entry point for turning input triples into a stat var (SV) hierarchy.

    The input triples may describe stat vars as well as other nodes. The
    intended result is a list of output triples representing the hierarchy;
    at present only the first step is implemented: extracting the SVs and
    their property-values (PVs) into ``self.sv_pvs``.
    """

    def __init__(self, triples: list[Triple]) -> None:
        """Index the SVs found in ``triples``.

        Args:
            triples: Input triples, including those that define stat vars.
        """
        # Step 1: collect each SV's population type and PVs.
        self.sv_pvs = StatVarPVs(triples)
        # TODO: Create SVG + SV tree.
        # TODO: Generate SVG + SV triples.


class StatVarPVs:
    """Lookup tables of population types and PVs for the SVs in a triple list.

    Two dicts are exposed, both keyed by SV DCID:

    * ``sv_id_2_population_type``: the SV's population type, kept separately
      because SV hierarchies are rooted at a group representing it.
    * ``sv_id_2_pvs``: the SV's remaining property -> value pairs, excluding
      properties in ``schema_constants.SV_HIERARCHY_PROPS_BLOCKLIST``.
    """

    def __init__(self, triples: list[Triple]) -> None:
        """Scan ``triples`` once and keep only the data belonging to SVs.

        Args:
            triples: Input triples; subjects that are not typed as a
                StatisticalVariable are dropped from the result.
        """
        self.sv_id_2_population_type: dict[str, str] = {}
        self.sv_id_2_pvs: dict[str, dict[str, str]] = {}

        # Whether a subject is an SV is only known once its typeOf triple has
        # been seen, so data is gathered for every subject first and filtered
        # down to SVs afterwards.
        #
        # A dict (not a set) tracks the SV ids so that first-seen order is
        # preserved; stable ordering keeps results consistent and helps tests.
        seen_sv_ids: dict[str, bool] = {}
        pop_type_by_subject: dict[str, str] = {}
        pvs_by_subject: dict[str, dict[str, str]] = {}

        for triple in triples:
            # Prefer the object id; fall back to the literal object value.
            value = triple.object_id or triple.object_value
            if not value:
                logging.warning("Skipping, no value found for triple (%s).",
                                str(triple))
                continue

            predicate = triple.predicate

            if predicate == schema_constants.PREDICATE_TYPE_OF:
                # Only the SV type matters; other types are ignored.
                if value == schema_constants.TYPE_STATISTICAL_VARIABLE:
                    seen_sv_ids[triple.subject_id] = True
                continue

            if predicate == schema_constants.PREDICATE_POPULATION_TYPE:
                pop_type_by_subject[triple.subject_id] = value
                continue

            if predicate in schema_constants.SV_HIERARCHY_PROPS_BLOCKLIST:
                continue

            # A later triple with the same subject and predicate overwrites
            # the earlier value.
            subject_pvs = pvs_by_subject.setdefault(triple.subject_id, {})
            subject_pvs[predicate] = value

        # Keep SVs only. An SV without a population type gets the default,
        # and an SV without any PVs gets an empty dict.
        for sv_id in seen_sv_ids:
            self.sv_id_2_population_type[sv_id] = pop_type_by_subject.get(
                sv_id, schema_constants.DEFAULT_POPULATION_TYPE)
            self.sv_id_2_pvs[sv_id] = pvs_by_subject.get(sv_id, {})
62 changes: 62 additions & 0 deletions simple/tests/stats/stat_var_hierarchy_generator_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from stats.data import Triple
from stats.stat_var_hierarchy_generator import *


class TestStatVarHierarchyGenerator(unittest.TestCase):
    """Tests for the stat var hierarchy generator helpers."""

    def test_stat_var_pvs(self):
        """StatVarPVs keeps SVs only and leaves out blocklisted properties."""
        # NOTE(review): Triple args are positional; presumably
        # (subject, predicate, object id, object value) - confirm against
        # stats.data.Triple.

        # SV with a population type, two PVs and one blocklisted property.
        sv1_triples = [
            Triple("sv1", "typeOf", "StatisticalVariable", ""),
            Triple("sv1", "populationType", "Person", ""),
            Triple("sv1", "gender", "Female", ""),
            Triple("sv1", "race", "Asian", ""),
            Triple("sv1", "utteranceTemplate", "", "SV1 utterance"),
        ]
        # Not an SV: must not appear in either result dict.
        non_sv_triples = [
            Triple("non_sv1", "typeOf", "Person", ""),
            Triple("non_sv1", "gender", "Male", ""),
            Triple("non_sv1", "race", "AmericanIndianOrAlaskaNative", ""),
            Triple("non_sv1", "name", "", "Joe Doe"),
        ]
        # SV with one PV and one blocklisted property (statType).
        sv2_triples = [
            Triple("sv2", "typeOf", "StatisticalVariable", ""),
            Triple("sv2", "populationType", "Coal", ""),
            Triple("sv2", "energySource", "CokeCoal", ""),
            Triple("sv2", "statType", "measuredValue", ""),
        ]
        # Bare SV: no population type and no PVs.
        sv3_triples = [
            Triple("sv3", "typeOf", "StatisticalVariable", ""),
        ]
        input_triples: list[Triple] = [
            *sv1_triples,
            *non_sv_triples,
            *sv2_triples,
            *sv3_triples,
        ]

        # sv3 falls back to the default population type.
        expected_sv_id_2_population_type = dict(
            sv1="Person",
            sv2="Coal",
            sv3="Thing",
        )
        expected_sv_id_2_pvs = dict(
            sv1={"gender": "Female", "race": "Asian"},
            sv2={"energySource": "CokeCoal"},
            sv3={},
        )

        result = StatVarPVs(input_triples)

        self.assertDictEqual(result.sv_id_2_population_type,
                             expected_sv_id_2_population_type)
        self.assertDictEqual(result.sv_id_2_pvs, expected_sv_id_2_pvs)

0 comments on commit 57056ea

Please sign in to comment.