Skip to content

Commit

Permalink
Integrate vertical tagging into the main runner. (#307)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored May 17, 2024
1 parent 88fc0db commit a1b4df3
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 9 deletions.
9 changes: 9 additions & 0 deletions simple/stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import re

from stats import constants
from stats.data import AggregationConfig
from stats.data import EntityType
from stats.data import EventType
Expand Down Expand Up @@ -169,6 +170,14 @@ def database(self) -> dict:
def generate_hierarchy(self) -> bool:
    """Return the configured group-stat-vars-by-property setting.

    Yields the value stored in the config data under
    ``_GROUP_STAT_VARS_BY_PROPERTY`` when it is truthy; otherwise (missing
    or falsy) yields ``False``.
    """
    configured = self.data.get(_GROUP_STAT_VARS_BY_PROPERTY)
    if configured:
        return configured
    return False

def special_files(self) -> dict[str, str]:
    """Map each configured "special" file name to its special file type.

    For every type in ``constants.SPECIAL_FILE_TYPES`` the config data is
    looked up using the type itself as the key. Types whose entry is
    missing or empty are left out of the result.

    Returns:
        A dict keyed by the configured file name, valued by the file type.
    """
    # Pair each type with whatever the config holds for it ("" if absent).
    configured = ((self.data.get(file_type, ""), file_type)
                  for file_type in constants.SPECIAL_FILE_TYPES)
    return {
        file_name: file_type
        for file_name, file_type in configured
        if file_name
    }

def generate_topics(self) -> bool:
    """Return the configured generate-topics setting.

    Yields the value stored in the config data under ``_GENERATE_TOPICS``
    when it is truthy; otherwise (missing or falsy) yields ``False``.
    """
    configured = self.data.get(_GENERATE_TOPICS)
    return configured if configured else False

Expand Down
5 changes: 5 additions & 0 deletions simple/stats/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,8 @@
# DC links
DC_HOME = "https://datacommons.org"
DC_BROWSER = "https://datacommons.org/browser"

# "Special" file types, i.e. files of these types are handled in specific
# ways rather than as regular inputs.
# Each value is used as a key when looking up the file name in the config data.
VERTICAL_SPECS_FILE_TYPE = "verticalSpecsFile"
# Set literal instead of set([...]) to avoid building a throwaway list.
SPECIAL_FILE_TYPES = {VERTICAL_SPECS_FILE_TYPE}
36 changes: 33 additions & 3 deletions simple/stats/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ def __init__(self,

self.mode = mode
self.input_handlers: list[FileHandler] = []
# "Special" file handlers.
# i.e. if files of these types are present, they are handled in specific ways.
self.special_handlers: dict[str, FileHandler] = {}

# Config file driven.
if config_file:
Expand Down Expand Up @@ -90,6 +93,8 @@ def __init__(self,
raise FileNotFoundError("Config file must be provided.")
self.config = Config(data=json.loads(config_fh.read_string()))

self.special_file_to_type = self.config.special_files()

# Output directories
self.output_dir_fh = create_file_handler(output_dir, is_dir=True)
self.nl_dir_fh = self.output_dir_fh.make_file(f"{constants.NL_DIR_NAME}/")
Expand Down Expand Up @@ -182,28 +187,53 @@ def _generate_svg_hierarchy(self):
logging.info("No SV triples found, skipping SVG generating hierarchy.")
logging.info("Generating SVG hierarchy for %s SV triples.", len(sv_triples))

# TODO: Load vertical specs if a "dc.vertical_specs.json" file exists.
vertical_specs: list[VerticalSpec] = []
vertical_specs_fh = self.special_handlers.get(
constants.VERTICAL_SPECS_FILE_TYPE)
if vertical_specs_fh:
logging.info("Loading vertical specs from: %s",
vertical_specs_fh.basename())
vertical_specs = stat_var_hierarchy_generator.load_vertical_specs(
vertical_specs_fh.read_string())
svg_triples = stat_var_hierarchy_generator.generate(sv_triples,
vertical_specs)
logging.info("Inserting %s SVG triples into DB.", len(svg_triples))
self.db.insert_triples(svg_triples)

# If the fh is a "special" file, add it to the self.special_handlers dict.
# Returns True if it is, otherwise False.
def _maybe_set_special_fh(self, fh: FileHandler) -> bool:
    """Register ``fh`` as a special handler when its name is configured as one.

    The handler's basename is looked up in ``self.special_file_to_type``.
    On a match, the handler is stored in ``self.special_handlers`` under
    the matching file type.

    Returns:
        True if the handler was registered as special, False otherwise.
    """
    special_type = self.special_file_to_type.get(fh.basename())
    if not special_type:
        return False
    self.special_handlers[special_type] = fh
    return True

def _run_imports(self):
input_fhs: list[FileHandler] = []
input_mcf_fhs: list[FileHandler] = []
for input_handler in self.input_handlers:
if not input_handler.isdir:
if self._maybe_set_special_fh(input_handler):
continue
input_file_name = input_handler.basename()
if input_file_name.endswith(".mcf"):
input_mcf_fhs.append(input_handler)
else:
input_fhs.append(input_handler)
else:
for input_file in sorted(input_handler.list_files(extension=".csv")):
input_fhs.append(input_handler.make_file(input_file))
fh = input_handler.make_file(input_file)
if not self._maybe_set_special_fh(fh):
input_fhs.append(fh)
for input_file in sorted(input_handler.list_files(extension=".mcf")):
input_mcf_fhs.append(input_handler.make_file(input_file))
fh = input_handler.make_file(input_file)
if not self._maybe_set_special_fh(fh):
input_mcf_fhs.append(fh)
for input_file in sorted(input_handler.list_files(extension=".json")):
fh = input_handler.make_file(input_file)
self._maybe_set_special_fh(fh)

self.reporter.report_started(import_files=list(
map(lambda fh: fh.basename(), input_fhs + input_mcf_fhs)))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
subject_id,predicate,object_id,object_value
some_var1,typeOf,StatisticalVariable,
some_var1,measuredProperty,value,
some_var1,measuredProperty,count,
some_var1,name,,Some Variable 1 Name
some_var1,description,,Some Variable 1 Description
some_var1,populationType,Person,
some_var1,gender,Female,
some_var2,typeOf,StatisticalVariable,
some_var2,measuredProperty,value,
some_var2,measuredProperty,age,
some_var2,name,,Some Variable 2 Name
some_var2,description,,Some Variable 2 Description
some_var2,populationType,Person,
Expand All @@ -28,9 +28,16 @@ c/p/1,url,,http://source1.com/provenance1
c/g/Root,typeOf,StatVarGroup,
c/g/Root,name,,Custom Variables
c/g/Root,specializationOf,dc/g/Root,
c/g/PersonAgeVertical,typeOf,StatVarGroup,
c/g/PersonAgeVertical,name,,Person Age Vertical
c/g/PersonAgeVertical,specializationOf,c/g/Root,
c/g/PersonCountVertical,typeOf,StatVarGroup,
c/g/PersonCountVertical,name,,Person Count Vertical
c/g/PersonCountVertical,specializationOf,c/g/Root,
c/g/Person,typeOf,StatVarGroup,
c/g/Person,name,,Person
c/g/Person,specializationOf,c/g/Root,
c/g/Person,specializationOf,c/g/PersonCountVertical,
c/g/Person,specializationOf,c/g/PersonAgeVertical,
c/g/Person_Gender,typeOf,StatVarGroup,
c/g/Person_Gender,name,,Person With Gender
c/g/Person_Gender,specializationOf,c/g/Person,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
}
}
},
"groupStatVarsByProperty": true
"groupStatVarsByProperty": true,
"verticalSpecsFile": "vertical_specs.json"
}
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
Node: v1
dcid:"some_var1"
typeOf: dcs:StatisticalVariable
measuredProperty: dcs:value
measuredProperty: dcs:count
name: "Some Variable 1 Name"
description: "Some Variable 1 Description"
populationType: schema:Person
gender: dcs:Female

Node: dcid:some_var2
typeOf: dcs:StatisticalVariable
measuredProperty: dcs:value
measuredProperty: dcs:age
name: "Some Variable 2 Name"
description: "Some Variable 2 Description"
populationType: schema:Person
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"specs": [
{
"populationType": "Person",
"measuredProperties": ["count"],
"verticals": ["PersonCountVertical"]
},
{
"populationType": "Person",
"measuredProperties": ["age"],
"verticals": ["PersonAgeVertical"]
}
]
}

0 comments on commit a1b4df3

Please sign in to comment.