Skip to content

Commit

Permalink
Add support for event count aggregations. (#271)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Jan 17, 2024
1 parent 5aa0896 commit 93aa394
Show file tree
Hide file tree
Showing 16 changed files with 211 additions and 60 deletions.
8 changes: 4 additions & 4 deletions simple/sample/output/debug/observations.csv
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
geoId/01,custom/statvar_1,2021,555,c/p/1
geoId/122,custom/statvar_1,2022,321,c/p/1
geoId/01,custom/statvar_2,2021,666,c/p/1
geoId/122,custom/statvar_2,2022,123456,c/p/1
geoId/01,Variable_1,2021,555,c/p/1
geoId/122,Variable_1,2022,321,c/p/1
geoId/01,Variable_2,2021,666,c/p/1
geoId/122,Variable_2,2022,123456,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
Expand Down
20 changes: 10 additions & 10 deletions simple/sample/output/debug/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ var2,memberOf,custom/g/group_3,""
var2,includedIn,c/p/1,""
var2,includedIn,c/p/2,""
var2,includedIn,c/s/1,""
custom/statvar_1,typeOf,StatisticalVariable,""
custom/statvar_1,name,"","Variable 1"
custom/statvar_1,memberOf,custom/g/group_1,""
custom/statvar_1,includedIn,c/p/1,""
custom/statvar_1,includedIn,c/s/1,""
custom/statvar_2,typeOf,StatisticalVariable,""
custom/statvar_2,name,"","Variable 2"
custom/statvar_2,memberOf,custom/g/Root,""
custom/statvar_2,includedIn,c/p/1,""
custom/statvar_2,includedIn,c/s/1,""
Variable_1,typeOf,StatisticalVariable,""
Variable_1,name,"","Variable 1"
Variable_1,memberOf,custom/g/group_1,""
Variable_1,includedIn,c/p/1,""
Variable_1,includedIn,c/s/1,""
Variable_2,typeOf,StatisticalVariable,""
Variable_2,name,"","Variable 2"
Variable_2,memberOf,custom/g/Root,""
Variable_2,includedIn,c/p/1,""
Variable_2,includedIn,c/s/1,""
country/AFG,typeOf,Country,""
country/YEM,typeOf,Country,""
country/AGO,typeOf,Country,""
Expand Down
4 changes: 2 additions & 2 deletions simple/sample/output/nl/sentences.csv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dcid,sentence
var1,Good var1 name;Good var1 description;Natural language sentence 1;Natural language sentence 2
var2,Good var2 name
custom/statvar_1,Variable 1
custom/statvar_2,Variable 2
Variable_1,Variable 1
Variable_2,Variable 2
20 changes: 10 additions & 10 deletions simple/sample/output/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,16 @@ var2,memberOf,custom/g/group_3,
var2,includedIn,c/p/1,
var2,includedIn,c/p/2,
var2,includedIn,c/s/1,
custom/statvar_1,typeOf,StatisticalVariable,
custom/statvar_1,name,,Variable 1
custom/statvar_1,memberOf,custom/g/group_1,
custom/statvar_1,includedIn,c/p/1,
custom/statvar_1,includedIn,c/s/1,
custom/statvar_2,typeOf,StatisticalVariable,
custom/statvar_2,name,,Variable 2
custom/statvar_2,memberOf,custom/g/Root,
custom/statvar_2,includedIn,c/p/1,
custom/statvar_2,includedIn,c/s/1,
Variable_1,typeOf,StatisticalVariable,
Variable_1,name,,Variable 1
Variable_1,memberOf,custom/g/group_1,
Variable_1,includedIn,c/p/1,
Variable_1,includedIn,c/s/1,
Variable_2,typeOf,StatisticalVariable,
Variable_2,name,,Variable 2
Variable_2,memberOf,custom/g/Root,
Variable_2,includedIn,c/p/1,
Variable_2,includedIn,c/s/1,
country/AFG,typeOf,Country,
country/YEM,typeOf,Country,
country/AGO,typeOf,Country,
Expand Down
12 changes: 12 additions & 0 deletions simple/stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from stats.data import AggregationConfig
from stats.data import EventType
from stats.data import ImportType
from stats.data import Provenance
Expand All @@ -35,6 +36,8 @@
_EVENT_TYPE_FIELD = "eventType"
_ID_COLUMN_FIELD = "idColumn"
_EVENTS_FIELD = "events"
_COMPUTED_VARIABLES_FIELD = "computedVariables"
_AGGREGATION_FIELD = "aggregation"


class Config:
Expand All @@ -60,6 +63,9 @@ def import_type(self, input_file_name: str) -> ImportType:
f"Unsupported import type: {import_type_str} ({input_file_name})")
return ImportType(import_type_str)

def computed_variables(self, input_file_name: str) -> list[str]:
return self._input_file(input_file_name).get(_COMPUTED_VARIABLES_FIELD, [])

def variable(self, variable_name: str) -> StatVar:
var_cfg = self.data.get(_VARIABLES_FIELD, {}).get(variable_name, {})
return StatVar(
Expand All @@ -70,6 +76,12 @@ def variable(self, variable_name: str) -> StatVar:
group_path=var_cfg.get(_GROUP_FIELD, ""),
)

def aggregation(self, variable_name: str) -> AggregationConfig:
aggregation_cfg = self.data.get(_VARIABLES_FIELD, {}) \
.get(variable_name, {}) \
.get(_AGGREGATION_FIELD, {})
return AggregationConfig(**aggregation_cfg)

def event_type(self, input_file_name: str) -> str:
return self._input_file(input_file_name).get(_EVENT_TYPE_FIELD, "")

Expand Down
22 changes: 22 additions & 0 deletions simple/stats/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,25 @@ def triples(self) -> list[Triple]:
class ImportType(StrEnum):
OBSERVATIONS = "observations"
EVENTS = "events"


class TimePeriod(StrEnum):
DAY = "day"
MONTH = "month"
YEAR = "year"


class AggregationMethod(StrEnum):
COUNT = "count"


@dataclass
class AggregationConfig:
period: TimePeriod = TimePeriod.YEAR
method: AggregationMethod = AggregationMethod.COUNT

def __post_init__(self):
if self.period not in TimePeriod._member_map_.values():
raise ValueError(f"invalid period: {self.period}")
if self.method not in AggregationMethod._member_map_.values():
raise ValueError(f"invalid method: {self.method}")
67 changes: 60 additions & 7 deletions simple/stats/events_importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime
import logging
import random

from dateutil.parser import parse as date_parse
import pandas as pd
from stats import constants
from stats.data import AggregationConfig
from stats.data import AggregationMethod
from stats.data import Event
from stats.data import Observation
from stats.data import TimePeriod
from stats.data import Triple
from stats.db import Db
from stats.importer import Importer
Expand Down Expand Up @@ -65,7 +71,6 @@ def do_import(self) -> None:
self._drop_ignored_columns()
self._sanitize_values()
self._resolve_entities()
self._parse_dates()
self._rename_columns()
self._write_event_triples()
self._write_observations()
Expand Down Expand Up @@ -115,13 +120,50 @@ def _rename_columns(self) -> None:

self.df = self.df.rename(columns=renamed)

def _parse_dates(self) -> None:
# TODO: parse dates and update value if needed (e.g. year, month, day).
pass

def _write_observations(self) -> None:
# TODO: compute aggregated observations and insert in DB.
pass
sv_names = self.config.computed_variables(self.input_file_name)
if not sv_names:
logging.warning("No computed variables specified: %s",
self.input_file_name)
return

for sv_name in sv_names:
sv_dcid = self.nodes.variable(sv_name, self.input_file_name).id
aggr_cfg = self.config.aggregation(sv_name)
observations = self._compute_sv_observations(sv_dcid, aggr_cfg)
self.db.insert_observations(observations)

def _compute_sv_observations(
self, sv_dcid: str, aggr_cfg: AggregationConfig = AggregationConfig()
) -> list[Observation]:
# Create df with only dcid and date columns.
obs_df = self.df.loc[:, [constants.COLUMN_DCID, constants.COLUMN_DATE]]

# Convert date to aggregation period
obs_df[constants.COLUMN_DATE] = obs_df[constants.COLUMN_DATE].apply(
lambda x: _time_period(x, aggr_cfg.period))

# Group by entity (dcid) and date, count each group and drop duplicates.
# NOTE: currently we only support count per entity and date.
# The groupby columns and transform functions will need to change
# when we add support for more aggregation methods (sum, average, etc.)
obs_df[constants.COLUMN_VALUE] = obs_df.groupby(
[constants.COLUMN_DCID,
constants.COLUMN_DATE])[constants.COLUMN_DCID].transform("count")
obs_df.drop_duplicates(inplace=True, ignore_index=True)

# Add variable and provenance columns.
obs_df[constants.COLUMN_VARIABLE] = sv_dcid
obs_df[constants.COLUMN_PROVENANCE] = self.provenance

# Reorder columns so they are in the same order as observations
obs_df = obs_df.reindex(columns=[
constants.COLUMN_DCID, constants.COLUMN_VARIABLE, constants.COLUMN_DATE,
constants.COLUMN_VALUE, constants.COLUMN_PROVENANCE
])

# Map each row to an Observation object and return the list of observations.
return [Observation(*row) for row in obs_df.itertuples(index=False)]

def _write_event_triples(self) -> None:
# Add event type node - it will be written to DB later.
Expand Down Expand Up @@ -259,3 +301,14 @@ def _write_debug_csvs(self) -> None:
self.debug_resolve_fh)
self.debug_resolve_fh.write_string(
self.debug_resolve_df.to_csv(index=False))


# Utility methods
def _time_period(date_str: str, period: TimePeriod) -> str:
date = date_parse(date_str)
if period == TimePeriod.DAY:
return date.strftime("%Y-%m-%d")
if period == TimePeriod.YEAR:
return date.strftime("%Y")
# Default to month
return date.strftime("%Y-%m")
10 changes: 8 additions & 2 deletions simple/stats/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,14 @@ def _add_provenance(self, sv: StatVar, provenance: Provenance) -> StatVar:
return sv

def _sv_id(self, sv_column_name: str) -> str:
if re.fullmatch(_DCID_PATTERN, sv_column_name):
return sv_column_name
dcid = sv_column_name
if re.fullmatch(_DCID_PATTERN, dcid):
return dcid
# Convert spaces and dashes to underscores and check if that
# is a valid DCID pattern
dcid = re.sub(r"[ -]", "_", dcid)
if re.fullmatch(_DCID_PATTERN, dcid):
return dcid
self._sv_generated_id_count += 1
return f"{_CUSTOM_SV_ID_PREFIX}{self._sv_generated_id_count}"

Expand Down
28 changes: 26 additions & 2 deletions simple/tests/stats/config_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@
import unittest

from stats.config import Config
from stats.data import AggregationConfig
from stats.data import AggregationMethod
from stats.data import ImportType
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
from stats.data import TimePeriod

CONFIG_DATA = {
"inputFiles": {
Expand All @@ -42,10 +45,18 @@
},
"variables": {
"Variable 1": {
"group": "Parent Group/Child Group 1"
"group": "Parent Group/Child Group 1",
"aggregation": {
"period": "month",
"method": "count"
}
},
"Variable 2": {
"group": "Parent Group/Child Group 1"
"group": "Parent Group/Child Group 1",
"aggregation": {
"period": "INVALID",
"method": "count"
}
},
"var3": {
"name": "Var 3 Name",
Expand Down Expand Up @@ -170,6 +181,19 @@ def test_import_type(self):
with self.assertRaisesRegex(ValueError, "Unsupported import type"):
config.import_type("invalid_import_type.csv")

def test_aggregation(self):
config = Config(CONFIG_DATA)
self.assertEqual(
config.aggregation("Variable 1"),
AggregationConfig(TimePeriod.MONTH, AggregationMethod.COUNT),
"valid date config")
self.assertEqual(
config.aggregation("var3"),
AggregationConfig(TimePeriod.YEAR, AggregationMethod.COUNT),
"default date config")
with self.assertRaisesRegex(ValueError, "invalid period"):
config.aggregation("Variable 2")

def test_empty_config(self):
config = Config({})
self.assertEqual(config.variable("Variable 1"), StatVar("", "Variable 1"))
Expand Down
8 changes: 4 additions & 4 deletions simple/tests/stats/nodes_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_variable_with_no_config(self):
self.assertEqual(
sv,
StatVar(
"custom/statvar_1",
"Variable_with_no_config",
"Variable with no config",
group_id="custom/g/Root",
provenance_ids=["c/p/1"],
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_variable_with_group(self):
self.assertEqual(
sv,
StatVar(
"custom/statvar_1",
"Variable_1",
"Variable 1",
group_id="custom/g/group_2",
provenance_ids=["c/p/1"],
Expand Down Expand Up @@ -202,7 +202,7 @@ def test_multiple_variables_in_same_group(self):
self.assertEqual(
sv,
StatVar(
"custom/statvar_1",
"Variable_1",
"Variable 1",
group_id="custom/g/group_2",
provenance_ids=["c/p/1"],
Expand All @@ -213,7 +213,7 @@ def test_multiple_variables_in_same_group(self):
self.assertEqual(
sv,
StatVar(
"custom/statvar_2",
"Variable_2",
"Variable 2",
group_id="custom/g/group_2",
provenance_ids=["c/p/1"],
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@

entity,variable,date,value,provenance
country/USA,Count_CrimeEvent,2023,3,c/p/1
country/BRA,Count_CrimeEvent,2023,2,c/p/1
country/CHN,Count_CrimeEvent,2023,2,c/p/1
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,14 @@ c/p/1,typeOf,Provenance,
c/p/1,name,,Test Provenance
c/p/1,source,c/s/1,
c/p/1,url,,http://source.com/provenance
custom/g/Root,typeOf,StatVarGroup,
custom/g/Root,name,,Custom Variables
custom/g/Root,specializationOf,dc/g/Root,
Count_CrimeEvent,typeOf,StatisticalVariable,
Count_CrimeEvent,name,,Count_CrimeEvent
Count_CrimeEvent,memberOf,custom/g/Root,
Count_CrimeEvent,includedIn,c/p/1,
Count_CrimeEvent,includedIn,c/s/1,
CrimeEvent,typeOf,Class,
CrimeEvent,subClassOf,Event,
CrimeEvent,name,,Crime Event
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@

entity,variable,date,value,provenance
country/USA,Crime_Event2_Count,2023-11-08,2,c/p/1
country/BRA,Crime_Event2_Count,2023-11-08,2,c/p/1
country/CHN,Crime_Event2_Count,2023-11-08,1,c/p/1
country/CHN,Crime_Event2_Count,2023-09-17,1,c/p/1
country/USA,Crime_Event2_Count,2023-08-02,1,c/p/1
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,14 @@ c/p/1,typeOf,Provenance,
c/p/1,name,,Test Provenance
c/p/1,source,c/s/1,
c/p/1,url,,http://source.com/provenance
custom/g/Root,typeOf,StatVarGroup,
custom/g/Root,name,,Custom Variables
custom/g/Root,specializationOf,dc/g/Root,
Crime_Event2_Count,typeOf,StatisticalVariable,
Crime_Event2_Count,name,,Number of Crime2 Events
Crime_Event2_Count,memberOf,custom/g/Root,
Crime_Event2_Count,includedIn,c/p/1,
Crime_Event2_Count,includedIn,c/s/1,
CrimeEvent2,typeOf,Class,
CrimeEvent2,subClassOf,Event,
CrimeEvent2,name,,CrimeEvent2
Expand Down
Loading

0 comments on commit 93aa394

Please sign in to comment.