diff --git a/simple/sample/output/debug/observations.csv b/simple/sample/output/debug/observations.csv index baebe2a0..c5c9b240 100644 --- a/simple/sample/output/debug/observations.csv +++ b/simple/sample/output/debug/observations.csv @@ -21,10 +21,10 @@ country/ASM,var2,2023,34,c/p/1 country/AIA,var2,2023,42,c/p/1 country/WLF,var2,2023,75,c/p/1 country/ESH,var2,2023,65,c/p/1 -geoId/01,custom/statvar_1,2021,555,c/p/1 -geoId/122,custom/statvar_1,2022,321,c/p/1 -geoId/01,custom/statvar_2,2021,666,c/p/1 -geoId/122,custom/statvar_2,2022,123456,c/p/1 +geoId/01,Variable_1,2021,555,c/p/1 +geoId/122,Variable_1,2022,321,c/p/1 +geoId/01,Variable_2,2021,666,c/p/1 +geoId/122,Variable_2,2022,123456,c/p/1 country/USA,var1,2021,555,c/p/1 country/IND,var1,2022,321,c/p/1 country/USA,var2,2021,666,c/p/1 diff --git a/simple/sample/output/debug/triples.csv b/simple/sample/output/debug/triples.csv index 5dff3759..1c01d0ea 100644 --- a/simple/sample/output/debug/triples.csv +++ b/simple/sample/output/debug/triples.csv @@ -51,16 +51,16 @@ var2,memberOf,custom/g/group_3,"" var2,includedIn,c/p/1,"" var2,includedIn,c/p/2,"" var2,includedIn,c/s/1,"" -custom/statvar_1,typeOf,StatisticalVariable,"" -custom/statvar_1,name,"","Variable 1" -custom/statvar_1,memberOf,custom/g/group_1,"" -custom/statvar_1,includedIn,c/p/1,"" -custom/statvar_1,includedIn,c/s/1,"" -custom/statvar_2,typeOf,StatisticalVariable,"" -custom/statvar_2,name,"","Variable 2" -custom/statvar_2,memberOf,custom/g/Root,"" -custom/statvar_2,includedIn,c/p/1,"" -custom/statvar_2,includedIn,c/s/1,"" +Variable_1,typeOf,StatisticalVariable,"" +Variable_1,name,"","Variable 1" +Variable_1,memberOf,custom/g/group_1,"" +Variable_1,includedIn,c/p/1,"" +Variable_1,includedIn,c/s/1,"" +Variable_2,typeOf,StatisticalVariable,"" +Variable_2,name,"","Variable 2" +Variable_2,memberOf,custom/g/Root,"" +Variable_2,includedIn,c/p/1,"" +Variable_2,includedIn,c/s/1,"" country/AFG,typeOf,Country,"" country/YEM,typeOf,Country,"" country/AGO,typeOf,Country,"" diff --git a/simple/sample/output/nl/sentences.csv b/simple/sample/output/nl/sentences.csv index 1ae6188d..0f148100 100644 --- a/simple/sample/output/nl/sentences.csv +++ b/simple/sample/output/nl/sentences.csv @@ -1,5 +1,5 @@ dcid,sentence var1,Good var1 name;Good var1 description;Natural language sentence 1;Natural language sentence 2 var2,Good var2 name -custom/statvar_1,Variable 1 -custom/statvar_2,Variable 2 +Variable_1,Variable 1 +Variable_2,Variable 2 diff --git a/simple/sample/output/triples.csv b/simple/sample/output/triples.csv index 88085c8a..d2611fe9 100644 --- a/simple/sample/output/triples.csv +++ b/simple/sample/output/triples.csv @@ -51,16 +51,16 @@ var2,memberOf,custom/g/group_3, var2,includedIn,c/p/1, var2,includedIn,c/p/2, var2,includedIn,c/s/1, -custom/statvar_1,typeOf,StatisticalVariable, -custom/statvar_1,name,,Variable 1 -custom/statvar_1,memberOf,custom/g/group_1, -custom/statvar_1,includedIn,c/p/1, -custom/statvar_1,includedIn,c/s/1, -custom/statvar_2,typeOf,StatisticalVariable, -custom/statvar_2,name,,Variable 2 -custom/statvar_2,memberOf,custom/g/Root, -custom/statvar_2,includedIn,c/p/1, -custom/statvar_2,includedIn,c/s/1, +Variable_1,typeOf,StatisticalVariable, +Variable_1,name,,Variable 1 +Variable_1,memberOf,custom/g/group_1, +Variable_1,includedIn,c/p/1, +Variable_1,includedIn,c/s/1, +Variable_2,typeOf,StatisticalVariable, +Variable_2,name,,Variable 2 +Variable_2,memberOf,custom/g/Root, +Variable_2,includedIn,c/p/1, +Variable_2,includedIn,c/s/1, country/AFG,typeOf,Country, country/YEM,typeOf,Country, country/AGO,typeOf,Country, diff --git a/simple/stats/config.py b/simple/stats/config.py index 6ee62f7e..8449ca50 100644 --- a/simple/stats/config.py +++ b/simple/stats/config.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from stats.data import AggregationConfig from stats.data import EventType from stats.data import ImportType from stats.data import Provenance @@ -35,6 +36,8 @@ _EVENT_TYPE_FIELD = "eventType" _ID_COLUMN_FIELD = "idColumn" _EVENTS_FIELD = "events" +_COMPUTED_VARIABLES_FIELD = "computedVariables" +_AGGREGATION_FIELD = "aggregation" class Config: @@ -60,6 +63,9 @@ def import_type(self, input_file_name: str) -> ImportType: f"Unsupported import type: {import_type_str} ({input_file_name})") return ImportType(import_type_str) + def computed_variables(self, input_file_name: str) -> list[str]: + return self._input_file(input_file_name).get(_COMPUTED_VARIABLES_FIELD, []) + def variable(self, variable_name: str) -> StatVar: var_cfg = self.data.get(_VARIABLES_FIELD, {}).get(variable_name, {}) return StatVar( @@ -70,6 +76,12 @@ def variable(self, variable_name: str) -> StatVar: group_path=var_cfg.get(_GROUP_FIELD, ""), ) + def aggregation(self, variable_name: str) -> AggregationConfig: + aggregation_cfg = self.data.get(_VARIABLES_FIELD, {}) \ + .get(variable_name, {}) \ + .get(_AGGREGATION_FIELD, {}) + return AggregationConfig(**aggregation_cfg) + def event_type(self, input_file_name: str) -> str: return self._input_file(input_file_name).get(_EVENT_TYPE_FIELD, "") diff --git a/simple/stats/data.py b/simple/stats/data.py index a7cca4e4..55837d8e 100644 --- a/simple/stats/data.py +++ b/simple/stats/data.py @@ -267,3 +267,25 @@ def triples(self) -> list[Triple]: class ImportType(StrEnum): OBSERVATIONS = "observations" EVENTS = "events" + + +class TimePeriod(StrEnum): + DAY = "day" + MONTH = "month" + YEAR = "year" + + +class AggregationMethod(StrEnum): + COUNT = "count" + + +@dataclass +class AggregationConfig: + period: TimePeriod = TimePeriod.YEAR + method: AggregationMethod = AggregationMethod.COUNT + + def __post_init__(self): + if self.period not in TimePeriod._member_map_.values(): + raise ValueError(f"invalid period: {self.period}") + if self.method not in AggregationMethod._member_map_.values(): + raise ValueError(f"invalid method: {self.method}") diff --git a/simple/stats/events_importer.py b/simple/stats/events_importer.py index d78c10aa..95f9b7ef 100644 --- a/simple/stats/events_importer.py +++ b/simple/stats/events_importer.py @@ -12,12 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from datetime import datetime import logging import random +from dateutil.parser import parse as date_parse import pandas as pd from stats import constants +from stats.data import AggregationConfig +from stats.data import AggregationMethod from stats.data import Event +from stats.data import Observation +from stats.data import TimePeriod from stats.data import Triple from stats.db import Db from stats.importer import Importer @@ -65,7 +71,6 @@ def do_import(self) -> None: self._drop_ignored_columns() self._sanitize_values() self._resolve_entities() - self._parse_dates() self._rename_columns() self._write_event_triples() self._write_observations() @@ -115,13 +120,50 @@ def _rename_columns(self) -> None: self.df = self.df.rename(columns=renamed) - def _parse_dates(self) -> None: - # TODO: parse dates and update value if needed (e.g. year, month, day). - pass - def _write_observations(self) -> None: - # TODO: compute aggregated observations and insert in DB. - pass + sv_names = self.config.computed_variables(self.input_file_name) + if not sv_names: + logging.warning("No computed variables specified: %s", + self.input_file_name) + return + + for sv_name in sv_names: + sv_dcid = self.nodes.variable(sv_name, self.input_file_name).id + aggr_cfg = self.config.aggregation(sv_name) + observations = self._compute_sv_observations(sv_dcid, aggr_cfg) + self.db.insert_observations(observations) + + def _compute_sv_observations( + self, sv_dcid: str, aggr_cfg: AggregationConfig = AggregationConfig() + ) -> list[Observation]: + # Create df with only dcid and date columns. + obs_df = self.df.loc[:, [constants.COLUMN_DCID, constants.COLUMN_DATE]] + + # Convert date to aggregation period + obs_df[constants.COLUMN_DATE] = obs_df[constants.COLUMN_DATE].apply( + lambda x: _time_period(x, aggr_cfg.period)) + + # Group by entity (dcid) and date, count each group and drop duplicates. + # NOTE: currently we only support count per entity and date. + # The groupby columns and transform functions will need to change + # when we add support for more aggregation methods (sum, average, etc.) + obs_df[constants.COLUMN_VALUE] = obs_df.groupby( + [constants.COLUMN_DCID, + constants.COLUMN_DATE])[constants.COLUMN_DCID].transform("count") + obs_df.drop_duplicates(inplace=True, ignore_index=True) + + # Add variable and provenance columns. + obs_df[constants.COLUMN_VARIABLE] = sv_dcid + obs_df[constants.COLUMN_PROVENANCE] = self.provenance + + # Reorder columns so they are in the same order as observations + obs_df = obs_df.reindex(columns=[ + constants.COLUMN_DCID, constants.COLUMN_VARIABLE, constants.COLUMN_DATE, + constants.COLUMN_VALUE, constants.COLUMN_PROVENANCE + ]) + + # Map each row to an Observation object and return the list of observations. + return [Observation(*row) for row in obs_df.itertuples(index=False)] def _write_event_triples(self) -> None: # Add event type node - it will be written to DB later. @@ -259,3 +301,14 @@ def _write_debug_csvs(self) -> None: self.debug_resolve_fh) self.debug_resolve_fh.write_string( self.debug_resolve_df.to_csv(index=False)) + + +# Utility methods +def _time_period(date_str: str, period: TimePeriod) -> str: + date = date_parse(date_str) + if period == TimePeriod.DAY: + return date.strftime("%Y-%m-%d") + if period == TimePeriod.YEAR: + return date.strftime("%Y") + # Default to month + return date.strftime("%Y-%m") diff --git a/simple/stats/nodes.py b/simple/stats/nodes.py index c0f39ffa..20f8fdcd 100644 --- a/simple/stats/nodes.py +++ b/simple/stats/nodes.py @@ -159,8 +159,14 @@ def _add_provenance(self, sv: StatVar, provenance: Provenance) -> StatVar: return sv def _sv_id(self, sv_column_name: str) -> str: - if re.fullmatch(_DCID_PATTERN, sv_column_name): - return sv_column_name + dcid = sv_column_name + if re.fullmatch(_DCID_PATTERN, dcid): + return dcid + # Convert spaces and dashes to underscores and check if that + # is a valid DCID pattern + dcid = re.sub(r"[ -]", "_", dcid) + if re.fullmatch(_DCID_PATTERN, dcid): + return dcid self._sv_generated_id_count += 1 return f"{_CUSTOM_SV_ID_PREFIX}{self._sv_generated_id_count}" diff --git a/simple/tests/stats/config_test.py b/simple/tests/stats/config_test.py index 5ba84c01..1f24c8f4 100644 --- a/simple/tests/stats/config_test.py +++ b/simple/tests/stats/config_test.py @@ -15,10 +15,13 @@ import unittest from stats.config import Config +from stats.data import AggregationConfig +from stats.data import AggregationMethod from stats.data import ImportType from stats.data import Provenance from stats.data import Source from stats.data import StatVar +from stats.data import TimePeriod CONFIG_DATA = { "inputFiles": { @@ -42,10 +45,18 @@ }, "variables": { "Variable 1": { - "group": "Parent Group/Child Group 1" + "group": "Parent Group/Child Group 1", + "aggregation": { + "period": "month", + "method": "count" + } }, "Variable 2": { - "group": "Parent Group/Child Group 1" + "group": "Parent Group/Child Group 1", + "aggregation": { + "period": "INVALID", + "method": "count" + } }, "var3": { "name": "Var 3 Name", @@ -170,6 +181,19 @@ def test_import_type(self): with self.assertRaisesRegex(ValueError, "Unsupported import type"): config.import_type("invalid_import_type.csv") + def test_aggregation(self): + config = Config(CONFIG_DATA) + self.assertEqual( + config.aggregation("Variable 1"), + AggregationConfig(TimePeriod.MONTH, AggregationMethod.COUNT), + "valid date config") + self.assertEqual( + config.aggregation("var3"), + AggregationConfig(TimePeriod.YEAR, AggregationMethod.COUNT), + "default date config") + with self.assertRaisesRegex(ValueError, "invalid period"): + config.aggregation("Variable 2") + def test_empty_config(self): config = Config({}) self.assertEqual(config.variable("Variable 1"), StatVar("", "Variable 1")) diff --git a/simple/tests/stats/nodes_test.py b/simple/tests/stats/nodes_test.py index fe1011a9..76517dd9 100644 --- a/simple/tests/stats/nodes_test.py +++ b/simple/tests/stats/nodes_test.py @@ -139,7 +139,7 @@ def test_variable_with_no_config(self): self.assertEqual( sv, StatVar( - "custom/statvar_1", + "Variable_with_no_config", "Variable with no config", group_id="custom/g/Root", provenance_ids=["c/p/1"], @@ -169,7 +169,7 @@ def test_variable_with_group(self): self.assertEqual( sv, StatVar( - "custom/statvar_1", + "Variable_1", "Variable 1", group_id="custom/g/group_2", provenance_ids=["c/p/1"], @@ -202,7 +202,7 @@ def test_multiple_variables_in_same_group(self): self.assertEqual( sv, StatVar( - "custom/statvar_1", + "Variable_1", "Variable 1", group_id="custom/g/group_2", provenance_ids=["c/p/1"], @@ -213,7 +213,7 @@ def test_multiple_variables_in_same_group(self): self.assertEqual( sv, StatVar( - "custom/statvar_2", + "Variable_2", "Variable 2", group_id="custom/g/group_2", provenance_ids=["c/p/1"], diff --git a/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.observations.db.csv b/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.observations.db.csv index 8b137891..4e678908 100644 --- a/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.observations.db.csv +++ b/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.observations.db.csv @@ -1 +1,4 @@ - +entity,variable,date,value,provenance +country/USA,Count_CrimeEvent,2023,3,c/p/1 +country/BRA,Count_CrimeEvent,2023,2,c/p/1 +country/CHN,Count_CrimeEvent,2023,2,c/p/1 diff --git a/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.triples.db.csv b/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.triples.db.csv index 0eb5278d..87fc661b 100644 --- a/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.triples.db.csv +++ b/simple/tests/stats/test_data/events_importer/expected/countryalpha3codes.triples.db.csv @@ -237,6 +237,14 @@ c/p/1,typeOf,Provenance, c/p/1,name,,Test Provenance c/p/1,source,c/s/1, c/p/1,url,,http://source.com/provenance +custom/g/Root,typeOf,StatVarGroup, +custom/g/Root,name,,Custom Variables +custom/g/Root,specializationOf,dc/g/Root, +Count_CrimeEvent,typeOf,StatisticalVariable, +Count_CrimeEvent,name,,Count_CrimeEvent +Count_CrimeEvent,memberOf,custom/g/Root, +Count_CrimeEvent,includedIn,c/p/1, +Count_CrimeEvent,includedIn,c/s/1, CrimeEvent,typeOf,Class, CrimeEvent,subClassOf,Event, CrimeEvent,name,,Crime Event diff --git a/simple/tests/stats/test_data/events_importer/expected/idcolumns.observations.db.csv b/simple/tests/stats/test_data/events_importer/expected/idcolumns.observations.db.csv index 8b137891..07363c24 100644 --- a/simple/tests/stats/test_data/events_importer/expected/idcolumns.observations.db.csv +++ b/simple/tests/stats/test_data/events_importer/expected/idcolumns.observations.db.csv @@ -1 +1,6 @@ - +entity,variable,date,value,provenance +country/USA,Crime_Event2_Count,2023-11-08,2,c/p/1 +country/BRA,Crime_Event2_Count,2023-11-08,2,c/p/1 +country/CHN,Crime_Event2_Count,2023-11-08,1,c/p/1 +country/CHN,Crime_Event2_Count,2023-09-17,1,c/p/1 +country/USA,Crime_Event2_Count,2023-08-02,1,c/p/1 diff --git a/simple/tests/stats/test_data/events_importer/expected/idcolumns.triples.db.csv b/simple/tests/stats/test_data/events_importer/expected/idcolumns.triples.db.csv index cf696f65..85cd0dde 100644 --- a/simple/tests/stats/test_data/events_importer/expected/idcolumns.triples.db.csv +++ b/simple/tests/stats/test_data/events_importer/expected/idcolumns.triples.db.csv @@ -237,6 +237,14 @@ c/p/1,typeOf,Provenance, c/p/1,name,,Test Provenance c/p/1,source,c/s/1, c/p/1,url,,http://source.com/provenance +custom/g/Root,typeOf,StatVarGroup, +custom/g/Root,name,,Custom Variables +custom/g/Root,specializationOf,dc/g/Root, +Crime_Event2_Count,typeOf,StatisticalVariable, +Crime_Event2_Count,name,,Number of Crime2 Events +Crime_Event2_Count,memberOf,custom/g/Root, +Crime_Event2_Count,includedIn,c/p/1, +Crime_Event2_Count,includedIn,c/s/1, CrimeEvent2,typeOf,Class, CrimeEvent2,subClassOf,Event, CrimeEvent2,name,,CrimeEvent2 diff --git a/simple/tests/stats/test_data/events_importer/input/config.json b/simple/tests/stats/test_data/events_importer/input/config.json index c043372d..e92f14c8 100644 --- a/simple/tests/stats/test_data/events_importer/input/config.json +++ b/simple/tests/stats/test_data/events_importer/input/config.json @@ -3,13 +3,15 @@ "countryalpha3codes.csv": { "eventType": "CrimeEvent", "entityType": "Country", - "provenance": "Test Provenance" + "provenance": "Test Provenance", + "computedVariables": ["Count_CrimeEvent"] }, "idcolumns.csv": { "eventType": "CrimeEvent2", "entityType": "Country", "provenance": "Test Provenance", - "idColumn": "CASE" + "idColumn": "CASE", + "computedVariables": ["Crime Event2 Count"] } }, "events": { @@ -18,6 +20,14 @@ "description": "Crime Event description" } }, + "variables": { + "Crime Event2 Count": { + "name": "Number of Crime2 Events", + "aggregation": { + "period": "day" + } + } + }, "sources": { "Test Source": { "url": "http://source.com", diff --git a/simple/tests/stats/test_data/nodes/expected/triples.csv b/simple/tests/stats/test_data/nodes/expected/triples.csv index 4ff30b47..37914998 100644 --- a/simple/tests/stats/test_data/nodes/expected/triples.csv +++ b/simple/tests/stats/test_data/nodes/expected/triples.csv @@ -33,27 +33,27 @@ custom/g/group_3,includedIn,c/s/default, custom/g/Root,typeOf,StatVarGroup, custom/g/Root,name,,Custom Variables custom/g/Root,specializationOf,dc/g/Root, -custom/statvar_1,typeOf,StatisticalVariable, -custom/statvar_1,name,,Variable 1 -custom/statvar_1,memberOf,custom/g/group_2, -custom/statvar_1,includedIn,c/p/1, -custom/statvar_1,includedIn,c/s/1, -custom/statvar_2,typeOf,StatisticalVariable, -custom/statvar_2,name,,Variable 2 -custom/statvar_2,memberOf,custom/g/group_2, -custom/statvar_2,includedIn,c/p/1, -custom/statvar_2,includedIn,c/s/1, +Variable_1,typeOf,StatisticalVariable, +Variable_1,name,,Variable 1 +Variable_1,memberOf,custom/g/group_2, +Variable_1,includedIn,c/p/1, +Variable_1,includedIn,c/s/1, +Variable_2,typeOf,StatisticalVariable, +Variable_2,name,,Variable 2 +Variable_2,memberOf,custom/g/group_2, +Variable_2,includedIn,c/p/1, +Variable_2,includedIn,c/s/1, var3,typeOf,StatisticalVariable, var3,name,,Var 3 Name var3,description,,Var 3 Description var3,memberOf,custom/g/group_3, var3,includedIn,c/p/default, var3,includedIn,c/s/default, -custom/statvar_3,typeOf,StatisticalVariable, -custom/statvar_3,name,,Variable with no config -custom/statvar_3,memberOf,custom/g/Root, -custom/statvar_3,includedIn,c/p/default, -custom/statvar_3,includedIn,c/s/default, +Variable_with_no_config,typeOf,StatisticalVariable, +Variable_with_no_config,name,,Variable with no config +Variable_with_no_config,memberOf,custom/g/Root, +Variable_with_no_config,includedIn,c/p/default, +Variable_with_no_config,includedIn,c/s/default, CrimeEvent,typeOf,Class, CrimeEvent,subClassOf,Event, CrimeEvent,name,,Crime Event