diff --git a/simple/stats/README.md b/simple/stats/README.md
index 1b3d22a6..e1db8aa7 100644
--- a/simple/stats/README.md
+++ b/simple/stats/README.md
@@ -16,6 +16,8 @@ The first 2 columns of input CSVs should be place names (or more generically _en
 
 The output `observations.csv` can be imported directly into sqlite. A sample output CSV can be found [here](sample/countries/observations.csv).
 
+The program also outputs a `debug_resolve.csv` file. This is for checking whether names were resolved to the correct DCIDs and for addressing any unresolved ones. A sample CSV can be found [here](sample/countries/debug_resolve.csv).
+
 ## Other options
 
 To see all parameters and overrides supported by the script:
diff --git a/simple/stats/constants.py b/simple/stats/constants.py
index e7aa7da7..b86f9592 100644
--- a/simple/stats/constants.py
+++ b/simple/stats/constants.py
@@ -16,13 +16,24 @@
 
 # Defaults.
 DEFAULT_DATA_DIR = ".data"
-DEFAULT_INPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "input")
+DEFAULT_INPUT_PATH = os.path.join(DEFAULT_DATA_DIR, "input")
 DEFAULT_OUTPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "output")
 
 OBSERVATIONS_FILE_NAME = "observations.csv"
+DEBUG_RESOLVE_FILE_NAME = "debug_resolve.csv"
 
 # Observations CSV columns.
 COLUMN_DCID = "dcid"
 COLUMN_VARIABLE = "variable"
 COLUMN_DATE = "date"
 COLUMN_VALUE = "value"
+
+# Debug CSV columns and values
+DEBUG_COLUMN_NAME = "name"
+DEBUG_COLUMN_DCID = "dcid"
+DEBUG_COLUMN_LINK = "link"
+DEBUG_UNRESOLVED_DCID = "*UNRESOLVED*"
+
+# DC links
+DC_HOME = "https://datacommons.org"
+DC_BROWSER = "https://datacommons.org/browser"
diff --git a/simple/stats/importer.py b/simple/stats/importer.py
index ed4122cc..0c9d20aa 100644
--- a/simple/stats/importer.py
+++ b/simple/stats/importer.py
@@ -28,14 +28,17 @@
 # TODO: Add support for units.
 class SimpleStatsImporter:
 
-  def __init__(self, input_dir: str, output_dir: str,
+  def __init__(self, input_path: str, output_dir: str,
                entity_type: str) -> None:
-    self.input_dir = input_dir
+    self.input_path = input_path
     self.output_dir = output_dir
     self.observations_file = os.path.join(output_dir,
                                           constants.OBSERVATIONS_FILE_NAME)
+    self.debug_resolve_file = os.path.join(
+        output_dir, constants.DEBUG_RESOLVE_FILE_NAME)
     self.entity_type = entity_type
     self.df = pd.DataFrame()
+    self.debug_resolve_df = None
 
   def do_import(self) -> None:
     self._init()
@@ -44,21 +47,29 @@ def do_import(self) -> None:
     self._resolve_entities()
     self._unpivot_variables()
     self._reorder_columns()
-    self._write_csv()
+    self._write_csvs()
 
   def _init(self):
     os.makedirs(self.output_dir, exist_ok=True)
 
   def _read_csvs(self) -> None:
+    if os.path.isdir(self.input_path):
+      self.df = SimpleStatsImporter._read_csvs_from_dir(self.input_path)
+    else:
+      self.df = pd.read_csv(self.input_path)
+
+    logging.info("Read %s rows.", self.df.index.size)
+
+  @staticmethod
+  def _read_csvs_from_dir(input_dir: str) -> pd.DataFrame:
     files = [
-        os.path.join(self.input_dir, filename)
-        for filename in os.listdir(self.input_dir)
+        os.path.join(input_dir, filename)
+        for filename in os.listdir(input_dir)
     ]
     df = pd.DataFrame()
     for file in files:
       df = pd.concat([df, pd.read_csv(file)])
-    logging.info("Read %s rows.", df.index.size)
-    self.df = df
+    return df
 
   def _rename_columns(self) -> None:
     df = self.df
@@ -77,13 +88,40 @@ def _resolve_entities(self) -> None:
     logging.info("Resolved %s of %s entities.", len(dcids), len(entities))
     column.replace(dcids, inplace=True)
     unresolved = set(entities).difference(set(dcids.keys()))
-    if unresolved:
-      unresolved_list = list(unresolved)
+    unresolved_list = list(unresolved)
+    if unresolved_list:
       logging.warning("# unresolved entities which will be dropped: %s",
                       len(unresolved_list))
       logging.warning("Dropped entities: %s", unresolved_list)
-      df.drop(df[df.iloc[:, 0].isin(values=unresolved)].index,
+      df.drop(df[df.iloc[:, 0].isin(values=unresolved_list)].index,
               inplace=True)
+    self._create_debug_resolve_dataframe(resolved=dcids,
+                                         unresolved=unresolved_list)
+
+  def _create_debug_resolve_dataframe(self, resolved: dict[str, str],
+                                      unresolved: list[str]):
+    # Add unresolved names first
+    names = unresolved[:]
+    dcids = [constants.DEBUG_UNRESOLVED_DCID] * len(unresolved)
+
+    # Add resolved names and dcids
+    names.extend(list(resolved.keys()))
+    dcids.extend(list(resolved.values()))
+
+    # Create browser links
+    links = []
+    for dcid in dcids:
+      if dcid == constants.DEBUG_UNRESOLVED_DCID:
+        links.append("")
+      else:
+        links.append(f"{constants.DC_BROWSER}/{dcid}")
+
+    # Create dataframe
+    self.debug_resolve_df = pd.DataFrame({
+        constants.DEBUG_COLUMN_NAME: names,
+        constants.DEBUG_COLUMN_DCID: dcids,
+        constants.DEBUG_COLUMN_LINK: links,
+    })
 
   def _unpivot_variables(self) -> None:
     self.df = self.df.melt(
@@ -100,7 +138,11 @@ def _reorder_columns(self) -> None:
         constants.COLUMN_VALUE,
     ])
 
-  def _write_csv(self) -> None:
+  def _write_csvs(self) -> None:
     logging.info("Writing %s observations to: %s", self.df.index.size,
                  self.observations_file)
     self.df.to_csv(self.observations_file, index=False)
+    if self.debug_resolve_df is not None:
+      logging.info("Writing resolutions (for debugging) to: %s",
+                   self.debug_resolve_file)
+      self.debug_resolve_df.to_csv(self.debug_resolve_file, index=False)
diff --git a/simple/stats/main.py b/simple/stats/main.py
index 86de36d7..e91ee933 100644
--- a/simple/stats/main.py
+++ b/simple/stats/main.py
@@ -32,15 +32,15 @@
     None,
     "The type of entities in the CSV (e.g. 'City', 'Country', 'Company', etc.).",
 )
-flags.DEFINE_string("input_dir", constants.DEFAULT_INPUT_DIR,
-                    "The input directory.")
+flags.DEFINE_string("input_path", constants.DEFAULT_INPUT_PATH,
+                    "The input directory or file.")
 flags.DEFINE_string("output_dir", constants.DEFAULT_OUTPUT_DIR,
                     "The output directory.")
 
 
 def main(_):
   importer = SimpleStatsImporter(
-      input_dir=FLAGS.input_dir,
+      input_path=FLAGS.input_path,
       output_dir=FLAGS.output_dir,
       entity_type=FLAGS.entity_type,
   )
diff --git a/simple/stats/sample/countries/debug_resolve.csv b/simple/stats/sample/countries/debug_resolve.csv
new file mode 100644
index 00000000..c2bb70d4
--- /dev/null
+++ b/simple/stats/sample/countries/debug_resolve.csv
@@ -0,0 +1,15 @@
+name,dcid,link
+West Bank and Gaza,*UNRESOLVED*,
+Cabo Verde,*UNRESOLVED*,
+Afghanistan,country/AFG,https://datacommons.org/browser/country/AFG
+Albania,country/ALB,https://datacommons.org/browser/country/ALB
+Algeria,country/DZA,https://datacommons.org/browser/country/DZA
+American Samoa,country/ASM,https://datacommons.org/browser/country/ASM
+Andorra,country/AND,https://datacommons.org/browser/country/AND
+Angola,country/AGO,https://datacommons.org/browser/country/AGO
+Anguilla,country/AIA,https://datacommons.org/browser/country/AIA
+Wallis and Futuna Islands,country/WLF,https://datacommons.org/browser/country/WLF
+Western Sahara,country/ESH,https://datacommons.org/browser/country/ESH
+Yemen,country/YEM,https://datacommons.org/browser/country/YEM
+Zambia,country/ZMB,https://datacommons.org/browser/country/ZMB
+Zimbabwe,country/ZWE,https://datacommons.org/browser/country/ZWE
diff --git a/simple/stats/sample/powerplants/debug_resolve.csv b/simple/stats/sample/powerplants/debug_resolve.csv
new file mode 100644
index 00000000..ab4bcc44
--- /dev/null
+++ b/simple/stats/sample/powerplants/debug_resolve.csv
@@ -0,0 +1,15 @@
+name,dcid,link
+FOO BAR,*UNRESOLVED*,
+BAZ BAR,*UNRESOLVED*,
+Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
+Crete Energy Venture,dc/009cxnrd9h8x6,https://datacommons.org/browser/dc/009cxnrd9h8x6
+Watchtower Educational Center,dc/00d76gnyx8p7b,https://datacommons.org/browser/dc/00d76gnyx8p7b
+Union Power,dc/00jy62n5m9bt9,https://datacommons.org/browser/dc/00jy62n5m9bt9
+Pearl Station,dc/00w9rbw8yn7x7,https://datacommons.org/browser/dc/00w9rbw8yn7x7
+Austin Gas Recovery,dc/00zjgb4rjchx3,https://datacommons.org/browser/dc/00zjgb4rjchx3
+Gordon,dc/011s19rm0mzh1,https://datacommons.org/browser/dc/011s19rm0mzh1
+White River Lock and Dam 2,dc/017y3py1dzkmg,https://datacommons.org/browser/dc/017y3py1dzkmg
+Bristol Plant,dc/01blq25mdxzs5,https://datacommons.org/browser/dc/01blq25mdxzs5
+Edison Sault,dc/01xe39q7j5x45,https://datacommons.org/browser/dc/01xe39q7j5x45
+Navajo Dam,dc/02b53twnh3fx,https://datacommons.org/browser/dc/02b53twnh3fx
+CNN Center,dc/0lh5h07dsvl23,https://datacommons.org/browser/dc/0lh5h07dsvl23
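
Not part of the change itself, just a reviewer's aid: a minimal Python sketch of how the new `debug_resolve.csv` output can be inspected, assuming pandas is installed and the snippet is run from the repository root.

```python
import pandas as pd

# Load the sample debug output added in this change (path relative to the
# repository root).
debug_df = pd.read_csv("simple/stats/sample/countries/debug_resolve.csv")

# Rows whose dcid is "*UNRESOLVED*" were dropped by the importer and still
# need attention; the remaining rows link to their Data Commons browser pages.
unresolved = debug_df[debug_df["dcid"] == "*UNRESOLVED*"]
print(unresolved["name"].tolist())  # ['West Bank and Gaza', 'Cabo Verde']
```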