From 350ecb7f0a80e800fdb59919953fcde3beec5eac Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Mon, 1 Aug 2022 17:42:25 +0200 Subject: [PATCH 1/3] [#43] Cache downloaded data; Implement feature with decorator; added tests --- .pylintrc | 2 +- nb/download_cubefile.ipynb | 201 ++++++++++++++++++++++++++++++++++++ nb/download_tablefile.ipynb | 14 +-- nb/parse_cube.ipynb | 168 +++++++++++------------------- src/pygenesis/cache.py | 52 ++++++++++ src/pygenesis/config.py | 4 + src/pygenesis/cube.py | 135 ++++++++++++------------ src/pygenesis/table.py | 21 ++-- tests/test_cache.py | 92 +++++++++++++++++ tests/test_config.py | 8 +- 10 files changed, 502 insertions(+), 195 deletions(-) create mode 100644 nb/download_cubefile.ipynb create mode 100644 src/pygenesis/cache.py create mode 100644 tests/test_cache.py diff --git a/.pylintrc b/.pylintrc index 8d6a9b1..4deabf8 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ ignore=third_party # Files or directories matching the regex patterns are skipped. The regex # matches against base names, not paths. -ignore-patterns= +ignore-patterns=test_.* # Pickle collected data for later comparisons. 
persistent=no diff --git a/nb/download_cubefile.ipynb b/nb/download_cubefile.ipynb new file mode 100644 index 0000000..f72c13d --- /dev/null +++ b/nb/download_cubefile.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "69e1d305", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ff9eca4f", + "metadata": {}, + "outputs": [], + "source": [ + "# only if you get an error from below\n", + "# from pygenesis import init_config\n", + "# init_config()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4a207a77", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from pygenesis.cube import get_cubefile_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6e6df177", + "metadata": {}, + "outputs": [], + "source": [ + "params = {\"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n", + "data = get_cubefile_data(name=\"47414BJ002\", **params)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a8bcd5b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DINSGWZ08N7WERTE4JAHRUMS103QUALITAETGESPERRTWERT-VERFAELSCHT
0DGWZ08-49-01NOMINAL2015100.0eNaN0.0
1DGWZ08-49-01NOMINAL201699.3eNaN0.0
2DGWZ08-49-01NOMINAL2017105.7eNaN0.0
3DGWZ08-49-01NOMINAL2018111.6eNaN0.0
4DGWZ08-49-01NOMINAL2019115.6eNaN0.0
\n", + "
" + ], + "text/plain": [ + " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e NaN \n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e NaN \n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e NaN \n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e NaN \n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e NaN \n", + "\n", + " WERT-VERFAELSCHT \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed610c9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nb/download_tablefile.ipynb b/nb/download_tablefile.ipynb index 8cef7a0..5c4fe4d 100644 --- a/nb/download_tablefile.ipynb +++ b/nb/download_tablefile.ipynb @@ -14,15 +14,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error while loading the config file. Could not find C:\\Users\\micha\\AppData\\Local\\Temp\\pytest-of-micha\\pytest-78\\.pygenesis3\\config.ini. Please make sure to run init_config() first. 
\n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from pygenesis.table import get_tablefile_data" @@ -44,13 +36,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "data = get_tablefile_data(\"61111-0002\", table_area=all)" + "data = get_tablefile_data(name=\"61111-0002\", table_area=all)" ] }, { diff --git a/nb/parse_cube.ipynb b/nb/parse_cube.ipynb index e2fa37b..6c84c2e 100644 --- a/nb/parse_cube.ipynb +++ b/nb/parse_cube.ipynb @@ -16,7 +16,7 @@ "metadata": {}, "outputs": [], "source": [ - "# only if you get an error from above\n", + "# only if you get an error from below\n", "# from pygenesis import init_config\n", "# init_config()" ] @@ -29,8 +29,8 @@ "source": [ "import pandas as pd\n", "\n", - "from pygenesis.destatis import get_cubefile\n", - "from pygenesis.cube import parse_cube, rename_axes" + "from pygenesis.cube import parse_cube, rename_axes\n", + "from pygenesis.http_helper import get_response_from_endpoint" ] }, { @@ -53,23 +53,16 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "D:\\git\\correlaid\\genesis-python\\.venv\\lib\\site-packages\\urllib3\\connectionpool.py:1043: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www-genesis.destatis.de'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "data = get_cubefile({\"name\": \"47414BJ002\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"})" + "params = {\"name\": \"47414BJ002\", \"area\": \"all\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n", + "response = get_response_from_endpoint(\"data\", \"cubefile\", params)\n", + "data = response.text" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -78,7 +71,7 @@ "(str, 79264)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -89,13 +82,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 30.07.2022 um 20:43:34 diesen Export angestossen.',\n", + "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 01.08.2022 um 08:16:00 diesen Export angestossen.',\n", " 'K;DQ;FACH-SCHL;GHH-ART;GHM-WERTE-JN;GENESIS-VBD;REGIOSTAT;EU-VBD;\"mit Werten\"',\n", " 'D;47414BJ002;;N;N;N;N',\n", " 'K;DQ-ERH;FACH-SCHL',\n", @@ -117,7 +110,7 @@ " 'D;DG;WZ08-49-01;NOMINAL;2020;96.0;e;;0.0']" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -137,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -146,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -164,36 +157,23 @@ " 0 JAHR 4 4,\n", " 'DQI': NAME ME-NAME DST TYP NKM-STELLEN GHH-ART GHM-WERTE-JN\n", " 0 UMS103 2015=100 FEST PROZENT 1 N,\n", - " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n", - " 0 DG WZ08-49-01 NOMINAL 2015 100.0 
e \n", - " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n", - " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - " ... ... ... ... ... ... ... ... \n", - " 2018 DG WZ08-N REAL 2017 108.4 e \n", - " 2019 DG WZ08-N REAL 2018 110.6 e \n", - " 2020 DG WZ08-N REAL 2019 110.8 e \n", - " 2021 DG WZ08-N REAL 2020 94.1 e \n", - " 2022 DG WZ08-N REAL 2021 101.2 p \n", - " \n", - " WERT-VERFAELSCHT \n", - " 0 0.0 \n", - " 1 0.0 \n", - " 2 0.0 \n", - " 3 0.0 \n", - " 4 0.0 \n", - " ... ... \n", - " 2018 0.0 \n", - " 2019 0.0 \n", - " 2020 0.0 \n", - " 2021 0.0 \n", - " 2022 0.0 \n", + " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n", + " 0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + " ... ... ... ... ... ... ... ... ...\n", + " 2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + " 2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + " 2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + " 2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + " 2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", " \n", " [2023 rows x 8 columns]}" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -204,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -366,36 +346,23 @@ "" ], "text/plain": [ - " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n", - "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n", - "1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n", - "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - "... ... ... ... ... ... ... ... 
\n", - "2018 DG WZ08-N REAL 2017 108.4 e \n", - "2019 DG WZ08-N REAL 2018 110.6 e \n", - "2020 DG WZ08-N REAL 2019 110.8 e \n", - "2021 DG WZ08-N REAL 2020 94.1 e \n", - "2022 DG WZ08-N REAL 2021 101.2 p \n", - "\n", - " WERT-VERFAELSCHT \n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 0.0 \n", - "... ... \n", - "2018 0.0 \n", - "2019 0.0 \n", - "2020 0.0 \n", - "2021 0.0 \n", - "2022 0.0 \n", + " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", "\n", "[2023 rows x 8 columns]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -444,13 +411,13 @@ " \n", " \n", " 1\n", - " ALT041\n", + " WZ08N7\n", " 2\n", " 2\n", " \n", " \n", " 2\n", - " FAMST2\n", + " WERTE4\n", " 3\n", " 3\n", " \n", @@ -461,11 +428,11 @@ "text/plain": [ " NAME RHF-BSR RHF-ACHSE\n", "0 DINSG 1 1\n", - "1 ALT041 2 2\n", - "2 FAMST2 3 3" + "1 WZ08N7 2 2\n", + "2 WERTE4 3 3" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -485,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -647,36 +614,23 @@ "" ], "text/plain": [ - " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n", - "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n", - "1 DG WZ08-49-01 
NOMINAL 2016 99.3 e \n", - "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - "... ... ... ... ... ... ... ... \n", - "2018 DG WZ08-N REAL 2017 108.4 e \n", - "2019 DG WZ08-N REAL 2018 110.6 e \n", - "2020 DG WZ08-N REAL 2019 110.8 e \n", - "2021 DG WZ08-N REAL 2020 94.1 e \n", - "2022 DG WZ08-N REAL 2021 101.2 p \n", - "\n", - " WERT-VERFAELSCHT \n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 0.0 \n", - "... ... \n", - "2018 0.0 \n", - "2019 0.0 \n", - "2020 0.0 \n", - "2021 0.0 \n", - "2022 0.0 \n", + " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT WERT-VERFAELSCHT\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", "\n", "[2023 rows x 8 columns]" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } diff --git a/src/pygenesis/cache.py b/src/pygenesis/cache.py new file mode 100644 index 0000000..ca5372e --- /dev/null +++ b/src/pygenesis/cache.py @@ -0,0 +1,52 @@ +"""Module provides functions/decorators to cache downloaded data.""" +import logging +from datetime import date +from functools import wraps +from pathlib import Path +from typing import Callable + +import pandas as pd + +from pygenesis.config import load_config + +logger = logging.getLogger(__name__) + + +def cache_data(func: Callable) -> Callable: + """Store downloaded data on disk with download time as parent folder. + + Args: + func (Callable): One of the data methods of the data endpoint. 
+ """ + + @wraps(func) + def wrapper_func(**kwargs): + config = load_config() + cache_dir = Path(config["DATA"]["cache_dir"]) + + if not cache_dir.is_dir() or not cache_dir.exists(): + logger.critical( + "Cache dir does not exist! Please make sure init_config() was run properly. Path: %s", + cache_dir, + ) + + name = kwargs["name"] + data_dir = cache_dir / name + if data_dir.exists(): + # TODO: Implement solution for updated data. + # So don't return latest version but check first for newer version in GENESIS. + # if data_dir exists, there has to be at least one stored version of this data + versions = sorted((p.name for p in data_dir.glob("*")), key=int) + latest = versions[-1] + data = pd.read_csv(data_dir / latest / f"{name}.xz") + else: + data: pd.DateFrame = func(**kwargs) + file_path = ( + data_dir / str(date.today()).replace("-", "") / f"{name}.xz" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + data.to_csv(file_path, index=False) + + return data + + return wrapper_func diff --git a/src/pygenesis/config.py b/src/pygenesis/config.py index 9018a68..8f48d2b 100644 --- a/src/pygenesis/config.py +++ b/src/pygenesis/config.py @@ -77,6 +77,10 @@ def init_config(config_dir: Path = DEFAULT_CONFIG_DIR) -> None: config = _create_default_config() _write_config(config, config_file) + cache_dir = Path(config["DATA"]["cache_dir"]) + if not cache_dir.exists(): + cache_dir.mkdir() + logger.info("New config was created. Path: %s.", config_file) diff --git a/src/pygenesis/cube.py b/src/pygenesis/cube.py index 9df44ee..0015b8b 100644 --- a/src/pygenesis/cube.py +++ b/src/pygenesis/cube.py @@ -3,78 +3,41 @@ import pandas as pd +from pygenesis.cache import cache_data +from pygenesis.http_helper import get_response_from_endpoint -def is_cube_metadata_header(line: str) -> bool: - """Check if a line is a cube metadata header. - Args: - line (str): A single line of a cubefile. - - Returns: - bool: True if the line starts with a "K", False otherwise. 
- """ - return line[0] == "K" +@cache_data +def get_cubefile_data( + *, name: str, area: str = "all", **kwargs +) -> pd.DataFrame: + """Return cube file data as pandas data frame. - -def get_cube_metadata_header_type(line: str) -> str: - """Return the header type. + Based on the cube name, cube area and additional query parameters the + cubefile method from the data-endpoint will be queried. Args: - line (str): A single line of a cubefile. + name (str): Name of the cube. + area (str, optional): Area of the cube. Defaults to "all". Returns: - str: The header type, which is the second entry in the header. + pd.DataFrame: Parsed cube file. """ - return line.split(";")[1] - - -def get_cube_metadata_header( - line: str, rename_duplicates: bool = False -) -> list[str]: - """Return the metadata header of a cubefile. - - Args: - line (str): A single line of a cubefile. - rename_duplicates (bool, optional): If False, the raw header is returned. - If True, identical column names are appended with a unique counter. - Defaults to False. - - Returns: - list[str]: A list of column names, except for "nur Werte" and "mit Werten". 
- """ - raw_header = line.split(";")[2:] - raw_header = [ - name - for name in raw_header - if name not in ['"nur Werte"', '"mit Werten"'] - ] + kwargs = kwargs or {} - if not rename_duplicates: - return raw_header + params = { + "name": name, + "area": area, + "format": "csv", + } - # header can have multiple entries with same label, which is problematic for pandas - # so lets just add a counter - header = [""] * len(raw_header) - for name in set(raw_header): - if raw_header.count(name) == 1: - header[raw_header.index(name)] = name - else: - for counter in range(raw_header.count(name)): - header[raw_header.index(name) + counter] = f"{name}-{counter+1}" + params |= kwargs - return header + response = get_response_from_endpoint("data", "cubefile", params) + cube_data = response.text + cube = rename_axes(parse_cube(cube_data)) - -def parse_cube_data_line(line: str) -> list[str]: - """Return the content of a cube data line. - - Args: - line (str): A single line of a cubefile. - - Returns: - list[str]: The content of a cube data line, omitting the first element. 
- """ - return line.split(";")[1:] + return cube["QEI"] def parse_cube(data: str) -> dict: @@ -92,19 +55,19 @@ def parse_cube(data: str) -> dict: for line in data.splitlines(): # skip all rows until first header - if header is None and not is_cube_metadata_header(line): + if header is None and not _is_cube_metadata_header(line): continue - if is_cube_metadata_header(line): + if _is_cube_metadata_header(line): if data_block: cube[header_type] = pd.DataFrame(data_block, columns=header) - header = get_cube_metadata_header(line, rename_duplicates=True) - header_type: str = get_cube_metadata_header_type(line) + header = _get_cube_metadata_header(line, rename_duplicates=True) + header_type: str = _get_cube_metadata_header_type(line) data_block = [] continue - line_content = parse_cube_data_line(line) + line_content = _parse_cube_data_line(line) data_block.append(line_content) # the last data block has no header after it so we have to do it here @@ -157,3 +120,45 @@ def rename_axes( cube["QEI"].rename(columns=dict(zip(old_cols, new_cols)), inplace=True) return cube + + +def _is_cube_metadata_header(line: str) -> bool: + """Check if a line is a cube metadata header.""" + return line[0] == "K" + + +def _get_cube_metadata_header_type(line: str) -> str: + """Return the header type.""" + return line.split(";")[1] + + +def _get_cube_metadata_header( + line: str, rename_duplicates: bool = False +) -> list[str]: + """Return the metadata header of a cubefile.""" + raw_header = line.split(";")[2:] + raw_header = [ + name + for name in raw_header + if name not in ['"nur Werte"', '"mit Werten"'] + ] + + if not rename_duplicates: + return raw_header + + # header can have multiple entries with same label, which is problematic for pandas + # so lets just add a counter + header = [""] * len(raw_header) + for name in set(raw_header): + if raw_header.count(name) == 1: + header[raw_header.index(name)] = name + else: + for counter in range(raw_header.count(name)): + 
header[raw_header.index(name) + counter] = f"{name}-{counter+1}" + + return header + + +def _parse_cube_data_line(line: str) -> list[str]: + """Return the content of a cube data line.""" + return line.split(";")[1:] diff --git a/src/pygenesis/table.py b/src/pygenesis/table.py index af9b3c4..c2e4a20 100644 --- a/src/pygenesis/table.py +++ b/src/pygenesis/table.py @@ -1,35 +1,38 @@ """Module contains business logic related to destatis tables.""" import pandas as pd +from pygenesis.cache import cache_data from pygenesis.csv_helper import get_df_from_text from pygenesis.http_helper import get_response_from_endpoint +@cache_data def get_tablefile_data( - table_name: str, table_area: str = "all", **kwargs + *, name: str, area: str = "all", **kwargs ) -> pd.DataFrame: - """ + """Return table file data as pandas data frame. + Based on the table name, table area and additional query parameters the tablefile method from the data-endpoint will be queried. Args: - table_name (str): Name of the table - table_area (str, optional): Area of the table (Defaul: all) - query_params (dict, optional): Additional query parameters - (Default: None) + name (str): Name of the table. + area (str, optional): Area of the table. Defaults to "all". + Returns: - pd.DataFrame + pd.DataFrame: Parsed table file. 
""" kwargs = kwargs or {} params = { - "name": table_name, - "area": table_area, + "name": name, + "area": area, "format": "ffcsv", } params |= kwargs response = get_response_from_endpoint("data", "tablefile", params) + return get_df_from_text(response.text) diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..919436e --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,92 @@ +import time +from datetime import date +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from pygenesis.cache import cache_data +from pygenesis.config import ( + DEFAULT_SETTINGS_FILE, + _write_config, + init_config, + load_settings, +) + +SLEEP_TIME = 0.1 + + +@pytest.fixture() +def cache_dir(tmp_path_factory): + return tmp_path_factory.mktemp(".pygenesis") + + +@pytest.fixture(autouse=True) +def restore_settings(): + old_settings = load_settings() + yield + _write_config(old_settings, DEFAULT_SETTINGS_FILE) + + +@cache_data +def decorated_data(*, name): + time.sleep(SLEEP_TIME) + return pd.DataFrame( + np.random.random(size=(10, 5)), columns=["a", "b", "c", "d", "e"] + ) + + +def test_cache_data_wrapper(cache_dir): + init_config(cache_dir) + + assert len(list((cache_dir / "data").glob("*"))) == 0 + + data = decorated_data(name="test_cache_decorator") + + assert isinstance(data, pd.DataFrame) + assert not data.empty + + cached_data_file: Path = ( + cache_dir + / "data" + / "test_cache_decorator" + / str(date.today()).replace("-", "") + / "test_cache_decorator.xz" + ) + + assert cached_data_file.exists() and cached_data_file.is_file() + + objs_in_data = [p for p in cache_dir.joinpath("data").glob("*") if p] + + assert len(objs_in_data) == 1 + assert objs_in_data[0] == cache_dir / "data" / "test_cache_decorator" + + objs_in_name_dir = [ + p + for p in cache_dir.joinpath("data/test_cache_decorator").glob("*") + if p + ] + + assert len(objs_in_name_dir) == 1 + assert objs_in_name_dir[0] == cached_data_file.parent + + 
restored_data = pd.read_csv(cached_data_file) + + pd.testing.assert_frame_equal(data, restored_data, check_index_type=False) + + +def test_cache_data_twice(cache_dir): + init_config(cache_dir) + + load_time = time.perf_counter() + data = decorated_data(name="test_cache_decorator") + load_time = time.perf_counter() - load_time + + assert load_time >= SLEEP_TIME + + load_time = time.perf_counter() + data = decorated_data(name="test_cache_decorator") + load_time = time.perf_counter() - load_time + + assert load_time < SLEEP_TIME diff --git a/tests/test_config.py b/tests/test_config.py index c0523ae..32c0762 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,8 +16,7 @@ @pytest.fixture() def config_dir(tmp_path_factory): - config_dir = tmp_path_factory.mktemp(".pygenesis") - return config_dir + return tmp_path_factory.mktemp(".pygenesis") @pytest.fixture(autouse=True) @@ -55,12 +54,17 @@ def test_init_config_with_config_dir(config_dir, caplog): assert caplog.records[1].levelname == "INFO" assert "Settings file updated" in caplog.text assert "New config was created" in caplog.text + assert (config_dir / "data").exists() config = load_config() assert isinstance(config, ConfigParser) assert len(config.sections()) > 0 + assert config["DATA"]["cache_dir"] == str(config_dir / "data") + assert len(list((config_dir / "data").glob("*"))) == 0 + config_file = get_config_path_from_settings() + assert config_file.exists() and config_file.is_file() From 8c10c826b7a4ce8cb9b91a9aec9a282db21553ad Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Mon, 1 Aug 2022 18:55:59 +0200 Subject: [PATCH 2/3] fix pylint errors; refactor code so that common code for downloading data is now in data.py module and table.py and cube.py only hold specific parsing logic --- nb/download_cubefile.ipynb | 12 ++-- nb/download_tablefile.ipynb | 127 +++++++++++++++++++++++++++++++++--- src/pygenesis/cube.py | 30 ++------- src/pygenesis/data.py | 58 ++++++++++++++++ src/pygenesis/table.py | 
28 +------- 5 files changed, 188 insertions(+), 67 deletions(-) create mode 100644 src/pygenesis/data.py diff --git a/nb/download_cubefile.ipynb b/nb/download_cubefile.ipynb index f72c13d..eb5a2a2 100644 --- a/nb/download_cubefile.ipynb +++ b/nb/download_cubefile.ipynb @@ -25,30 +25,30 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "4a207a77", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", - "from pygenesis.cube import get_cubefile_data" + "from pygenesis.data import get_data" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "id": "6e6df177", "metadata": {}, "outputs": [], "source": [ "params = {\"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n", - "data = get_cubefile_data(name=\"47414BJ002\", **params)" + "data = get_data(name=\"47414BJ002\", method=\"cubefile\", **params)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "id": "a8bcd5b4", "metadata": {}, "outputs": [ @@ -159,7 +159,7 @@ "4 0.0 " ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/nb/download_tablefile.ipynb b/nb/download_tablefile.ipynb index 5c4fe4d..8bb7743 100644 --- a/nb/download_tablefile.ipynb +++ b/nb/download_tablefile.ipynb @@ -12,12 +12,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", - "from pygenesis.table import get_tablefile_data" + "from pygenesis.data import get_data" ] }, { @@ -36,27 +36,27 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "data = get_tablefile_data(name=\"61111-0002\", table_area=all)" + "data = get_data(name=\"61111-0002\", method=\"tablefile\", table_area=all)" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 11, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ - "(pandas.core.frame.DataFrame, 20)" + "(pandas.core.frame.DataFrame, 24)" ] }, - "execution_count": 4, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -67,7 +67,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -490,6 +490,82 @@ " ...\n", " ...\n", " \n", + " \n", + " 20\n", + " 61111\n", + " Verbraucherpreisindex für Deutschland\n", + " JAHR\n", + " Jahr\n", + " 2022\n", + " DINSG\n", + " Deutschland insgesamt\n", + " DG\n", + " Deutschland\n", + " MONAT\n", + " Monate\n", + " MONAT09\n", + " September\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 21\n", + " 61111\n", + " Verbraucherpreisindex für Deutschland\n", + " JAHR\n", + " Jahr\n", + " 2022\n", + " DINSG\n", + " Deutschland insgesamt\n", + " DG\n", + " Deutschland\n", + " MONAT\n", + " Monate\n", + " MONAT10\n", + " Oktober\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 22\n", + " 61111\n", + " Verbraucherpreisindex für Deutschland\n", + " JAHR\n", + " Jahr\n", + " 2022\n", + " DINSG\n", + " Deutschland insgesamt\n", + " DG\n", + " Deutschland\n", + " MONAT\n", + " Monate\n", + " MONAT11\n", + " November\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 23\n", + " 61111\n", + " Verbraucherpreisindex für Deutschland\n", + " JAHR\n", + " Jahr\n", + " 2022\n", + " DINSG\n", + " Deutschland insgesamt\n", + " DG\n", + " Deutschland\n", + " MONAT\n", + " Monate\n", + " MONAT12\n", + " Dezember\n", + " ...\n", + " ...\n", + " ...\n", + " \n", " \n", "\n", "" @@ -516,6 +592,10 @@ "17 61111 Verbraucherpreisindex für Deutschland JAHR \n", "18 61111 Verbraucherpreisindex für Deutschland JAHR \n", "19 61111 Verbraucherpreisindex für Deutschland JAHR \n", + "20 61111 Verbraucherpreisindex für Deutschland JAHR \n", + "21 61111 Verbraucherpreisindex für Deutschland JAHR \n", + "22 61111 Verbraucherpreisindex für Deutschland JAHR \n", + "23 61111 
Verbraucherpreisindex für Deutschland JAHR \n", "\n", " Zeit_Label Zeit 1_Merkmal_Code 1_Merkmal_Label 1_Auspraegung_Code \\\n", "0 Jahr 2021 DINSG Deutschland insgesamt DG \n", @@ -538,6 +618,10 @@ "17 Jahr 2022 DINSG Deutschland insgesamt DG \n", "18 Jahr 2022 DINSG Deutschland insgesamt DG \n", "19 Jahr 2022 DINSG Deutschland insgesamt DG \n", + "20 Jahr 2022 DINSG Deutschland insgesamt DG \n", + "21 Jahr 2022 DINSG Deutschland insgesamt DG \n", + "22 Jahr 2022 DINSG Deutschland insgesamt DG \n", + "23 Jahr 2022 DINSG Deutschland insgesamt DG \n", "\n", " 1_Auspraegung_Label 2_Merkmal_Code 2_Merkmal_Label 2_Auspraegung_Code \\\n", "0 Deutschland MONAT Monate MONAT01 \n", @@ -560,6 +644,10 @@ "17 Deutschland MONAT Monate MONAT06 \n", "18 Deutschland MONAT Monate MONAT07 \n", "19 Deutschland MONAT Monate MONAT08 \n", + "20 Deutschland MONAT Monate MONAT09 \n", + "21 Deutschland MONAT Monate MONAT10 \n", + "22 Deutschland MONAT Monate MONAT11 \n", + "23 Deutschland MONAT Monate MONAT12 \n", "\n", " 2_Auspraegung_Label PREIS1__Verbraucherpreisindex__2015=100 \\\n", "0 Januar 106,3 \n", @@ -582,6 +670,10 @@ "17 Juni 117,4 \n", "18 Juli ... \n", "19 August ... \n", + "20 September ... \n", + "21 Oktober ... \n", + "22 November ... \n", + "23 Dezember ... \n", "\n", " CH0004__Veraenderung_zum_Vorjahresmonat__in_(%) \\\n", "0 +1,0 \n", @@ -604,6 +696,10 @@ "17 +7,6 \n", "18 ... \n", "19 ... \n", + "20 ... \n", + "21 ... \n", + "22 ... \n", + "23 ... \n", "\n", " CH0005__Veraenderung_zum_Vormonat__in_(%) \n", "0 +0,8 \n", @@ -625,10 +721,14 @@ "16 +0,9 \n", "17 +0,1 \n", "18 ... \n", - "19 ... " + "19 ... \n", + "20 ... \n", + "21 ... \n", + "22 ... \n", + "23 ... 
" ] }, - "execution_count": 5, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -636,6 +736,13 @@ "source": [ "data" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/pygenesis/cube.py b/src/pygenesis/cube.py index 0015b8b..78ce6b2 100644 --- a/src/pygenesis/cube.py +++ b/src/pygenesis/cube.py @@ -3,39 +3,17 @@ import pandas as pd -from pygenesis.cache import cache_data -from pygenesis.http_helper import get_response_from_endpoint - -@cache_data -def get_cubefile_data( - *, name: str, area: str = "all", **kwargs -) -> pd.DataFrame: - """Return cube file data as pandas data frame. - - Based on the cube name, cube area and additional query parameters the - cubefile method from the data-endpoint will be queried. +def get_data(data: str) -> pd.DataFrame: + """Return cubefile data as pandas data frame. Args: - name (str): Name of the cube. - area (str, optional): Area of the cube. Defaults to "all". + data (str): Raw cubefile content. Returns: pd.DataFrame: Parsed cube file. 
""" - kwargs = kwargs or {} - - params = { - "name": name, - "area": area, - "format": "csv", - } - - params |= kwargs - - response = get_response_from_endpoint("data", "cubefile", params) - cube_data = response.text - cube = rename_axes(parse_cube(cube_data)) + cube = rename_axes(parse_cube(data)) return cube["QEI"] diff --git a/src/pygenesis/data.py b/src/pygenesis/data.py new file mode 100644 index 0000000..ca6e7b9 --- /dev/null +++ b/src/pygenesis/data.py @@ -0,0 +1,58 @@ +"""Provides functionality to download data from GENESIS data endpoint.""" +from typing import Literal + +import pandas as pd + +from pygenesis.cache import cache_data +from pygenesis.cube import parse_cube, rename_axes +from pygenesis.http_helper import get_response_from_endpoint +from pygenesis.table import get_tablefile_data + +METHODS = Literal["tablefile", "cubefile"] + + +@cache_data +def get_data( + *, name: str, method: METHODS, area: str = "all", **kwargs +) -> pd.DataFrame: + """Download data from GENESIS. + + Based on the name, area and additional query parameters the + given method from the data-endpoint will be queried. + + Args: + name (str): Name of the object. + method (str): Method of the data endpoint used to query data. One of ["tablefile", "cubefile"]. + area (str, optional): Area the object is stored. Defaults to "all". + + Returns: + pd.DataFrame: Parsed data file. 
+ """ + kwargs = kwargs or {} + + params = { + "name": name, + "area": area, + } + + if method == "tablefile": + params["format"] = "ffcsv" + + params |= kwargs + + response = get_response_from_endpoint("data", method, params) + data = response.text + + if method == "tablefile": + return _get_tablefile_data(data) + else: + return _get_cubefile_data(data) + + +def _get_cubefile_data(data: str) -> pd.DataFrame: + cube = rename_axes(parse_cube(data)) + return cube["QEI"] + + +def _get_tablefile_data(data: str) -> pd.DataFrame: + return get_tablefile_data(data) diff --git a/src/pygenesis/table.py b/src/pygenesis/table.py index c2e4a20..3580470 100644 --- a/src/pygenesis/table.py +++ b/src/pygenesis/table.py @@ -1,38 +1,16 @@ """Module contains business logic related to destatis tables.""" import pandas as pd -from pygenesis.cache import cache_data from pygenesis.csv_helper import get_df_from_text -from pygenesis.http_helper import get_response_from_endpoint -@cache_data -def get_tablefile_data( - *, name: str, area: str = "all", **kwargs -) -> pd.DataFrame: +def get_tablefile_data(data: str) -> pd.DataFrame: """Return table file data as pandas data frame. - Based on the table name, table area and additional query parameters the - tablefile method from the data-endpoint will be queried. - Args: - name (str): Name of the table. - area (str, optional): Area of the table. Defaults to "all". + data (str): Raw tablefile content. Returns: pd.DataFrame: Parsed table file. 
""" - - kwargs = kwargs or {} - - params = { - "name": name, - "area": area, - "format": "ffcsv", - } - - params |= kwargs - - response = get_response_from_endpoint("data", "tablefile", params) - - return get_df_from_text(response.text) + return get_df_from_text(data) From 080d56a3a029eda2c18f043748ca658a6d35f803 Mon Sep 17 00:00:00 2001 From: Michael Aydinbas Date: Tue, 9 Aug 2022 09:50:16 +0200 Subject: [PATCH 3/3] remove helper methods from get_data, move logic to the individual modules --- src/pygenesis/cube.py | 2 +- src/pygenesis/data.py | 15 +++------------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/pygenesis/cube.py b/src/pygenesis/cube.py index 78ce6b2..b71f71c 100644 --- a/src/pygenesis/cube.py +++ b/src/pygenesis/cube.py @@ -4,7 +4,7 @@ import pandas as pd -def get_data(data: str) -> pd.DataFrame: +def get_cubefile_data(data: str) -> pd.DataFrame: """Return cubefile data as pandas data frame. Args: diff --git a/src/pygenesis/data.py b/src/pygenesis/data.py index ca6e7b9..a5211df 100644 --- a/src/pygenesis/data.py +++ b/src/pygenesis/data.py @@ -4,7 +4,7 @@ import pandas as pd from pygenesis.cache import cache_data -from pygenesis.cube import parse_cube, rename_axes +from pygenesis.cube import get_cubefile_data from pygenesis.http_helper import get_response_from_endpoint from pygenesis.table import get_tablefile_data @@ -44,15 +44,6 @@ def get_data( data = response.text if method == "tablefile": - return _get_tablefile_data(data) + return get_tablefile_data(data) else: - return _get_cubefile_data(data) - - -def _get_cubefile_data(data: str) -> pd.DataFrame: - cube = rename_axes(parse_cube(data)) - return cube["QEI"] - - -def _get_tablefile_data(data: str) -> pd.DataFrame: - return get_tablefile_data(data) + return get_cubefile_data(data)