diff --git a/.pylintrc b/.pylintrc index 8d6a9b1..4deabf8 100644 --- a/.pylintrc +++ b/.pylintrc @@ -12,7 +12,7 @@ ignore=third_party # Files or directories matching the regex patterns are skipped. The regex # matches against base names, not paths. -ignore-patterns= +ignore-patterns=test_.* # Pickle collected data for later comparisons. persistent=no diff --git a/nb/download_cubefile.ipynb b/nb/download_cubefile.ipynb new file mode 100644 index 0000000..f72c13d --- /dev/null +++ b/nb/download_cubefile.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "69e1d305", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ff9eca4f", + "metadata": {}, + "outputs": [], + "source": [ + "# only if you get an error from below\n", + "# from pygenesis import init_config\n", + "# init_config()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4a207a77", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from pygenesis.cube import get_cubefile_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6e6df177", + "metadata": {}, + "outputs": [], + "source": [ + "params = {\"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n", + "data = get_cubefile_data(name=\"47414BJ002\", **params)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a8bcd5b4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DINSGWZ08N7WERTE4JAHRUMS103QUALITAETGESPERRTWERT-VERFAELSCHT
0DGWZ08-49-01NOMINAL2015100.0eNaN0.0
1DGWZ08-49-01NOMINAL201699.3eNaN0.0
2DGWZ08-49-01NOMINAL2017105.7eNaN0.0
3DGWZ08-49-01NOMINAL2018111.6eNaN0.0
4DGWZ08-49-01NOMINAL2019115.6eNaN0.0
\n", + "
" + ], + "text/plain": [ + " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e NaN \n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e NaN \n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e NaN \n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e NaN \n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e NaN \n", + "\n", + " WERT-VERFAELSCHT \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fed610c9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nb/download_tablefile.ipynb b/nb/download_tablefile.ipynb index 8cef7a0..5c4fe4d 100644 --- a/nb/download_tablefile.ipynb +++ b/nb/download_tablefile.ipynb @@ -14,15 +14,7 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error while loading the config file. Could not find C:\\Users\\micha\\AppData\\Local\\Temp\\pytest-of-micha\\pytest-78\\.pygenesis3\\config.ini. Please make sure to run init_config() first. \n" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "from pygenesis.table import get_tablefile_data" @@ -44,13 +36,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "data = get_tablefile_data(\"61111-0002\", table_area=all)" + "data = get_tablefile_data(name=\"61111-0002\", table_area=all)" ] }, { diff --git a/nb/parse_cube.ipynb b/nb/parse_cube.ipynb index e2fa37b..6c84c2e 100644 --- a/nb/parse_cube.ipynb +++ b/nb/parse_cube.ipynb @@ -16,7 +16,7 @@ "metadata": {}, "outputs": [], "source": [ - "# only if you get an error from above\n", + "# only if you get an error from below\n", "# from pygenesis import init_config\n", "# init_config()" ] @@ -29,8 +29,8 @@ "source": [ "import pandas as pd\n", "\n", - "from pygenesis.destatis import get_cubefile\n", - "from pygenesis.cube import parse_cube, rename_axes" + "from pygenesis.cube import parse_cube, rename_axes\n", + "from pygenesis.http_helper import get_response_from_endpoint" ] }, { @@ -53,23 +53,16 @@ "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "D:\\git\\correlaid\\genesis-python\\.venv\\lib\\site-packages\\urllib3\\connectionpool.py:1043: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www-genesis.destatis.de'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", - " warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ - "data = get_cubefile({\"name\": \"47414BJ002\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"})" + "params = {\"name\": \"47414BJ002\", \"area\": \"all\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n", + "response = get_response_from_endpoint(\"data\", \"cubefile\", params)\n", + "data = response.text" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -78,7 +71,7 @@ "(str, 79264)" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -89,13 +82,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 30.07.2022 um 20:43:34 diesen Export angestossen.',\n", + "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 01.08.2022 um 08:16:00 diesen Export angestossen.',\n", " 'K;DQ;FACH-SCHL;GHH-ART;GHM-WERTE-JN;GENESIS-VBD;REGIOSTAT;EU-VBD;\"mit Werten\"',\n", " 'D;47414BJ002;;N;N;N;N',\n", " 'K;DQ-ERH;FACH-SCHL',\n", @@ -117,7 +110,7 @@ " 'D;DG;WZ08-49-01;NOMINAL;2020;96.0;e;;0.0']" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -137,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -146,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -164,36 +157,23 @@ " 0 JAHR 4 4,\n", " 'DQI': NAME ME-NAME DST TYP NKM-STELLEN GHH-ART GHM-WERTE-JN\n", " 0 UMS103 2015=100 FEST PROZENT 1 N,\n", - " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n", - " 0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n", - " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n", - " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - " ... ... ... ... ... ... ... ... \n", - " 2018 DG WZ08-N REAL 2017 108.4 e \n", - " 2019 DG WZ08-N REAL 2018 110.6 e \n", - " 2020 DG WZ08-N REAL 2019 110.8 e \n", - " 2021 DG WZ08-N REAL 2020 94.1 e \n", - " 2022 DG WZ08-N REAL 2021 101.2 p \n", - " \n", - " WERT-VERFAELSCHT \n", - " 0 0.0 \n", - " 1 0.0 \n", - " 2 0.0 \n", - " 3 0.0 \n", - " 4 0.0 \n", - " ... ... \n", - " 2018 0.0 \n", - " 2019 0.0 \n", - " 2020 0.0 \n", - " 2021 0.0 \n", - " 2022 0.0 \n", + " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n", + " 0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + " ... ... ... ... ... ... ... ... ...\n", + " 2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + " 2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + " 2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + " 2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + " 2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", " \n", " [2023 rows x 8 columns]}" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -204,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -366,36 +346,23 @@ "" ], "text/plain": [ - " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n", - "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n", - "1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n", - "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - "... ... ... ... ... ... ... ... \n", - "2018 DG WZ08-N REAL 2017 108.4 e \n", - "2019 DG WZ08-N REAL 2018 110.6 e \n", - "2020 DG WZ08-N REAL 2019 110.8 e \n", - "2021 DG WZ08-N REAL 2020 94.1 e \n", - "2022 DG WZ08-N REAL 2021 101.2 p \n", - "\n", - " WERT-VERFAELSCHT \n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 0.0 \n", - "... ... \n", - "2018 0.0 \n", - "2019 0.0 \n", - "2020 0.0 \n", - "2021 0.0 \n", - "2022 0.0 \n", + " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", "\n", "[2023 rows x 8 columns]" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -444,13 +411,13 @@ " \n", " \n", " 1\n", - " ALT041\n", + " WZ08N7\n", " 2\n", " 2\n", " \n", " \n", " 2\n", - " FAMST2\n", + " WERTE4\n", " 3\n", " 3\n", " \n", @@ -461,11 +428,11 @@ "text/plain": [ " NAME RHF-BSR RHF-ACHSE\n", "0 DINSG 1 1\n", - "1 ALT041 2 2\n", - "2 FAMST2 3 3" + "1 WZ08N7 2 2\n", + "2 WERTE4 3 3" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -485,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -647,36 +614,23 @@ "" ], "text/plain": [ - " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n", - "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n", - "1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n", - "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n", - "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n", - "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n", - "... ... ... ... ... ... ... ... \n", - "2018 DG WZ08-N REAL 2017 108.4 e \n", - "2019 DG WZ08-N REAL 2018 110.6 e \n", - "2020 DG WZ08-N REAL 2019 110.8 e \n", - "2021 DG WZ08-N REAL 2020 94.1 e \n", - "2022 DG WZ08-N REAL 2021 101.2 p \n", - "\n", - " WERT-VERFAELSCHT \n", - "0 0.0 \n", - "1 0.0 \n", - "2 0.0 \n", - "3 0.0 \n", - "4 0.0 \n", - "... ... \n", - "2018 0.0 \n", - "2019 0.0 \n", - "2020 0.0 \n", - "2021 0.0 \n", - "2022 0.0 \n", + " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT WERT-VERFAELSCHT\n", + "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n", + "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n", + "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n", + "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n", + "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n", + "... ... ... ... ... ... ... ... ...\n", + "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n", + "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n", + "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n", + "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n", + "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n", "\n", "[2023 rows x 8 columns]" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } diff --git a/src/pygenesis/cache.py b/src/pygenesis/cache.py new file mode 100644 index 0000000..ca5372e --- /dev/null +++ b/src/pygenesis/cache.py @@ -0,0 +1,52 @@ +"""Module provides functions/decorators to cache downloaded data.""" +import logging +from datetime import date +from functools import wraps +from pathlib import Path +from typing import Callable + +import pandas as pd + +from pygenesis.config import load_config + +logger = logging.getLogger(__name__) + + +def cache_data(func: Callable) -> Callable: + """Store downloaded data on disk with download time as parent folder. + + Args: + func (Callable): One of the data methods of the data endpoint. + """ + + @wraps(func) + def wrapper_func(**kwargs): + config = load_config() + cache_dir = Path(config["DATA"]["cache_dir"]) + + if not cache_dir.is_dir() or not cache_dir.exists(): + logger.critical( + "Cache dir does not exist! Please make sure init_config() was run properly. Path: %s", + cache_dir, + ) + + name = kwargs["name"] + data_dir = cache_dir / name + if data_dir.exists(): + # TODO: Implement solution for updated data. + # So don't return latest version but check first for newer version in GENESIS. + # if data_dir exists, there has to be at least one stored version of this data + versions = sorted((p.name for p in data_dir.glob("*")), key=int) + latest = versions[-1] + data = pd.read_csv(data_dir / latest / f"{name}.xz") + else: + data: pd.DateFrame = func(**kwargs) + file_path = ( + data_dir / str(date.today()).replace("-", "") / f"{name}.xz" + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + data.to_csv(file_path, index=False) + + return data + + return wrapper_func diff --git a/src/pygenesis/config.py b/src/pygenesis/config.py index 9018a68..8f48d2b 100644 --- a/src/pygenesis/config.py +++ b/src/pygenesis/config.py @@ -77,6 +77,10 @@ def init_config(config_dir: Path = DEFAULT_CONFIG_DIR) -> None: config = _create_default_config() _write_config(config, config_file) + cache_dir = Path(config["DATA"]["cache_dir"]) + if not cache_dir.exists(): + cache_dir.mkdir() + logger.info("New config was created. Path: %s.", config_file) diff --git a/src/pygenesis/cube.py b/src/pygenesis/cube.py index 9df44ee..0015b8b 100644 --- a/src/pygenesis/cube.py +++ b/src/pygenesis/cube.py @@ -3,78 +3,41 @@ import pandas as pd +from pygenesis.cache import cache_data +from pygenesis.http_helper import get_response_from_endpoint -def is_cube_metadata_header(line: str) -> bool: - """Check if a line is a cube metadata header. - Args: - line (str): A single line of a cubefile. - - Returns: - bool: True if the line starts with a "K", False otherwise. - """ - return line[0] == "K" +@cache_data +def get_cubefile_data( + *, name: str, area: str = "all", **kwargs +) -> pd.DataFrame: + """Return cube file data as pandas data frame. - -def get_cube_metadata_header_type(line: str) -> str: - """Return the header type. + Based on the cube name, cube area and additional query parameters the + cubefile method from the data-endpoint will be queried. Args: - line (str): A single line of a cubefile. + name (str): Name of the cube. + area (str, optional): Area of the cube. Defaults to "all". Returns: - str: The header type, which is the second entry in the header. + pd.DataFrame: Parsed cube file. """ - return line.split(";")[1] - - -def get_cube_metadata_header( - line: str, rename_duplicates: bool = False -) -> list[str]: - """Return the metadata header of a cubefile. - - Args: - line (str): A single line of a cubefile. - rename_duplicates (bool, optional): If False, the raw header is returned. - If True, identical column names are appended with a unique counter. - Defaults to False. - - Returns: - list[str]: A list of column names, except for "nur Werte" and "mit Werten". - """ - raw_header = line.split(";")[2:] - raw_header = [ - name - for name in raw_header - if name not in ['"nur Werte"', '"mit Werten"'] - ] + kwargs = kwargs or {} - if not rename_duplicates: - return raw_header + params = { + "name": name, + "area": area, + "format": "csv", + } - # header can have multiple entries with same label, which is problematic for pandas - # so lets just add a counter - header = [""] * len(raw_header) - for name in set(raw_header): - if raw_header.count(name) == 1: - header[raw_header.index(name)] = name - else: - for counter in range(raw_header.count(name)): - header[raw_header.index(name) + counter] = f"{name}-{counter+1}" + params |= kwargs - return header + response = get_response_from_endpoint("data", "cubefile", params) + cube_data = response.text + cube = rename_axes(parse_cube(cube_data)) - -def parse_cube_data_line(line: str) -> list[str]: - """Return the content of a cube data line. - - Args: - line (str): A single line of a cubefile. - - Returns: - list[str]: The content of a cube data line, omitting the first element. - """ - return line.split(";")[1:] + return cube["QEI"] def parse_cube(data: str) -> dict: @@ -92,19 +55,19 @@ def parse_cube(data: str) -> dict: for line in data.splitlines(): # skip all rows until first header - if header is None and not is_cube_metadata_header(line): + if header is None and not _is_cube_metadata_header(line): continue - if is_cube_metadata_header(line): + if _is_cube_metadata_header(line): if data_block: cube[header_type] = pd.DataFrame(data_block, columns=header) - header = get_cube_metadata_header(line, rename_duplicates=True) - header_type: str = get_cube_metadata_header_type(line) + header = _get_cube_metadata_header(line, rename_duplicates=True) + header_type: str = _get_cube_metadata_header_type(line) data_block = [] continue - line_content = parse_cube_data_line(line) + line_content = _parse_cube_data_line(line) data_block.append(line_content) # the last data block has no header after it so we have to do it here @@ -157,3 +120,45 @@ def rename_axes( cube["QEI"].rename(columns=dict(zip(old_cols, new_cols)), inplace=True) return cube + + +def _is_cube_metadata_header(line: str) -> bool: + """Check if a line is a cube metadata header.""" + return line[0] == "K" + + +def _get_cube_metadata_header_type(line: str) -> str: + """Return the header type.""" + return line.split(";")[1] + + +def _get_cube_metadata_header( + line: str, rename_duplicates: bool = False +) -> list[str]: + """Return the metadata header of a cubefile.""" + raw_header = line.split(";")[2:] + raw_header = [ + name + for name in raw_header + if name not in ['"nur Werte"', '"mit Werten"'] + ] + + if not rename_duplicates: + return raw_header + + # header can have multiple entries with same label, which is problematic for pandas + # so lets just add a counter + header = [""] * len(raw_header) + for name in set(raw_header): + if raw_header.count(name) == 1: + header[raw_header.index(name)] = name + else: + for counter in range(raw_header.count(name)): + header[raw_header.index(name) + counter] = f"{name}-{counter+1}" + + return header + + +def _parse_cube_data_line(line: str) -> list[str]: + """Return the content of a cube data line.""" + return line.split(";")[1:] diff --git a/src/pygenesis/table.py b/src/pygenesis/table.py index af9b3c4..c2e4a20 100644 --- a/src/pygenesis/table.py +++ b/src/pygenesis/table.py @@ -1,35 +1,38 @@ """Module contains business logic related to destatis tables.""" import pandas as pd +from pygenesis.cache import cache_data from pygenesis.csv_helper import get_df_from_text from pygenesis.http_helper import get_response_from_endpoint +@cache_data def get_tablefile_data( - table_name: str, table_area: str = "all", **kwargs + *, name: str, area: str = "all", **kwargs ) -> pd.DataFrame: - """ + """Return table file data as pandas data frame. + Based on the table name, table area and additional query parameters the tablefile method from the data-endpoint will be queried. Args: - table_name (str): Name of the table - table_area (str, optional): Area of the table (Defaul: all) - query_params (dict, optional): Additional query parameters - (Default: None) + name (str): Name of the table. + area (str, optional): Area of the table. Defaults to "all". + Returns: - pd.DataFrame + pd.DataFrame: Parsed table file. """ kwargs = kwargs or {} params = { - "name": table_name, - "area": table_area, + "name": name, + "area": area, "format": "ffcsv", } params |= kwargs response = get_response_from_endpoint("data", "tablefile", params) + return get_df_from_text(response.text) diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..919436e --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,92 @@ +import time +from datetime import date +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from pygenesis.cache import cache_data +from pygenesis.config import ( + DEFAULT_SETTINGS_FILE, + _write_config, + init_config, + load_settings, +) + +SLEEP_TIME = 0.1 + + +@pytest.fixture() +def cache_dir(tmp_path_factory): + return tmp_path_factory.mktemp(".pygenesis") + + +@pytest.fixture(autouse=True) +def restore_settings(): + old_settings = load_settings() + yield + _write_config(old_settings, DEFAULT_SETTINGS_FILE) + + +@cache_data +def decorated_data(*, name): + time.sleep(SLEEP_TIME) + return pd.DataFrame( + np.random.random(size=(10, 5)), columns=["a", "b", "c", "d", "e"] + ) + + +def test_cache_data_wrapper(cache_dir): + init_config(cache_dir) + + assert len(list((cache_dir / "data").glob("*"))) == 0 + + data = decorated_data(name="test_cache_decorator") + + assert isinstance(data, pd.DataFrame) + assert not data.empty + + cached_data_file: Path = ( + cache_dir + / "data" + / "test_cache_decorator" + / str(date.today()).replace("-", "") + / "test_cache_decorator.xz" + ) + + assert cached_data_file.exists() and cached_data_file.is_file() + + objs_in_data = [p for p in cache_dir.joinpath("data").glob("*") if p] + + assert len(objs_in_data) == 1 + assert objs_in_data[0] == cache_dir / "data" / "test_cache_decorator" + + objs_in_name_dir = [ + p + for p in cache_dir.joinpath("data/test_cache_decorator").glob("*") + if p + ] + + assert len(objs_in_name_dir) == 1 + assert objs_in_name_dir[0] == cached_data_file.parent + + restored_data = pd.read_csv(cached_data_file) + + pd.testing.assert_frame_equal(data, restored_data, check_index_type=False) + + +def test_cache_data_twice(cache_dir): + init_config(cache_dir) + + load_time = time.perf_counter() + data = decorated_data(name="test_cache_decorator") + load_time = time.perf_counter() - load_time + + assert load_time >= SLEEP_TIME + + load_time = time.perf_counter() + data = decorated_data(name="test_cache_decorator") + load_time = time.perf_counter() - load_time + + assert load_time < SLEEP_TIME diff --git a/tests/test_config.py b/tests/test_config.py index c0523ae..32c0762 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,8 +16,7 @@ @pytest.fixture() def config_dir(tmp_path_factory): - config_dir = tmp_path_factory.mktemp(".pygenesis") - return config_dir + return tmp_path_factory.mktemp(".pygenesis") @pytest.fixture(autouse=True) @@ -55,12 +54,17 @@ def test_init_config_with_config_dir(config_dir, caplog): assert caplog.records[1].levelname == "INFO" assert "Settings file updated" in caplog.text assert "New config was created" in caplog.text + assert (config_dir / "data").exists() config = load_config() assert isinstance(config, ConfigParser) assert len(config.sections()) > 0 + assert config["DATA"]["cache_dir"] == str(config_dir / "data") + assert len(list((config_dir / "data").glob("*"))) == 0 + config_file = get_config_path_from_settings() + assert config_file.exists() and config_file.is_file()