diff --git a/.pylintrc b/.pylintrc
index 8d6a9b1..4deabf8 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -12,7 +12,7 @@ ignore=third_party
# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
-ignore-patterns=
+ignore-patterns=test_.*
# Pickle collected data for later comparisons.
persistent=no
diff --git a/nb/download_cubefile.ipynb b/nb/download_cubefile.ipynb
new file mode 100644
index 0000000..f72c13d
--- /dev/null
+++ b/nb/download_cubefile.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "69e1d305",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ff9eca4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# only if you get an error from below\n",
+ "# from pygenesis import init_config\n",
+ "# init_config()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "4a207a77",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "from pygenesis.cube import get_cubefile_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "6e6df177",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = {\"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n",
+ "data = get_cubefile_data(name=\"47414BJ002\", **params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "a8bcd5b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " DINSG | \n",
+ " WZ08N7 | \n",
+ " WERTE4 | \n",
+ " JAHR | \n",
+ " UMS103 | \n",
+ " QUALITAET | \n",
+ " GESPERRT | \n",
+ " WERT-VERFAELSCHT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " DG | \n",
+ " WZ08-49-01 | \n",
+ " NOMINAL | \n",
+ " 2015 | \n",
+ " 100.0 | \n",
+ " e | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " DG | \n",
+ " WZ08-49-01 | \n",
+ " NOMINAL | \n",
+ " 2016 | \n",
+ " 99.3 | \n",
+ " e | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DG | \n",
+ " WZ08-49-01 | \n",
+ " NOMINAL | \n",
+ " 2017 | \n",
+ " 105.7 | \n",
+ " e | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " DG | \n",
+ " WZ08-49-01 | \n",
+ " NOMINAL | \n",
+ " 2018 | \n",
+ " 111.6 | \n",
+ " e | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " DG | \n",
+ " WZ08-49-01 | \n",
+ " NOMINAL | \n",
+ " 2019 | \n",
+ " 115.6 | \n",
+ " e | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n",
+ "0 DG WZ08-49-01 NOMINAL 2015 100.0 e NaN \n",
+ "1 DG WZ08-49-01 NOMINAL 2016 99.3 e NaN \n",
+ "2 DG WZ08-49-01 NOMINAL 2017 105.7 e NaN \n",
+ "3 DG WZ08-49-01 NOMINAL 2018 111.6 e NaN \n",
+ "4 DG WZ08-49-01 NOMINAL 2019 115.6 e NaN \n",
+ "\n",
+ " WERT-VERFAELSCHT \n",
+ "0 0.0 \n",
+ "1 0.0 \n",
+ "2 0.0 \n",
+ "3 0.0 \n",
+ "4 0.0 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fed610c9",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/nb/download_tablefile.ipynb b/nb/download_tablefile.ipynb
index 8cef7a0..5c4fe4d 100644
--- a/nb/download_tablefile.ipynb
+++ b/nb/download_tablefile.ipynb
@@ -14,15 +14,7 @@
"cell_type": "code",
"execution_count": 2,
"metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Error while loading the config file. Could not find C:\\Users\\micha\\AppData\\Local\\Temp\\pytest-of-micha\\pytest-78\\.pygenesis3\\config.ini. Please make sure to run init_config() first. \n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"import pandas as pd\n",
"from pygenesis.table import get_tablefile_data"
@@ -44,13 +36,13 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
- "data = get_tablefile_data(\"61111-0002\", table_area=all)"
+ "data = get_tablefile_data(name=\"61111-0002\", table_area=all)"
]
},
{
diff --git a/nb/parse_cube.ipynb b/nb/parse_cube.ipynb
index e2fa37b..6c84c2e 100644
--- a/nb/parse_cube.ipynb
+++ b/nb/parse_cube.ipynb
@@ -16,7 +16,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# only if you get an error from above\n",
+ "# only if you get an error from below\n",
"# from pygenesis import init_config\n",
"# init_config()"
]
@@ -29,8 +29,8 @@
"source": [
"import pandas as pd\n",
"\n",
- "from pygenesis.destatis import get_cubefile\n",
- "from pygenesis.cube import parse_cube, rename_axes"
+ "from pygenesis.cube import parse_cube, rename_axes\n",
+ "from pygenesis.http_helper import get_response_from_endpoint"
]
},
{
@@ -53,23 +53,16 @@
"metadata": {
"scrolled": true
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "D:\\git\\correlaid\\genesis-python\\.venv\\lib\\site-packages\\urllib3\\connectionpool.py:1043: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www-genesis.destatis.de'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
- " warnings.warn(\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "data = get_cubefile({\"name\": \"47414BJ002\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"})"
+ "params = {\"name\": \"47414BJ002\", \"area\": \"all\", \"values\": \"true\", \"metadata\": \"true\", \"additionals\": \"false\"}\n",
+ "response = get_response_from_endpoint(\"data\", \"cubefile\", params)\n",
+ "data = response.text"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -78,7 +71,7 @@
"(str, 79264)"
]
},
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -89,13 +82,13 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 30.07.2022 um 20:43:34 diesen Export angestossen.',\n",
+ "['* Der Benutzer DEI6I4B3UW der Benutzergruppe DE0142 hat am 01.08.2022 um 08:16:00 diesen Export angestossen.',\n",
" 'K;DQ;FACH-SCHL;GHH-ART;GHM-WERTE-JN;GENESIS-VBD;REGIOSTAT;EU-VBD;\"mit Werten\"',\n",
" 'D;47414BJ002;;N;N;N;N',\n",
" 'K;DQ-ERH;FACH-SCHL',\n",
@@ -117,7 +110,7 @@
" 'D;DG;WZ08-49-01;NOMINAL;2020;96.0;e;;0.0']"
]
},
- "execution_count": 5,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -137,7 +130,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -146,7 +139,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
@@ -164,36 +157,23 @@
" 0 JAHR 4 4,\n",
" 'DQI': NAME ME-NAME DST TYP NKM-STELLEN GHH-ART GHM-WERTE-JN\n",
" 0 UMS103 2015=100 FEST PROZENT 1 N,\n",
- " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n",
- " 0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n",
- " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n",
- " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n",
- " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n",
- " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n",
- " ... ... ... ... ... ... ... ... \n",
- " 2018 DG WZ08-N REAL 2017 108.4 e \n",
- " 2019 DG WZ08-N REAL 2018 110.6 e \n",
- " 2020 DG WZ08-N REAL 2019 110.8 e \n",
- " 2021 DG WZ08-N REAL 2020 94.1 e \n",
- " 2022 DG WZ08-N REAL 2021 101.2 p \n",
- " \n",
- " WERT-VERFAELSCHT \n",
- " 0 0.0 \n",
- " 1 0.0 \n",
- " 2 0.0 \n",
- " 3 0.0 \n",
- " 4 0.0 \n",
- " ... ... \n",
- " 2018 0.0 \n",
- " 2019 0.0 \n",
- " 2020 0.0 \n",
- " 2021 0.0 \n",
- " 2022 0.0 \n",
+ " 'QEI': FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n",
+ " 0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n",
+ " 1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n",
+ " 2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n",
+ " 3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n",
+ " 4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n",
+ " ... ... ... ... ... ... ... ... ...\n",
+ " 2018 DG WZ08-N REAL 2017 108.4 e 0.0\n",
+ " 2019 DG WZ08-N REAL 2018 110.6 e 0.0\n",
+ " 2020 DG WZ08-N REAL 2019 110.8 e 0.0\n",
+ " 2021 DG WZ08-N REAL 2020 94.1 e 0.0\n",
+ " 2022 DG WZ08-N REAL 2021 101.2 p 0.0\n",
" \n",
" [2023 rows x 8 columns]}"
]
},
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -204,7 +184,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -366,36 +346,23 @@
""
],
"text/plain": [
- " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT \\\n",
- "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n",
- "1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n",
- "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n",
- "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n",
- "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n",
- "... ... ... ... ... ... ... ... \n",
- "2018 DG WZ08-N REAL 2017 108.4 e \n",
- "2019 DG WZ08-N REAL 2018 110.6 e \n",
- "2020 DG WZ08-N REAL 2019 110.8 e \n",
- "2021 DG WZ08-N REAL 2020 94.1 e \n",
- "2022 DG WZ08-N REAL 2021 101.2 p \n",
- "\n",
- " WERT-VERFAELSCHT \n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "... ... \n",
- "2018 0.0 \n",
- "2019 0.0 \n",
- "2020 0.0 \n",
- "2021 0.0 \n",
- "2022 0.0 \n",
+ " FACH-SCHL-1 FACH-SCHL-2 FACH-SCHL-3 ZI-WERT WERT QUALITAET GESPERRT WERT-VERFAELSCHT\n",
+ "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n",
+ "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n",
+ "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n",
+ "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n",
+ "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n",
+ "... ... ... ... ... ... ... ... ...\n",
+ "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n",
+ "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n",
+ "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n",
+ "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n",
+ "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n",
"\n",
"[2023 rows x 8 columns]"
]
},
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -406,7 +373,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -444,13 +411,13 @@
" \n",
" \n",
" 1 | \n",
- " ALT041 | \n",
+ " WZ08N7 | \n",
" 2 | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
- " FAMST2 | \n",
+ " WERTE4 | \n",
" 3 | \n",
" 3 | \n",
"
\n",
@@ -461,11 +428,11 @@
"text/plain": [
" NAME RHF-BSR RHF-ACHSE\n",
"0 DINSG 1 1\n",
- "1 ALT041 2 2\n",
- "2 FAMST2 3 3"
+ "1 WZ08N7 2 2\n",
+ "2 WERTE4 3 3"
]
},
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -485,7 +452,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -647,36 +614,23 @@
""
],
"text/plain": [
- " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT \\\n",
- "0 DG WZ08-49-01 NOMINAL 2015 100.0 e \n",
- "1 DG WZ08-49-01 NOMINAL 2016 99.3 e \n",
- "2 DG WZ08-49-01 NOMINAL 2017 105.7 e \n",
- "3 DG WZ08-49-01 NOMINAL 2018 111.6 e \n",
- "4 DG WZ08-49-01 NOMINAL 2019 115.6 e \n",
- "... ... ... ... ... ... ... ... \n",
- "2018 DG WZ08-N REAL 2017 108.4 e \n",
- "2019 DG WZ08-N REAL 2018 110.6 e \n",
- "2020 DG WZ08-N REAL 2019 110.8 e \n",
- "2021 DG WZ08-N REAL 2020 94.1 e \n",
- "2022 DG WZ08-N REAL 2021 101.2 p \n",
- "\n",
- " WERT-VERFAELSCHT \n",
- "0 0.0 \n",
- "1 0.0 \n",
- "2 0.0 \n",
- "3 0.0 \n",
- "4 0.0 \n",
- "... ... \n",
- "2018 0.0 \n",
- "2019 0.0 \n",
- "2020 0.0 \n",
- "2021 0.0 \n",
- "2022 0.0 \n",
+ " DINSG WZ08N7 WERTE4 JAHR UMS103 QUALITAET GESPERRT WERT-VERFAELSCHT\n",
+ "0 DG WZ08-49-01 NOMINAL 2015 100.0 e 0.0\n",
+ "1 DG WZ08-49-01 NOMINAL 2016 99.3 e 0.0\n",
+ "2 DG WZ08-49-01 NOMINAL 2017 105.7 e 0.0\n",
+ "3 DG WZ08-49-01 NOMINAL 2018 111.6 e 0.0\n",
+ "4 DG WZ08-49-01 NOMINAL 2019 115.6 e 0.0\n",
+ "... ... ... ... ... ... ... ... ...\n",
+ "2018 DG WZ08-N REAL 2017 108.4 e 0.0\n",
+ "2019 DG WZ08-N REAL 2018 110.6 e 0.0\n",
+ "2020 DG WZ08-N REAL 2019 110.8 e 0.0\n",
+ "2021 DG WZ08-N REAL 2020 94.1 e 0.0\n",
+ "2022 DG WZ08-N REAL 2021 101.2 p 0.0\n",
"\n",
"[2023 rows x 8 columns]"
]
},
- "execution_count": 13,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/src/pygenesis/cache.py b/src/pygenesis/cache.py
new file mode 100644
index 0000000..ca5372e
--- /dev/null
+++ b/src/pygenesis/cache.py
@@ -0,0 +1,52 @@
+"""Module provides functions/decorators to cache downloaded data."""
+import logging
+from datetime import date
+from functools import wraps
+from pathlib import Path
+from typing import Callable
+
+import pandas as pd
+
+from pygenesis.config import load_config
+
+logger = logging.getLogger(__name__)
+
+
+def cache_data(func: Callable) -> Callable:
+ """Store downloaded data on disk with download time as parent folder.
+
+ Args:
+ func (Callable): One of the data methods of the data endpoint.
+ """
+
+ @wraps(func)
+ def wrapper_func(**kwargs):
+ config = load_config()
+ cache_dir = Path(config["DATA"]["cache_dir"])
+
+ if not cache_dir.is_dir():
+ logger.critical(
+ "Cache dir does not exist! Please make sure init_config() was run properly. Path: %s",
+ cache_dir,
+ )
+
+ name = kwargs["name"]
+ data_dir = cache_dir / name
+ if data_dir.exists():
+ # TODO: Implement solution for updated data.
+ # So don't return latest version but check first for newer version in GENESIS.
+ # if data_dir exists, there has to be at least one stored version of this data
+ versions = sorted((p.name for p in data_dir.glob("*")), key=int)
+ latest = versions[-1]
+ data = pd.read_csv(data_dir / latest / f"{name}.xz")
+ else:
+ data: pd.DataFrame = func(**kwargs)
+ file_path = (
+ data_dir / str(date.today()).replace("-", "") / f"{name}.xz"
+ )
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+ data.to_csv(file_path, index=False)
+
+ return data
+
+ return wrapper_func
diff --git a/src/pygenesis/config.py b/src/pygenesis/config.py
index 9018a68..8f48d2b 100644
--- a/src/pygenesis/config.py
+++ b/src/pygenesis/config.py
@@ -77,6 +77,10 @@ def init_config(config_dir: Path = DEFAULT_CONFIG_DIR) -> None:
config = _create_default_config()
_write_config(config, config_file)
+ cache_dir = Path(config["DATA"]["cache_dir"])
+ if not cache_dir.exists():
+ cache_dir.mkdir()
+
logger.info("New config was created. Path: %s.", config_file)
diff --git a/src/pygenesis/cube.py b/src/pygenesis/cube.py
index 9df44ee..0015b8b 100644
--- a/src/pygenesis/cube.py
+++ b/src/pygenesis/cube.py
@@ -3,78 +3,41 @@
import pandas as pd
+from pygenesis.cache import cache_data
+from pygenesis.http_helper import get_response_from_endpoint
-def is_cube_metadata_header(line: str) -> bool:
- """Check if a line is a cube metadata header.
- Args:
- line (str): A single line of a cubefile.
-
- Returns:
- bool: True if the line starts with a "K", False otherwise.
- """
- return line[0] == "K"
+@cache_data
+def get_cubefile_data(
+ *, name: str, area: str = "all", **kwargs
+) -> pd.DataFrame:
+ """Return cube file data as pandas data frame.
-
-def get_cube_metadata_header_type(line: str) -> str:
- """Return the header type.
+ Based on the cube name, cube area and additional query parameters the
+ cubefile method from the data-endpoint will be queried.
Args:
- line (str): A single line of a cubefile.
+ name (str): Name of the cube.
+ area (str, optional): Area of the cube. Defaults to "all".
Returns:
- str: The header type, which is the second entry in the header.
+ pd.DataFrame: Parsed cube file.
"""
- return line.split(";")[1]
-
-
-def get_cube_metadata_header(
- line: str, rename_duplicates: bool = False
-) -> list[str]:
- """Return the metadata header of a cubefile.
-
- Args:
- line (str): A single line of a cubefile.
- rename_duplicates (bool, optional): If False, the raw header is returned.
- If True, identical column names are appended with a unique counter.
- Defaults to False.
-
- Returns:
- list[str]: A list of column names, except for "nur Werte" and "mit Werten".
- """
- raw_header = line.split(";")[2:]
- raw_header = [
- name
- for name in raw_header
- if name not in ['"nur Werte"', '"mit Werten"']
- ]
+ kwargs = kwargs or {}
- if not rename_duplicates:
- return raw_header
+ params = {
+ "name": name,
+ "area": area,
+ "format": "csv",
+ }
- # header can have multiple entries with same label, which is problematic for pandas
- # so lets just add a counter
- header = [""] * len(raw_header)
- for name in set(raw_header):
- if raw_header.count(name) == 1:
- header[raw_header.index(name)] = name
- else:
- for counter in range(raw_header.count(name)):
- header[raw_header.index(name) + counter] = f"{name}-{counter+1}"
+ params |= kwargs
- return header
+ response = get_response_from_endpoint("data", "cubefile", params)
+ cube_data = response.text
+ cube = rename_axes(parse_cube(cube_data))
-
-def parse_cube_data_line(line: str) -> list[str]:
- """Return the content of a cube data line.
-
- Args:
- line (str): A single line of a cubefile.
-
- Returns:
- list[str]: The content of a cube data line, omitting the first element.
- """
- return line.split(";")[1:]
+ return cube["QEI"]
def parse_cube(data: str) -> dict:
@@ -92,19 +55,19 @@ def parse_cube(data: str) -> dict:
for line in data.splitlines():
# skip all rows until first header
- if header is None and not is_cube_metadata_header(line):
+ if header is None and not _is_cube_metadata_header(line):
continue
- if is_cube_metadata_header(line):
+ if _is_cube_metadata_header(line):
if data_block:
cube[header_type] = pd.DataFrame(data_block, columns=header)
- header = get_cube_metadata_header(line, rename_duplicates=True)
- header_type: str = get_cube_metadata_header_type(line)
+ header = _get_cube_metadata_header(line, rename_duplicates=True)
+ header_type: str = _get_cube_metadata_header_type(line)
data_block = []
continue
- line_content = parse_cube_data_line(line)
+ line_content = _parse_cube_data_line(line)
data_block.append(line_content)
# the last data block has no header after it so we have to do it here
@@ -157,3 +120,45 @@ def rename_axes(
cube["QEI"].rename(columns=dict(zip(old_cols, new_cols)), inplace=True)
return cube
+
+
+def _is_cube_metadata_header(line: str) -> bool:
+ """Check if a line is a cube metadata header."""
+ return line[0] == "K"
+
+
+def _get_cube_metadata_header_type(line: str) -> str:
+ """Return the header type."""
+ return line.split(";")[1]
+
+
+def _get_cube_metadata_header(
+ line: str, rename_duplicates: bool = False
+) -> list[str]:
+ """Return the metadata header of a cubefile."""
+ raw_header = line.split(";")[2:]
+ raw_header = [
+ name
+ for name in raw_header
+ if name not in ['"nur Werte"', '"mit Werten"']
+ ]
+
+ if not rename_duplicates:
+ return raw_header
+
+ # header can have multiple entries with same label, which is problematic for pandas
+ # so lets just add a counter
+ header = [""] * len(raw_header)
+ for name in set(raw_header):
+ if raw_header.count(name) == 1:
+ header[raw_header.index(name)] = name
+ else:
+ for counter in range(raw_header.count(name)):
+ header[raw_header.index(name) + counter] = f"{name}-{counter+1}"
+
+ return header
+
+
+def _parse_cube_data_line(line: str) -> list[str]:
+ """Return the content of a cube data line."""
+ return line.split(";")[1:]
diff --git a/src/pygenesis/table.py b/src/pygenesis/table.py
index af9b3c4..c2e4a20 100644
--- a/src/pygenesis/table.py
+++ b/src/pygenesis/table.py
@@ -1,35 +1,38 @@
"""Module contains business logic related to destatis tables."""
import pandas as pd
+from pygenesis.cache import cache_data
from pygenesis.csv_helper import get_df_from_text
from pygenesis.http_helper import get_response_from_endpoint
+@cache_data
def get_tablefile_data(
- table_name: str, table_area: str = "all", **kwargs
+ *, name: str, area: str = "all", **kwargs
) -> pd.DataFrame:
- """
+ """Return table file data as pandas data frame.
+
Based on the table name, table area and additional query parameters the
tablefile method from the data-endpoint will be queried.
Args:
- table_name (str): Name of the table
- table_area (str, optional): Area of the table (Defaul: all)
- query_params (dict, optional): Additional query parameters
- (Default: None)
+ name (str): Name of the table.
+ area (str, optional): Area of the table. Defaults to "all".
+
Returns:
- pd.DataFrame
+ pd.DataFrame: Parsed table file.
"""
kwargs = kwargs or {}
params = {
- "name": table_name,
- "area": table_area,
+ "name": name,
+ "area": area,
"format": "ffcsv",
}
params |= kwargs
response = get_response_from_endpoint("data", "tablefile", params)
+
return get_df_from_text(response.text)
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..919436e
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,92 @@
+import time
+from datetime import date
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from pygenesis.cache import cache_data
+from pygenesis.config import (
+ DEFAULT_SETTINGS_FILE,
+ _write_config,
+ init_config,
+ load_settings,
+)
+
+SLEEP_TIME = 0.1
+
+
+@pytest.fixture()
+def cache_dir(tmp_path_factory):
+ return tmp_path_factory.mktemp(".pygenesis")
+
+
+@pytest.fixture(autouse=True)
+def restore_settings():
+ old_settings = load_settings()
+ yield
+ _write_config(old_settings, DEFAULT_SETTINGS_FILE)
+
+
+@cache_data
+def decorated_data(*, name):
+ time.sleep(SLEEP_TIME)
+ return pd.DataFrame(
+ np.random.random(size=(10, 5)), columns=["a", "b", "c", "d", "e"]
+ )
+
+
+def test_cache_data_wrapper(cache_dir):
+ init_config(cache_dir)
+
+ assert len(list((cache_dir / "data").glob("*"))) == 0
+
+ data = decorated_data(name="test_cache_decorator")
+
+ assert isinstance(data, pd.DataFrame)
+ assert not data.empty
+
+ cached_data_file: Path = (
+ cache_dir
+ / "data"
+ / "test_cache_decorator"
+ / str(date.today()).replace("-", "")
+ / "test_cache_decorator.xz"
+ )
+
+ assert cached_data_file.exists() and cached_data_file.is_file()
+
+ objs_in_data = [p for p in cache_dir.joinpath("data").glob("*") if p]
+
+ assert len(objs_in_data) == 1
+ assert objs_in_data[0] == cache_dir / "data" / "test_cache_decorator"
+
+ objs_in_name_dir = [
+ p
+ for p in cache_dir.joinpath("data/test_cache_decorator").glob("*")
+ if p
+ ]
+
+ assert len(objs_in_name_dir) == 1
+ assert objs_in_name_dir[0] == cached_data_file.parent
+
+ restored_data = pd.read_csv(cached_data_file)
+
+ pd.testing.assert_frame_equal(data, restored_data, check_index_type=False)
+
+
+def test_cache_data_twice(cache_dir):
+ init_config(cache_dir)
+
+ load_time = time.perf_counter()
+ data = decorated_data(name="test_cache_decorator")
+ load_time = time.perf_counter() - load_time
+
+ assert load_time >= SLEEP_TIME
+
+ load_time = time.perf_counter()
+ data = decorated_data(name="test_cache_decorator")
+ load_time = time.perf_counter() - load_time
+
+ assert load_time < SLEEP_TIME
diff --git a/tests/test_config.py b/tests/test_config.py
index c0523ae..32c0762 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -16,8 +16,7 @@
@pytest.fixture()
def config_dir(tmp_path_factory):
- config_dir = tmp_path_factory.mktemp(".pygenesis")
- return config_dir
+ return tmp_path_factory.mktemp(".pygenesis")
@pytest.fixture(autouse=True)
@@ -55,12 +54,17 @@ def test_init_config_with_config_dir(config_dir, caplog):
assert caplog.records[1].levelname == "INFO"
assert "Settings file updated" in caplog.text
assert "New config was created" in caplog.text
+ assert (config_dir / "data").exists()
config = load_config()
assert isinstance(config, ConfigParser)
assert len(config.sections()) > 0
+ assert config["DATA"]["cache_dir"] == str(config_dir / "data")
+ assert len(list((config_dir / "data").glob("*"))) == 0
+
config_file = get_config_path_from_settings()
+
assert config_file.exists() and config_file.is_file()