From 54e4843ebc99ae6e1cf49117ccb0371afb0792ff Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Mon, 5 Oct 2020 11:23:17 -0400 Subject: [PATCH 1/2] Add index position information to column class (#11) --- histore/archive/schema.py | 4 +++- histore/document/mem/json.py | 11 ++++++++-- histore/document/schema.py | 20 ++++++++++++++----- histore/version.py | 2 +- .../manager/test_persistent_manager.py | 6 ++++++ tests/document/test_document_schema.py | 8 ++++++++ tests/document/test_json_document.py | 20 +++++++++++++++++-- 7 files changed, 60 insertions(+), 11 deletions(-) diff --git a/histore/archive/schema.py b/histore/archive/schema.py index ecdd6cb..4f1c83b 100644 --- a/histore/archive/schema.py +++ b/histore/archive/schema.py @@ -223,7 +223,9 @@ def at_version(self, version): # Sort columns based on their position and return a list of snapshot # columns. cols.sort(key=lambda x: x[2]) - return [Column(colid=id, name=name) for id, name, _ in cols] + return [ + Column(colid=id, name=name, colidx=pos) for id, name, pos in cols + ] def merge( self, columns, version, matching=MATCH_IDNAME, renamed=None, diff --git a/histore/document/mem/json.py b/histore/document/mem/json.py index 43808e7..a30aec3 100644 --- a/histore/document/mem/json.py +++ b/histore/document/mem/json.py @@ -87,9 +87,16 @@ def __init__(self, doc, validate=True): columns = list() for obj in doc['columns']: if isinstance(obj, dict): - columns.append(Column(colid=obj['id'], name=obj['name'])) + col = Column( + colid=obj['id'], + name=obj['name'], + colidx=len(columns) + ) else: - columns.append(obj) + # Assumes that the object is a scalar value (string or number) + # representing the column name. + col = Column(colid=-1, name=obj, colidx=len(columns)) + columns.append(col) # Get the document rows. rows = doc['data'] # Create the keys for the document rows. 
diff --git a/histore/document/schema.py b/histore/document/schema.py index b6ebc13..0f358eb 100644 --- a/histore/document/schema.py +++ b/histore/document/schema.py @@ -10,16 +10,21 @@ column value in a Pandas data frame. """ +from typing import List, Optional, Union + +"""Type alias for column lists.""" +Columns = Union[str, int, List[Union[int, str]]] + class Column(str): - """Columns in openclean data frames are subclasses of Python strings that + """Columns in histore data frames are subclasses of Python strings that contain a unique column identifier. This implementation is based on: https://bytes.com/topic/python/answers/32098-my-experiences-subclassing-string The order of creation is that the __new__ method is called which returns the object then __init__ is called. """ - def __new__(cls, colid, name, *args, **keywargs): + def __new__(cls, colid: int, name: str, colidx: Optional[int] = None): """Initialize the String object with the given column name. Ignore the column identifier. @@ -29,10 +34,12 @@ def __new__(cls, colid, name, *args, **keywargs): Unique column identifier name: string Column name + colidx: int, default=None + Index position of the column in a dataset schema. """ return str.__new__(cls, str(name)) - def __init__(self, colid, name): + def __init__(self, colid: int, name: str, colidx: Optional[int] = None): """Initialize the unique column identifier. The column name has already been initialized by the __new__ method that is called prior to the __init__ method. @@ -43,14 +50,17 @@ def __init__(self, colid, name): Unique column identifier name: string Column name + colidx: int, default=None + Index position of the column in a dataset schema. """ self.colid = colid + self.colidx = colidx # -- Helper methods ----------------------------------------------------------- -def column_index(schema, columns): +def column_index(schema: List[str], columns: Columns): """Get the list of column index positions in a given schema (list of column names). 
Columns are either specified by name or by index position. @@ -63,7 +73,7 @@ def column_index(schema, columns): ---------- schema: list(string) List of column names. - columns: list(int or str) + columns: int, str, or list(int or str) List of column index positions or column names. Returns diff --git a/histore/version.py b/histore/version.py index d6049c3..3b5180a 100644 --- a/histore/version.py +++ b/histore/version.py @@ -6,4 +6,4 @@ # file LICENSE for full license details. """Code version information for histore.""" -__version__ = '0.1.3' +__version__ = '0.1.4' diff --git a/tests/archive/manager/test_persistent_manager.py b/tests/archive/manager/test_persistent_manager.py index 3a53513..444adcf 100644 --- a/tests/archive/manager/test_persistent_manager.py +++ b/tests/archive/manager/test_persistent_manager.py @@ -17,6 +17,7 @@ from histore.archive.manager.db.database import DB, TEST_URL from histore.archive.manager.fs import FileSystemArchiveManager from histore.archive.manager.persist import PersistentArchiveManager +from histore.document.schema import Column import histore.config as config @@ -93,6 +94,11 @@ def test_encoder_default(ManagerCls, kwargs, tmpdir): assert df.shape == (1, 1) assert df.iloc[0][0] == dt assert isinstance(df.iloc[0][0], datetime) + # DataFrame schema + for col in df.columns: + assert isinstance(col, Column) + assert col.colid >= 0 + assert col.colidx >= 0 @pytest.mark.parametrize( diff --git a/tests/document/test_document_schema.py b/tests/document/test_document_schema.py index 9428bdc..d52a842 100644 --- a/tests/document/test_document_schema.py +++ b/tests/document/test_document_schema.py @@ -32,7 +32,15 @@ def test_column_index(): def test_document_columns(): """Test creating instances of document schema columns.""" + # -- Column without index position ---------------------------------------- col = Column(colid=1, name='my_col') assert col == 'my_col' assert isinstance(col, str) assert col.colid == 1 + assert col.colidx is None + # 
-- Column with index position ------------------------------------------- + col = Column(colid=1, name='my_col', colidx=10) + assert col == 'my_col' + assert isinstance(col, str) + assert col.colid == 1 + assert col.colidx == 10 diff --git a/tests/document/test_json_document.py b/tests/document/test_json_document.py index 9fd71db..ac03048 100644 --- a/tests/document/test_json_document.py +++ b/tests/document/test_json_document.py @@ -32,6 +32,13 @@ def test_json_document_without_key(): doc = JsonDocument( doc={'columns': ['Name', 'Age'], 'data': [['Bob', 23], ['Alice', 24]]} ) + # -- Test schema identifier and index ------------------------------------- + columns = dict() + for col in doc.columns: + columns[col] = (col.colid, col.colidx) + assert columns['Name'] == (-1, 0) + assert columns['Age'] == (-1, 1) + # -- Test row values and positions ---------------------------------------- reader = doc.reader(schema=[Column(0, 'Name'), Column(1, 'Age')]) keys, positions, names = list(), list(), list() while reader.has_next(): @@ -46,13 +53,22 @@ def test_json_document_without_key(): def test_json_document_with_pk(): """Test creating an instance of the Json document with a primary key.""" - SCHEMA = [{'id': 1, 'name': 'Name'}, {'id': 0, 'name': 'Age'}] doc = JsonDocument( doc={ - 'columns': SCHEMA, + 'columns': [ + {'id': 1, 'name': 'Name'}, + {'id': 0, 'name': 'Age'} + ], 'data': [['Bob', 23], ['Alice', 24]], 'primaryKey': ['Name']} ) + # -- Test schema identifier and index ------------------------------------- + columns = dict() + for col in doc.columns: + columns[col] = (col.colid, col.colidx) + assert columns['Name'] == (1, 0) + assert columns['Age'] == (0, 1) + # -- Test row values and positions ---------------------------------------- reader = doc.reader(schema=doc.columns) keys, positions, names = list(), list(), list() while reader.has_next(): From c9bb6026003c00ea425bacbe86fa76c250076593 Mon Sep 17 00:00:00 2001 From: Heiko Mueller Date: Wed, 7 Oct 2020 09:48:18 
-0400 Subject: [PATCH 2/2] Version 0.1.4 --- changelog.md | 5 +++++ tests/archive/manager/test_persistent_manager.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 42678ce..0da2d10 100644 --- a/changelog.md +++ b/changelog.md @@ -23,3 +23,8 @@ ### 0.1.3 - 2020-10-05 * Add archive manager that maintains descriptors in a relational database (\#8) + + +### 0.1.4 - 2020-10-07 + +* Add index position information to column class (\#11) diff --git a/tests/archive/manager/test_persistent_manager.py b/tests/archive/manager/test_persistent_manager.py index 444adcf..fedb50b 100644 --- a/tests/archive/manager/test_persistent_manager.py +++ b/tests/archive/manager/test_persistent_manager.py @@ -37,7 +37,7 @@ def test_create_archive(ManagerCls, kwargs, tmpdir): # -- Create empty manager instance ---------------------------------------- manager = ManagerCls(**kwargs) assert len(manager.archives()) == 0 - # -- Ad first archive ----------------------------------------------------- + # -- Add first archive ---------------------------------------------------- descriptor = manager.create( name='First archive', description='My first archive',