Merge pull request #13 from jrasband-dev/dev-0.5.0

jrasband-dev · web-flow · commit e351ee3feb83 · 2025-01-17T15:01:18.000-07:00
Dev 0.5.0
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
 dist
 __pycache__
 polars_extensions.egg-info
-venv
+.venv
 .codegpt
diff --git a/datasets/string_sim.csv b/datasets/string_sim.csv
@@ -0,0 +1,14 @@
+a,c
+apple,appl
+banana,BANANA
+cherry,cherr
+date,etad
+elderberry,elderberrys
+fig,FIG
+grape,gr@pe
+honeydew,ywendeyoh
+kiwi,KIW
+lemon,lemons
+mangoes are Tangy,mango are Tangy
+it was the best of times,it was the worst of times
+of times it was the best,it was the worst of times
diff --git a/polars_extensions/__init__.py b/polars_extensions/__init__.py
@@ -1,10 +1,12 @@
 from .io import *
 from .name import *
 from .numeric import *
+from .string import *
 
 __all__ = [
     "NameExtensionNameSpace",
     "NumericExtensionNamespace",
+    "StringExtensionNamespace",
     "write_schema",
     "read_schema",
 ]
diff --git a/polars_extensions/io.py b/polars_extensions/io.py
@@ -1,10 +1,7 @@
 import json
 from typing import Union
-import ast
-
 import polars as pl
 
-
 def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
     "Saves a Polars schema a JSON file"
     if isinstance(schema, pl.DataFrame):
@@ -17,12 +14,17 @@ def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
         json.dump(schema_dict, f)
     return
 
-
 def read_schema(file: str):
-    "Opens a JSON Schema file and return a Polars Schema object"
-    f = open(file, "r")
-    schema = json.load(f)
-    f.close()
-    schema_dict = {k: ast.literal_eval(f"pl.{v}") for k, v in schema.items()}
+    """Open a JSON schema file and return a Polars Schema object.
+
+    The file must contain a JSON object mapping column names to type names.
+    Each type name is looked up as an attribute of the ``polars`` module.
+
+    Parameters:
+    file (str): Path of the JSON schema file to read.
+
+    Returns:
+    pl.Schema: Schema built from the column-name -> type mapping.
+
+    Raises:
+    ValueError: If a type name is not a string or is not an attribute of
+        ``polars``.
+    """
+    with open(file, "r") as f:
+        schema = json.load(f)
+
+    schema_dict = {}
+    for k, v in schema.items():
+        try:
+            # NOTE(review): getattr resolves any attribute of the polars
+            # module, not only data types - confirm whether stricter
+            # validation is wanted.
+            schema_dict[k] = getattr(pl, v)
+        except (AttributeError, TypeError) as err:
+            # TypeError covers non-string JSON values (null, numbers, ...),
+            # which getattr rejects before any attribute lookup happens.
+            raise ValueError(f"Invalid type {v} for column {k}") from err
+
     schema_object = pl.Schema(schema_dict)
-    return schema_object
+    return schema_object
diff --git a/polars_extensions/string.py b/polars_extensions/string.py
@@ -0,0 +1,58 @@
+import polars as pl
+
+@pl.api.register_dataframe_namespace("str_ext")
+class StringExtensionNamespace:
+    """String extensions for Polars DataFrames, registered as ``str_ext``."""
+
+    def __init__(self, df: pl.DataFrame):
+        self._df = df
+
+    def f1_string_similarity(self, col_a: str, col_b: str) -> pl.DataFrame:
+        """
+        Calculates a similarity score between two columns of strings based on common characters,
+        accounting for repeated characters.
+
+        Each pair of values is lower-cased and compared as multisets of
+        characters (order is ignored). The score is
+        ``2 * common / (len(a) + len(b))``; identical strings, including two
+        empty strings, score 1.0.
+
+        Parameters:
+        col_a (str): The name of the first column to compare.
+        col_b (str): The name of the second column to compare.
+
+        Returns:
+        pl.DataFrame: A DataFrame with the similarity scores as a new Float64
+        column named "f1_score". Rows where either value is null get a null
+        score.
+        """
+        # Imported here so the method does not depend on module-level imports.
+        from collections import Counter
+
+        def similarity(row_str_a, row_str_b):
+            # Null cells come through as None; propagate the null instead of
+            # failing on None.lower().
+            if row_str_a is None or row_str_b is None:
+                return None
+
+            # Normalize both strings (case-insensitive comparison)
+            row_str_a = row_str_a.lower()
+            row_str_b = row_str_b.lower()
+
+            # If strings are identical, return a score of 1.0
+            if row_str_a == row_str_b:
+                return 1.0
+
+            # Multiset intersection: every character counts
+            # min(occurrences in a, occurrences in b) times.
+            shared = Counter(row_str_a) & Counter(row_str_b)
+            common_chars = sum(shared.values())
+
+            # The strings differ here, so at least one is non-empty and the
+            # denominator cannot be zero.
+            total_chars = len(row_str_a) + len(row_str_b)
+            return (2 * common_chars) / total_chars
+
+        # Apply the similarity function row-by-row
+        similarity_scores = [
+            similarity(row_a, row_b)
+            for row_a, row_b in zip(self._df[col_a], self._df[col_b])
+        ]
+
+        # Pin the dtype so empty or all-null input still yields a Float64
+        # column rather than an inferred Null column.
+        return self._df.with_columns(
+            pl.Series("f1_score", similarity_scores, dtype=pl.Float64)
+        )
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "polars-extensions"
-version = "0.4.0"
+version = "0.5.0"
 description = "The Library of Polars Extensions"
 readme = "README.md"
 authors = [
diff --git a/usage.ipynb b/usage.ipynb
diff --git a/uv.lock b/uv.lock