Skip to content

Commit e351ee3

Browse files
authored
Merge pull request #13 from jrasband-dev/dev-0.5.0
Dev 0.5.0
2 parents cd4a87f + 3a965aa commit e351ee3

File tree

8 files changed

+310
-36
lines changed

8 files changed

+310
-36
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
dist
22
__pycache__
33
polars_extensions.egg-info
4-
venv
4+
.venv
55
.codegpt

datasets/string_sim.csv

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
a,c
2+
apple,appl
3+
banana,BANANA
4+
cherry,cherr
5+
date,etad
6+
elderberry,elderberrys
7+
fig,FIG
8+
grape,gr@pe
9+
honeydew,ywendeyoh
10+
kiwi,KIW
11+
lemon,lemons
12+
mangoes are Tangy,mango are Tangy
13+
it was the best of times,it was the worst of times
14+
of times it was the best,it was the worst of times

polars_extensions/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from .io import *
22
from .name import *
33
from .numeric import *
4+
from .string import *
45

56
__all__ = [
67
"NameExtensionNameSpace",
78
"NumericExtensionNamespace",
9+
"StringExtensionNamespace",
810
"write_schema",
911
"read_schema",
1012
]

polars_extensions/io.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
import json
22
from typing import Union
3-
import ast
4-
53
import polars as pl
64

7-
85
def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
96
"Saves a Polars schema a JSON file"
107
if isinstance(schema, pl.DataFrame):
@@ -17,12 +14,17 @@ def write_schema(schema: Union[pl.DataFrame, pl.Schema], file: str):
1714
json.dump(schema_dict, f)
1815
return
1916

20-
2117
def read_schema(file: str):
    """Open a JSON schema file and return a Polars Schema object.

    Parameters:
        file (str): Path to a JSON file holding a mapping of column name
            to the name of an attribute of the ``polars`` module.

    Returns:
        pl.Schema: A schema built from the resolved column/type mapping.

    Raises:
        ValueError: If a type name in the file cannot be resolved to an
            attribute of the ``polars`` module.
    """
    with open(file, "r") as f:
        schema = json.load(f)

    schema_dict = {}
    for k, v in schema.items():
        try:
            # Resolve the stored name by attribute lookup on the polars
            # module rather than evaluating it as an expression.
            schema_dict[k] = getattr(pl, v)
        except (AttributeError, TypeError) as err:
            # AttributeError: unknown name. TypeError: the JSON value is not
            # a string (e.g. null or a number). Both are invalid types for
            # the column; chain the cause so the traceback keeps it.
            raise ValueError(f"Invalid type {v} for column {k}") from err

    schema_object = pl.Schema(schema_dict)
    return schema_object

polars_extensions/string.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import polars as pl
2+
3+
@pl.api.register_dataframe_namespace("str_ext")
class StringExtensionNamespace:
    """String Extensions for the Polars Library"""

    def __init__(self, df: pl.DataFrame):
        self._df = df

    def f1_string_similarity(self, col_a: str, col_b: str) -> pl.DataFrame:
        """
        Calculates a similarity score between two columns of strings based on common characters,
        accounting for repeated characters.

        The comparison is case-insensitive. For each row the score is
        ``2 * common / (len(a) + len(b))``, where ``common`` is the number of
        characters the two strings share, counted with multiplicity. Identical
        strings (after lower-casing) score 1.0. If either value in a row is
        null, the score for that row is null.

        Parameters:
            col_a (str): The name of the first column to compare.
            col_b (str): The name of the second column to compare.

        Returns:
            pl.DataFrame: A DataFrame with the similarity scores as a new
            ``f1_score`` column.
        """
        # Function-scope import so this block does not depend on the
        # module's import header.
        from collections import Counter

        def similarity(row_str_a, row_str_b):
            """Return the score for one pair of values, or None if either is null."""
            # Null cells cannot be compared; propagate null instead of
            # failing on ``None.lower()``.
            if row_str_a is None or row_str_b is None:
                return None

            # Normalize both strings (case-insensitive comparison)
            lowered_a = row_str_a.lower()
            lowered_b = row_str_b.lower()

            # If strings are identical, return a score of 1.0
            if lowered_a == lowered_b:
                return 1.0

            # Multiset intersection: each shared character is counted as many
            # times as it appears in both strings.
            common_chars = sum((Counter(lowered_a) & Counter(lowered_b)).values())

            # The denominator cannot be zero here: two empty strings are
            # equal and were handled by the identity check above.
            return (2 * common_chars) / (len(lowered_a) + len(lowered_b))

        # Apply the similarity function row-by-row
        similarity_scores = [
            similarity(row_a, row_b)
            for row_a, row_b in zip(self._df[col_a], self._df[col_b])
        ]

        # Add the similarity scores as a new column. The dtype is pinned so an
        # empty or all-null result is still a float column.
        return self._df.with_columns(
            pl.Series("f1_score", similarity_scores, dtype=pl.Float64)
        )

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "polars-extensions"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "The Library of Polars Extensions"
55
readme = "README.md"
66
authors = [

0 commit comments

Comments
 (0)