Skip to content
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 16 additions & 21 deletions polars/groupby-polars.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import gc
import timeit
import polars as pl
-from polars.lazy import col
+from polars import col

exec(open("./_helpers/helpers.py").read())

Expand All @@ -23,13 +23,8 @@
print("loading dataset %s" % data_name, flush=True)

with pl.StringCache():
-x = pl.read_csv(src_grp, dtype={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64}, low_memory=True)
-x["id1"] = x["id1"].cast(pl.Categorical)
-x["id1"].shrink_to_fit(in_place=True)
-x["id2"] = x["id2"].cast(pl.Categorical)
-x["id2"].shrink_to_fit(in_place=True)
-x["id3"] = x["id3"].cast(pl.Categorical)
-x["id3"].shrink_to_fit(in_place=True)
+x = (pl.read_csv(src_grp, dtype={"id4":pl.Int32, "id5":pl.Int32, "id6":pl.Int32, "v1":pl.Int32, "v2":pl.Int32, "v3":pl.Float64}, low_memory=True)
+.with_columns(pl.col(["id1", "id2", "id3"]).cast(pl.Categorical)))

in_rows = x.shape[0]
x = x.lazy()
Expand All @@ -42,7 +37,7 @@
question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id1").agg(pl.sum("v1")).collect()
+ans = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -53,7 +48,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id1").agg(pl.sum("v1")).collect()
+ans = x.groupby("id1").agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -68,7 +63,7 @@
question = "sum v1 by id1:id2" # q2
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby(["id1","id2"]).agg(pl.sum("v1")).collect()
+ans = x.groupby(["id1","id2"]).agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -79,7 +74,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby(["id1","id2"]).agg(pl.sum("v1")).collect()
+ans = x.groupby(["id1","id2"]).agg(pl.sum("v1").alias("v1_sum")).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -94,7 +89,7 @@
question = "sum v1 mean v3 by id3" # q3
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
+ans = x.groupby("id3").agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -105,7 +100,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id3").agg([pl.sum("v1"), pl.mean("v3")]).collect()
+ans = x.groupby("id3").agg([pl.sum("v1").alias("v1_sum"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -120,7 +115,7 @@
question = "mean v1:v3 by id4" # q4
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()
+ans = x.groupby("id4").agg([pl.mean("v1").alias("v1_mean"), pl.mean("v2").alias("v2_mean"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -131,7 +126,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id4").agg([pl.mean("v1"), pl.mean("v2"), pl.mean("v3")]).collect()
+ans = x.groupby("id4").agg([pl.mean("v1").alias("v1_mean"), pl.mean("v2").alias("v2_mean"), pl.mean("v3").alias("v3_mean")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -146,7 +141,7 @@
question = "sum v1:v3 by id6" # q5
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()
+ans = x.groupby("id6").agg([pl.sum("v1").alias("v1_sum"), pl.sum("v2").alias("v2_sum"), pl.sum("v3").alias("v3_sum")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -157,7 +152,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.groupby("id6").agg([pl.sum("v1"), pl.sum("v2"), pl.sum("v3")]).collect()
+ans = x.groupby("id6").agg([pl.sum("v1").alias("v1_sum"), pl.sum("v2").alias("v2_sum"), pl.sum("v3").alias("v3_sum")]).collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand Down Expand Up @@ -224,7 +219,7 @@
question = "largest two v3 by id6" # q8
gc.collect()
t_start = timeit.default_timer()
-ans = x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(col("v3").head(2).alias("largest2_v3")).explode("largest2_v3").collect()
+ans = x.drop_nulls("v3").groupby("id6").agg(col("v3").top_k(2).alias("largest2_v3")).explode("largest2_v3").collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand All @@ -235,7 +230,7 @@
del ans
gc.collect()
t_start = timeit.default_timer()
-ans = x.drop_nulls("v3").sort("v3", reverse=True).groupby("id6").agg(col("v3").head(2).alias("largest2_v3")).explode("largest2_v3").collect()
+ans = x.drop_nulls("v3").groupby("id6").agg(col("v3").top_k(2).alias("largest2_v3")).explode("largest2_v3").collect()
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
Expand Down Expand Up @@ -299,6 +294,6 @@
print(ans.tail(3), flush=True)
del ans

-print("grouping finished, took %0.fs" % (timeit.default_timer() - task_init), flush=True)
+print("grouping finished, took %0.3fs" % (timeit.default_timer() - task_init), flush=True)

exit(0)