Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix COUNT(*) ERROR for PDF SELECT Queries #1376

Merged
merged 5 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions evadb/parser/lark_visitor/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ def aggregate_windowed_function(self, tree):
# Support for COUNT(*)
if token != "*":
agg_func_name = token
elif token == "*":
agg_func_arg = TupleValueExpression(name="_row_id")
else:
agg_func_arg = TupleValueExpression(name="id")

Expand Down
Binary file not shown.
32 changes: 32 additions & 0 deletions test/integration_tests/short/test_select_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,38 @@ def test_select_and_groupby_with_sample(self):
expected_batch.project(["FIRST.id", "SEGMENT.data"]),
)

def test_select_and_groupby_and_aggregate_with_pdf(self):
    """Check COUNT(*) with a paragraph-based GROUP BY on a PDF table.

    Loads a single PDF into ``MyPDFs``, runs ``SELECT COUNT(*)`` grouped by
    ``GROUPBY_SIZE`` paragraphs, and verifies that:

    * the number of groups times the first group's count is within
      ``GROUPBY_SIZE`` of the total number of rows in the table,
    * exactly 99 groups are returned, and
    * every group reports a count of ``GROUPBY_SIZE``.
    """
    GROUPBY_SIZE = 8
    drop_query = "DROP TABLE IF EXISTS MyPDFs;"
    execute_query_fetch_all(self.evadb, drop_query)
    try:
        # load from directory
        pdf_path = (
            "test/data/uadetrac/small-data/pdf_data/fall_2023_orientation_document.pdf"
        )
        load_query = f"LOAD PDF '{pdf_path}' INTO MyPDFs;"
        execute_query_fetch_all(self.evadb, load_query)
        select_all_query = "SELECT * FROM MyPDFs;"
        all_pdf_batch = execute_query_fetch_all(self.evadb, select_all_query)

        select_query = (
            f"SELECT COUNT(*) FROM MyPDFs GROUP BY '{GROUPBY_SIZE} paragraphs';"
        )
        actual_batch = execute_query_fetch_all(self.evadb, select_query)

        # The total row count may differ from groups * group size by up to
        # one group's worth of rows, hence the explicit ``delta``.
        self.assertAlmostEqual(
            len(all_pdf_batch),
            len(actual_batch) * actual_batch.frames.iloc[0, 0],
            delta=GROUPBY_SIZE,
        )
        self.assertEqual(len(actual_batch), 99)
        # The first column holds the COUNT(*) value of each group.
        for group_count in actual_batch.frames.iloc[:, 0]:
            self.assertEqual(group_count, GROUPBY_SIZE)
    finally:
        # tear down: always drop the table, even when an assertion fails,
        # so a failure here does not leave MyPDFs behind.
        execute_query_fetch_all(self.evadb, drop_query)

def test_lateral_join_with_unnest_and_sample(self):
query = """SELECT id, label
FROM MyVideo SAMPLE 2 JOIN LATERAL
Expand Down