Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting spaces in column names for csv files #1388

Merged
merged 5 commits into from
Dec 3, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions evadb/parser/lark_visitor/_common_clauses_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ def full_id(self, tree):
return (self.visit(tree.children[0]), self.visit(tree.children[1]))

def uid(self, tree):
    """Visit a `uid` grammar node and return the identifier as a string.

    Back-tick-quoted identifiers (e.g. ``\`frame id\``, used for column
    names containing spaces) arrive as REVERSE_QUOTE_ID tokens; for those
    the surrounding back-ticks are stripped and the bare text returned.
    All other children are delegated to the generic visitor.
    """
    child = tree.children[0]
    # Tokens carry a `.type`; subtrees do not, so getattr covers both.
    if getattr(child, "type", None) == "REVERSE_QUOTE_ID":
        # Re-tag the token so downstream passes treat it as a plain id.
        child.type = "simple_id"
        return str(child).replace("`", "")
    return self.visit(child)

def full_column_name(self, tree):
Expand Down
44 changes: 44 additions & 0 deletions test/integration_tests/short/test_load_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import unittest
from pathlib import Path
from test.util import (
create_csv_with_comlumn_name_spaces,
create_dummy_csv_batches,
create_sample_csv,
create_sample_video,
Expand Down Expand Up @@ -117,6 +118,49 @@ def test_should_load_csv_in_table(self):
drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
execute_query_fetch_all(self.evadb, drop_query)

###################################
# integration tests for csv files with spaces in column names
def test_should_load_csv_in_table_with_spaces_in_column_name(self):
    """End-to-end check that back-tick-quoted column names containing
    spaces survive CREATE TABLE, LOAD CSV, and SELECT.

    NOTE(review): reuses the table name MyVideoCSV and the dummy.csv
    path shared with other tests in this class — relies on the DROP at
    the end (and IF NOT EXISTS at the start) for isolation.
    """
    # loading a csv requires a table to be created first
    create_table_query = """

        CREATE TABLE IF NOT EXISTS MyVideoCSV (
            id INTEGER UNIQUE,
            `frame id` INTEGER,
            `video id` INTEGER,
            `dataset name` TEXT(30),
            label TEXT(30),
            bbox NDARRAY FLOAT32(4),
            `object id` INTEGER
        );

        """
    execute_query_fetch_all(self.evadb, create_table_query)

    # load the CSV; the helper writes a file whose header row has
    # space-containing column names matching the schema above
    load_query = (
        f"LOAD CSV '{create_csv_with_comlumn_name_spaces()}' INTO MyVideoCSV;"
    )
    execute_query_fetch_all(self.evadb, load_query)

    # execute a select query using the back-tick-quoted names
    select_query = """SELECT id, `frame id`, `video id`,
                      `dataset name`, label, bbox,
                      `object id`
                      FROM MyVideoCSV;"""

    actual_batch = execute_query_fetch_all(self.evadb, select_query)
    actual_batch.sort()

    # assert the batches are equal
    expected_batch = next(create_dummy_csv_batches())
    expected_batch.modify_column_alias("myvideocsv")
    self.assertEqual(actual_batch, expected_batch)

    # clean up
    drop_query = "DROP TABLE IF EXISTS MyVideoCSV;"
    execute_query_fetch_all(self.evadb, drop_query)


# Allow running this test module directly (outside the pytest runner).
if __name__ == "__main__":
    unittest.main()
31 changes: 31 additions & 0 deletions test/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,37 @@ def create_sample_csv(num_frames=NUM_FRAMES):
return os.path.join(get_tmp_dir(), "dummy.csv")


def create_csv_with_comlumn_name_spaces(num_frames=NUM_FRAMES):
    """Write a sample CSV whose column names contain spaces and return its path.

    Generates ``num_frames`` rows for each of 2 videos with columns
    ``id``, ``frame id``, ``video id``, ``dataset name``, ``label``,
    ``bbox`` (four comma-joined random floats) and ``object id``, then
    writes them to ``<tmp_dir>/dummy.csv`` (overwriting any prior file).

    NOTE: the misspelling "comlumn" in the function name is kept for
    backward compatibility with existing callers.

    Args:
        num_frames: number of frames generated per video.

    Returns:
        str: path of the CSV file that was written.
    """
    # Compute the target path once so remove/write/return cannot drift apart.
    csv_path = os.path.join(get_tmp_dir(), "dummy.csv")
    try:
        os.remove(csv_path)
    except FileNotFoundError:
        # Nothing to clean up on the first run.
        pass

    sample_labels = ["car", "pedestrian", "bicycle"]
    num_videos = 2

    rows = []
    for video_id in range(num_videos):
        for frame_id in range(num_frames):
            random_coords = 200 + 300 * np.random.random(4)
            rows.append(
                {
                    # Row position doubles as the unique id.
                    "id": len(rows),
                    "frame id": frame_id,
                    "video id": video_id,
                    "dataset name": "test_dataset",
                    "label": sample_labels[np.random.choice(len(sample_labels))],
                    "bbox": ",".join(str(coord) for coord in random_coords),
                    "object id": np.random.choice(3),
                }
            )

    # index=False: the 'id' column already carries the row identity.
    pd.DataFrame(rows).to_csv(csv_path, index=False)
    return csv_path


def create_dummy_csv_batches(target_columns=None):
if target_columns:
df = pd.read_csv(
Expand Down