
Commit 1c89aef

clean code

1 parent 06eabf3 · commit 1c89aef

File tree

4 files changed: +127 -93 lines changed

table-sort.py

Lines changed: 14 additions & 12 deletions

@@ -5,20 +5,22 @@


 def main(headers):
-    rows = csv.DictReader(sys.stdin, delimiter='\t', dialect='excel-tab')
+    rows = csv.DictReader(sys.stdin, delimiter="\t", dialect="excel-tab")

-    # More efficient header check using set intersection
-    if not set(headers).intersection(rows.fieldnames):
-        raise ValueError(f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table.")
+    # More efficient header check using set intersection
+    if not set(headers).intersection(rows.fieldnames):
+        raise ValueError(
+            f"Couldn't find any of supplied headers ({', '.join(map(repr, headers))}) in the table."
+        )

-    # Optimized sorting using tuple comparison (generally faster than list comparison)
-    items = sorted(rows, key=lambda d: tuple(d.get(h, "") for h in headers))
+    # Optimized sorting using tuple comparison (generally faster than list comparison)
+    items = sorted(rows, key=lambda d: tuple(d.get(h, "") for h in headers))

-    wr = csv.DictWriter(sys.stdout, dialect='excel-tab', fieldnames=rows.fieldnames)
-    wr.writeheader()
-    wr.writerows(items)
-    # sys.stdout.flush()
+    wr = csv.DictWriter(sys.stdout, dialect="excel-tab", fieldnames=rows.fieldnames)
+    wr.writeheader()
+    wr.writerows(items)
+    # sys.stdout.flush()


-if __name__ == '__main__':
-    main(sys.argv[1:])
+if __name__ == "__main__":
+    main(sys.argv[1:])
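
The sort key above builds a tuple of column values per row, so rows compare field by field in the order the headers are supplied, and a row missing a column falls back to "". A minimal standalone sketch of that technique, using example "Age" and "Name" columns instead of the script's stdin input:

# Illustrative only: tuple keys compare element by element, left to right.
rows = [
    {"Name": "bob", "Age": "30"},
    {"Name": "alice", "Age": "25"},
    {"Name": "carol"},  # missing "Age" sorts as ""
]
headers = ["Age", "Name"]
ordered = sorted(rows, key=lambda d: tuple(d.get(h, "") for h in headers))
# [{'Name': 'carol'}, {'Name': 'alice', 'Age': '25'}, {'Name': 'bob', 'Age': '30'}]

The values stay strings, so the ordering is lexicographic, which matches the script's behavior.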

table-summarize.py

Lines changed: 29 additions & 25 deletions

@@ -8,28 +8,32 @@


 def main(table):
-    with open(table, 'r', newline='', encoding='utf-8') as table_f:  # Improved file opening
-        rdr = csv.DictReader(table_f, delimiter='\t', dialect='excel')
-
-        # Check if fieldnames exist before proceeding to avoid potential errors
-        if not rdr.fieldnames or len(rdr.fieldnames) <= 1:
-            print("No data columns found in the table.")
-            return
-
-        summary = OrderedDict()
-        for row in rdr:  # Iterate directly without creating a list in memory
-            for name in rdr.fieldnames[1:]:
-                summary.setdefault(name, Counter()).update([row[name]])  # More efficient counting
-
-        total = rdr.line_num - 1  # get the number of rows
-
-    print("Summary:")
-    for name, results in summary.items():
-        print(f'{name}:')  # f-string
-        for result, num in results.items():
-            if result:
-                print(f"\t - {result}: {num} of {total}")  # f-string
-
-
-if __name__ == '__main__':
-    main(sys.argv[1])
+    with open(
+        table, "r", newline="", encoding="utf-8"
+    ) as table_f:  # Improved file opening
+        rdr = csv.DictReader(table_f, delimiter="\t", dialect="excel")
+
+        # Check if fieldnames exist before proceeding to avoid potential errors
+        if not rdr.fieldnames or len(rdr.fieldnames) <= 1:
+            print("No data columns found in the table.")
+            return
+
+        summary = OrderedDict()
+        for row in rdr:  # Iterate directly without creating a list in memory
+            for name in rdr.fieldnames[1:]:
+                summary.setdefault(name, Counter()).update(
+                    [row[name]]
+                )  # More efficient counting
+
+        total = rdr.line_num - 1  # get the number of rows
+
+    print("Summary:")
+    for name, results in summary.items():
+        print(f"{name}:")  # f-string
+        for result, num in results.items():
+            if result:
+                print(f"\t - {result}: {num} of {total}")  # f-string
+
+
+if __name__ == "__main__":
+    main(sys.argv[1])
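
The counting pattern above keeps one Counter per data column and feeds it one cell at a time via setdefault(...).update([value]), so the summary never needs the whole table in memory. A rough standalone sketch of the same idea, with made-up rows in place of the script's TSV input:

# Illustrative only: per-column counting with setdefault + Counter.
from collections import Counter, OrderedDict

rows = [
    {"id": "1", "color": "red", "size": "S"},
    {"id": "2", "color": "red", "size": "M"},
    {"id": "3", "color": "blue", "size": "M"},
]
fieldnames = ["id", "color", "size"]

summary = OrderedDict()
for row in rows:
    for name in fieldnames[1:]:  # skip the first (identifier) column
        summary.setdefault(name, Counter()).update([row[name]])

# summary == OrderedDict with {'color': Counter({'red': 2, 'blue': 1}),
#                              'size': Counter({'M': 2, 'S': 1})}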

table-union.py

Lines changed: 53 additions & 46 deletions

@@ -5,49 +5,56 @@


 def main(unionize=True, *files):
-    header = []
-    items = []
-    possible_identity_headers = None
-
-    for fi in files:
-        with open(fi, 'r', newline='', encoding='utf-8') as table:  # Improved file opening
-            reader = csv.DictReader(table, delimiter='\t', dialect='excel-tab')
-
-            # Efficient header update using set operations
-            header_set = set(header)
-            new_headers = [field for field in reader.fieldnames if field not in header_set]
-            header.extend(new_headers)
-
-            rows = list(reader)  # Keep this for now, but see optimization below
-            if not rows:  # skip empty files
-                continue
-
-            # More efficient identity header detection
-            if possible_identity_headers is None:
-                possible_identity_headers = set(reader.fieldnames)
-
-            # Optimized identity header filtering
-            possible_identity_headers.intersection_update(
-                f for f in reader.fieldnames
-                if
-                len({row[f] for row in rows if f in row}) == len(rows) and all(row.get(f) is not None for row in rows)
-            )
-            items.extend(rows)
-
-    if possible_identity_headers and unionize:
-        key_column = possible_identity_headers.pop()
-        # More efficient merging using defaultdict
-        merged_rows = defaultdict(dict)
-        for row in items:
-            key = row.get(key_column)
-            if key is not None:  # skip rows with null keys
-                merged_rows[key].update(row)
-        items = list(merged_rows.values())
-
-    wr = csv.DictWriter(sys.stdout, delimiter='\t', dialect='excel-tab', fieldnames=header)
-    wr.writeheader()
-    wr.writerows(items)
-
-
-if __name__ == '__main__':
-    main(*sys.argv[1:])
+    header = []
+    items = []
+    possible_identity_headers = None
+
+    for fi in files:
+        with open(
+            fi, "r", newline="", encoding="utf-8"
+        ) as table:  # Improved file opening
+            reader = csv.DictReader(table, delimiter="\t", dialect="excel-tab")
+
+            # Efficient header update using set operations
+            header_set = set(header)
+            new_headers = [
+                field for field in reader.fieldnames if field not in header_set
+            ]
+            header.extend(new_headers)
+
+            rows = list(reader)  # Keep this for now, but see optimization below
+            if not rows:  # skip empty files
+                continue
+
+            # More efficient identity header detection
+            if possible_identity_headers is None:
+                possible_identity_headers = set(reader.fieldnames)
+
+            # Optimized identity header filtering
+            possible_identity_headers.intersection_update(
+                f
+                for f in reader.fieldnames
+                if len({row[f] for row in rows if f in row}) == len(rows)
+                and all(row.get(f) is not None for row in rows)
+            )
+            items.extend(rows)
+
+    if possible_identity_headers and unionize:
+        key_column = possible_identity_headers.pop()
+        # More efficient merging using defaultdict
+        merged_rows = defaultdict(dict)
+        for row in items:
+            key = row.get(key_column)
+            if key is not None:  # skip rows with null keys
+                merged_rows[key].update(row)
+        items = list(merged_rows.values())
+
+    wr = csv.DictWriter(
+        sys.stdout, delimiter="\t", dialect="excel-tab", fieldnames=header
+    )
+    wr.writeheader()
+    wr.writerows(items)
+
+
+if __name__ == "__main__":
+    main(*sys.argv[1:])
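
The merge step above relies on defaultdict(dict): every row is folded into merged_rows[key].update(row), so rows from different files that share the same value in the detected identity column collapse into one combined record, with later rows overwriting earlier values for duplicate columns. A small standalone sketch of that merge, using a made-up "id" key column:

# Illustrative only: key-based row merging with defaultdict(dict).
from collections import defaultdict

items = [
    {"id": "1", "name": "alice"},
    {"id": "2", "name": "bob"},
    {"id": "1", "city": "Oslo"},  # same key as the first row
]
key_column = "id"

merged_rows = defaultdict(dict)
for row in items:
    key = row.get(key_column)
    if key is not None:  # skip rows with null keys
        merged_rows[key].update(row)

merged = list(merged_rows.values())
# [{'id': '1', 'name': 'alice', 'city': 'Oslo'}, {'id': '2', 'name': 'bob'}]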

test_tables_ops.py

Lines changed: 31 additions & 10 deletions

@@ -2,7 +2,6 @@
 import subprocess
 import os
 import csv
-from collections import Counter


 class TestTableOps(unittest.TestCase):
@@ -20,29 +19,42 @@ def _run_command(self, command, input_data=None):
         return process.returncode, stdout, stderr

     def _compare_tsv(self, expected_file, actual_output):
-        with open(os.path.join(self.TEST_DATA_DIR, expected_file), 'r', encoding='utf-8') as f:
-            expected_lines = list(csv.reader(f, delimiter='\t'))
-        actual_lines = list(csv.reader(actual_output.splitlines(), delimiter='\t'))
+        with open(
+            os.path.join(self.TEST_DATA_DIR, expected_file), "r", encoding="utf-8"
+        ) as f:
+            expected_lines = list(csv.reader(f, delimiter="\t"))
+        actual_lines = list(csv.reader(actual_output.splitlines(), delimiter="\t"))
         self.assertEqual(expected_lines, actual_lines)

     def test_table_union_union(self):
         returncode, stdout, stderr = self._run_command(
-            ["table-union", os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"), os.path.join(self.TEST_DATA_DIR, "loki.tsv")]
+            [
+                "table-union",
+                os.path.join(self.TEST_DATA_DIR, "dingbat.tsv"),
+                os.path.join(self.TEST_DATA_DIR, "loki.tsv"),
+            ]
         )
         self.assertEqual(returncode, 0)
         self._compare_tsv("combined.tsv", stdout)
         self.assertEqual(stderr, "")

     def test_table_union_join(self):
         returncode, stdout, stderr = self._run_command(
-            ["table-union", "--no-union", os.path.join(self.TEST_DATA_DIR, "users.tsv"), os.path.join(self.TEST_DATA_DIR, "orders.tsv")]
+            [
+                "table-union",
+                "--no-union",
+                os.path.join(self.TEST_DATA_DIR, "users.tsv"),
+                os.path.join(self.TEST_DATA_DIR, "orders.tsv"),
+            ]
        )
         self.assertEqual(returncode, 0)
         self._compare_tsv("merged_expected.tsv", stdout)
         self.assertEqual(stderr, "")

     def test_table_summarize(self):
-        returncode, stdout, stderr = self._run_command(["table-summarize", os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv")])
+        returncode, stdout, stderr = self._run_command(
+            ["table-summarize", os.path.join(self.TEST_DATA_DIR, "data_summarize.tsv")]
+        )
         self.assertEqual(returncode, 0)

         expected_summary = """Summary:
@@ -63,14 +75,23 @@ def test_table_summarize(self):

     def test_table_sort(self):
         returncode, stdout, stderr = self._run_command(
-            ["table-sort", "-k", "Age", "-k", "Name", os.path.join(self.TEST_DATA_DIR, "data_sort.tsv")]
+            [
+                "table-sort",
+                "-k",
+                "Age",
+                "-k",
+                "Name",
+                os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"),
+            ]
         )
         self.assertEqual(returncode, 0)
         self._compare_tsv("sorted_data_expected.tsv", stdout)
         self.assertEqual(stderr, "")

     def test_table_sort_pipe(self):
-        with open(os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), 'r', encoding="utf-8") as infile:
+        with open(
+            os.path.join(self.TEST_DATA_DIR, "data_sort.tsv"), "r", encoding="utf-8"
+        ) as infile:
             input_data = infile.read()
         returncode, stdout, stderr = self._run_command(
             ["table-sort", "-k", "Age", "-k", "Name"], input_data
@@ -81,4 +102,4 @@ def test_table_sort_pipe(self):


 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()
