-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_data.py
More file actions
74 lines (50 loc) · 2.02 KB
/
Copy pathprocess_data.py
File metadata and controls
74 lines (50 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
"""Data processing module."""
import argparse
import os
from typing import Literal
import pandas as pd
from parse import DATA_DIR
# File name of the merged output CSV. process_results() writes it into the
# results directory and get_results_df_list() skips it when collecting inputs.
ALL_RESULTS_FILE = "_all_results.csv"
def process_results(mode: Literal["merge"] = "merge") -> None:
    """
    Merge the per-repository result CSVs into one cleaned CSV file.

    Loads the result dataframes from ``<DATA_DIR>/results`` with
    ``get_results_df_list``, concatenates them, runs ``process_default`` on
    the combined frame and writes it to ``ALL_RESULTS_FILE`` in that same
    directory (UTF-8 encoded).

    :param mode: Mode of processing the data. 'merge' to merge all results
        and save them in a single file.
    :raises ValueError: If ``mode`` is not supported, or if the results
        directory contains no CSV files to merge.
    """
    # Reject an unsupported mode before doing any file I/O.
    if mode != "merge":
        raise ValueError("Invalid mode.")

    merge_dir = os.path.join(DATA_DIR, "results")

    # Get all dataframes from the directory.
    frames = get_results_df_list(merge_dir)
    if not frames:
        # pd.concat([]) would fail with the opaque "No objects to concatenate";
        # raise the same exception type but say which directory was empty.
        raise ValueError(f"No result CSV files found in '{merge_dir}'.")

    all_results = pd.concat(frames, ignore_index=True)
    all_results = process_default(all_results)
    all_results.to_csv(os.path.join(merge_dir, ALL_RESULTS_FILE), encoding="utf-8")
def get_results_df_list(path: str) -> list[pd.DataFrame]:
    """
    Load every per-repository result CSV found directly in ``path``.

    Files not ending in ``.csv`` are ignored, and so is the merged output
    file (``ALL_RESULTS_FILE``), so a previous merge is never fed back in.
    Each returned frame gets a ``repo_name`` column holding the file name
    without its ``.csv`` suffix.

    :param path: Directory containing the result CSV files.
    :return: One dataframe per CSV file, ordered by file name.
    """
    df_list = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # merged output (and which duplicate row survives later) reproducible.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv") or file == ALL_RESULTS_FILE:
            continue
        results = pd.read_csv(os.path.join(path, file), encoding="utf-8")
        # Add repo name column to the results. Strip only the extension:
        # repository names may contain dots ("socket.io.csv" -> "socket.io").
        results["repo_name"] = file.removesuffix(".csv")
        df_list.append(results)
    return df_list
def process_default(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the default clean-up to a results dataframe.

    Steps, in order: drop the existing ``ID`` column if there is one, drop
    rows whose ``hash`` duplicates an earlier row, renumber the rows from 0
    under an index named ``ID``, and replace every ``\\r\\n`` sequence in
    ``commit_message`` with a single space.

    The dataframe is modified in place and the same object is returned.

    :param df: Results dataframe with ``hash`` and ``commit_message`` columns.
    :return: ``df`` itself, after clean-up.
    """
    # errors="ignore": an input without an "ID" column is fine, because the
    # ID index is rebuilt from scratch below.
    df.drop(columns=["ID"], inplace=True, errors="ignore")
    df.drop_duplicates(subset="hash", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.index.name = "ID"
    # regex=False: literal replacement, independent of the pandas version's
    # default for the `regex` argument.
    df["commit_message"] = df["commit_message"].str.replace("\r\n", " ", regex=False)
    return df
def main(argv: "list[str] | None" = None) -> None:
    """
    Command-line entry point for the data processing module.

    :param argv: Argument list to parse. ``None`` (the default) makes
        argparse read the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Process data.")
    parser.add_argument("--mode", "-m", choices=["merge"], help="Mode of processing the data.")
    args = parser.parse_args(argv)

    if args.mode is None:
        # Without --mode there is nothing to run; show the usage text
        # instead of exiting silently.
        parser.print_help()
        return

    if args.mode == "merge":
        process_results(mode=args.mode)
        print("Merged results.")
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()