diff --git a/src/clm/commands/collect_tabulated_molecules.py b/src/clm/commands/collect_tabulated_molecules.py index 3ed15db7..ff7f1d3a 100644 --- a/src/clm/commands/collect_tabulated_molecules.py +++ b/src/clm/commands/collect_tabulated_molecules.py @@ -43,18 +43,27 @@ def collect_tabulated_molecules( # `size` column denoting the frequency of occurrence of each combination. # For each unique key, select the most sampled canonical smile. df.sort_values(by=["size"], ascending=False, inplace=True) + df["ik14"] = df["inchikey"].str[:14] - # May need to later identify subset with + or - in row, - # apply sanitization step to generate cleaned smiles, inchikey + unique = ( + df.groupby("ik14", sort=False) + .agg( + size=("size", "sum"), + **{ + col: (col, "first") + for col in df.columns + if col not in ("ik14", "size") + }, + ) + .reset_index(drop=True) + ) - # Add inchikey14 and group by this instead - df["ik14"] = df["inchikey"].astype(str).str.split("-", n=1).str[0] + unique = ( + unique.sort_values("size", ascending=False, kind="stable").reset_index( + drop=True + ) + )[["inchikey", "mass", "formula", "smiles", "size"]] - unique = df.groupby(["ik14"]).first().reset_index() - unique["size"] = ( - df.groupby(["ik14"]).agg({"size": "sum"}).reset_index(drop=True) - ) - unique.drop(columns=["ik14"], inplace=True) write_to_csv_file(output_file, unique) # Known smiles are all the sampled smiles found in training set, @@ -66,11 +75,8 @@ def collect_tabulated_molecules( [read_csv_file(file, delimiter=",") for file in known_smiles], ignore_index=True, ) - unique_known = known_df.groupby(["smiles"]).first().reset_index() - unique_known["size"] = ( - known_df.groupby(["smiles"]) - .agg({"size": "sum"}) - .reset_index(drop=True) + unique_known = ( + known_df.groupby("smiles").agg(size=("size", "sum")).reset_index() ) write_to_csv_file( os.path.join( @@ -85,11 +91,8 @@ def collect_tabulated_molecules( [read_csv_file(file, delimiter=",") for file in invalid_smiles], ignore_index=True, ) - unique_invalid = invalid_df.groupby(["smiles"]).first().reset_index() - unique_invalid["size"] = ( - invalid_df.groupby(["smiles"]) - .agg({"size": "sum"}) - .reset_index(drop=True) + unique_invalid = ( + invalid_df.groupby("smiles").agg(size=("size", "sum")).reset_index() ) write_to_csv_file( os.path.join( diff --git a/src/clm/commands/tabulate_molecules.py b/src/clm/commands/tabulate_molecules.py index 70508901..207bcbe2 100644 --- a/src/clm/commands/tabulate_molecules.py +++ b/src/clm/commands/tabulate_molecules.py @@ -92,21 +92,32 @@ def tabulate_molecules(input_file, train_file, representation, output_file): # Find unique combinations of inchikey, mass, and formula, and add a # `size` column denoting the frequency of occurrence of each combination. # For each unique combination, select the largest sized canonical smile by ik14. - - freqs["ik14"] = freqs["inchikey"].astype(str).str.split("-", n=1).str[0] - unique = freqs.groupby(["ik14"]).first().reset_index() unique = ( - unique.groupby(["inchikey", "mass", "formula"]).first().reset_index() - ) - unique["size"] = ( freqs.groupby(["inchikey", "mass", "formula"]) - .size() + .agg(smiles=("smiles", "first"), size=("smiles", "count")) + .reset_index() + ) + unique["ik14"] = unique["inchikey"].str[:14] + + unique = ( + unique.sort_values("size", ascending=False, kind="stable") + .groupby("ik14", sort=False) + .agg( + size=("size", "sum"), + **{ + col: (col, "first") + for col in unique.columns + if col not in ("ik14", "size") + }, + ) .reset_index(drop=True) ) - unique = unique.sort_values( - "size", ascending=False, kind="stable" - ).reset_index(drop=True) - unique = unique.drop(columns=["ik14"]) + + unique = ( + unique.sort_values("size", ascending=False, kind="stable").reset_index( + drop=True + ) + )[["inchikey", "mass", "formula", "smiles", "size"]] write_to_csv_file(output_file, unique) # TODO: The following approach will result in multiple lines for each repeated smile