7 changes: 7 additions & 0 deletions arccnet/cli/main.py
@@ -8,6 +8,7 @@
 from collections import ChainMap, defaultdict
 from collections.abc import Mapping
 
+import arccnet
 from arccnet import load_config
 from arccnet.models.cutouts import config as config_module
 from arccnet.models.cutouts.inference import predict
@@ -239,6 +240,7 @@ def combine_args(args=None):
 
 def main(args=None):
     combined = combine_args(args)
+    arccnet.config = combined
     command = combined.get("command")
     if command == "catalog":
         catalog_commands(combined)
@@ -248,3 +250,8 @@ def main(args=None):
         inference_commands(combined)
     else:
         raise ValueError(f"Unknown command: {command}")
+
+
+if __name__ == "__main__":
+    main()
+    sys.exit(0)
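
Why the new lines matter: main() now publishes the merged CLI options as arccnet.config, so any module that does `import arccnet` sees the runtime configuration rather than only the defaults from load_config. A minimal, self-contained sketch of that pattern; every name below (pkg, combine_args) is an illustrative stand-in, not arccnet's real API:

    # Illustrative sketch only -- "pkg" stands in for the arccnet package namespace.
    from collections import ChainMap
    from types import SimpleNamespace

    pkg = SimpleNamespace(config={"command": None})  # module-level default config

    def combine_args(args=None):
        # CLI arguments take precedence over the package defaults.
        return ChainMap(args or {}, {"command": None})

    def main(args=None):
        combined = combine_args(args)
        pkg.config = combined          # downstream code reads pkg.config at runtime
        return combined.get("command")

    assert main({"command": "catalog"}) == "catalog"
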
41 changes: 26 additions & 15 deletions arccnet/data_generation/data_manager.py
@@ -220,7 +220,24 @@ def search(self, batch_frequency: int, merge_tolerance: timedelta) -> list[Query]:
             meta_datetime = (
                 meta[["datetime"]].drop_duplicates().dropna().sort_values("datetime").reset_index(drop=True)
             )  # adding sorting here... is this going to mess something up?
 
+            if len(meta_datetime) == 0:
+                results.append(
+                    Query(
+                        QTable(
+                            names=[
+                                "target_time",
+                                "datetime",
+                                "start_time",
+                                "end_time",
+                                "record",
+                                "filename",
+                                "url",
+                                "record_T_REC",
+                            ]
+                        )
+                    )
+                )
+                continue
             # generate a mapping between target_time to datetime with the specified tolerance.
             merged_time = pd.merge_asof(
                 left=pd_query[["target_time"]],
@@ -242,18 +259,9 @@ def search(self, batch_frequency: int, merge_tolerance: timedelta) -> list[Query]:
             # which there may be multiple for cutouts at the same full-disk time, and join
             matched_rows = meta[meta["datetime"].isin(merged_time["datetime"])]
 
-            # -- Bit hacky to stop H(T)ARPNUM becoming a float
-            # I think Shane may have found a better way to deal with this?
-            # Convert int64 columns to Int64
-            int64_columns = matched_rows.select_dtypes(include=["int64"]).columns
-            # Create a new DataFrame with Int64 data types
-            new_df = matched_rows.copy()
-            for col in int64_columns:
-                new_df[col] = matched_rows[col].astype("Int64")
-
-            # merged_time <- this is the times that match between the query and output
-            # new_df / matched_rows are the rows in the output at the same time as the query
-            merged_df = pd.merge(merged_time, new_df, on="datetime", how="left")
+            # matched_rows are the rows in the output at the same time as the query
+            merged_df = pd.merge(merged_time, matched_rows, on="datetime", how="left")
             # I hope this isn't nonsense, and keeps the `urls` as a masked column
             # how does this work with sharps/smarps where same datetime for multiple rows
 
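
How the two new steps fit together: the guard appends an empty Query (a zero-row QTable whose columns still exist) and skips the batch when no metadata rows survive de-duplication; otherwise pd.merge_asof maps each requested target_time to the nearest observed datetime within merge_tolerance. A runnable sketch with made-up data follows; the diff truncates the real pd.merge_asof call, so the direction and tolerance arguments shown here are assumptions:

    import pandas as pd
    from datetime import timedelta
    from astropy.table import QTable

    # Empty-result guard: QTable(names=[...]) builds a zero-row table whose
    # columns still exist, so downstream code can index them safely.
    empty = QTable(names=["target_time", "datetime", "url"])
    assert len(empty) == 0

    pd_query = pd.DataFrame({"target_time": pd.to_datetime(["2014-01-01 00:00", "2014-01-01 06:00"])})
    meta_datetime = pd.DataFrame({"datetime": pd.to_datetime(["2014-01-01 00:01", "2014-01-01 12:00"])})

    # Both sides must be sorted on their keys (hence the sort_values above).
    merged_time = pd.merge_asof(
        left=pd_query[["target_time"]],
        right=meta_datetime,
        left_on="target_time",
        right_on="datetime",
        direction="nearest",      # assumption: the actual call is not shown in full
        tolerance=timedelta(hours=1),
    )
    # The 06:00 target has no observation within tolerance, so its datetime is
    # NaT; the later how="left" merge then yields a row with missing values.
    print(merged_time)
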
@@ -305,9 +313,12 @@ def download(self, query_list: list[Query], path: Path = None, overwrite: bool =
 
             # !TODO a way of retrying missing would be good, but JSOC URLs are temporary.
             if new_query is not None:
-                downloaded_files = self._download(
-                    data_list=new_query[~new_query["url"].mask]["url"].data.data, path=path, overwrite=overwrite
-                )
+                if len(new_query) > 0:
+                    downloaded_files = self._download(
+                        data_list=new_query[~new_query.mask["url"]]["url"].data, path=path, overwrite=overwrite
+                    )
+                else:
+                    downloaded_files = []
                 results = self._match(results, downloaded_files)  # should return a results object.
             else:
                 raise NotImplementedError("new_query is none.")
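
Two things change in the download path: an explicit guard for a zero-length table, and how unmasked URLs are selected. On an astropy masked table, t.mask is itself a table of booleans, so t.mask["url"] is the mask for the url column; after filtering out masked rows, a single .data suffices (the old .data.data first fetched the numpy masked array, then unwrapped it to a plain ndarray). A toy sketch of the selection, assuming new_query behaves like an astropy masked Table as the diff suggests:

    # Toy stand-in for new_query; data values are made up.
    from astropy.table import Table

    t = Table({"url": ["http://a", "http://b", "missing"]}, masked=True)
    t["url"].mask = [False, False, True]

    # t.mask is a boolean table, so t.mask["url"] is the url column's mask;
    # ~mask keeps only rows where a URL is actually present.
    present = t[~t.mask["url"]]

    # .data on a MaskedColumn is a numpy masked array; the old .data.data
    # unwrapped it further, which is redundant once masked rows are filtered.
    urls = present["url"].data
    print(list(urls))  # ['http://a', 'http://b']
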