Skip to content

Commit

Permalink
Switched USGS to json and refactored quarantine
Browse files Browse the repository at this point in the history
  • Loading branch information
water-e authored and water-e committed Sep 27, 2024
1 parent f77d0d7 commit ca4ae7f
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 10 deletions.
11 changes: 5 additions & 6 deletions dms_datastore/download_nwis.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ def download_station(
yearname = (
f"{start.year}_{endfile}" # if start.year != end.year else f"{start.year}"
)
outfname = f"usgs_{station}_{agency_id}_{paramname}_{yearname}.rdb"
outfname = f"usgs_{station}_{agency_id}_{paramname}_{yearname}.json"
# Water quality data; does not work in command line.
if str(paramname).startswith("qual"):
outfname = f"usgs_{station}_{agency_id}_{paramname}_{param}_{yearname}.rdb"
outfname = f"usgs_{station}_{agency_id}_{paramname}_{param}_{yearname}.json"
outfname = outfname.lower()
path = os.path.join(dest_dir, outfname)
if os.path.exists(path) and not overwrite:
Expand All @@ -98,20 +98,20 @@ def download_station(
stime = start.strftime("%Y-%m-%d")
etime = end.strftime("%Y-%m-%d")
found = False
station_query_base = f"http://nwis.waterservices.usgs.gov/nwis/iv/?sites={agency_id}&startDT={stime}&endDT={etime}&format=rdb"
station_query_base = f"http://nwis.waterservices.usgs.gov/nwis/iv/?sites={agency_id}&startDT={stime}&endDT={etime}&format=json"
if param:
station_query = station_query_base + f"&variable={int(param):05}"
# station_query = station_query_base % (station,stime,etime,param)
else:
station_query = station_query_base
# Water quality data; does not work in command line.
if str(paramname).startswith("qual"):
station_query_base = f"https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no={agency_id}&begin_date={stime}&end_date={etime}&format=serial_rdb"
station_query_base = f"https://nwis.waterdata.usgs.gov/nwis/qwdata?site_no={agency_id}&begin_date={stime}&end_date={etime}&format=json"
if param:
station_query = station_query_base + f"&parameter_cd={int(param):05}"
else:
station_query = station_query_base
logger.info(station_query)
logger.info(f"USGS Query for ({station},{paramname}): {station_query}")
try:
if sys.version_info[0] == 2:
raise ValueError("Python 2 no longer supported")
Expand Down Expand Up @@ -142,7 +142,6 @@ def nwis_download(stations, dest_dir, start, end=None, param=None, overwrite=Fal
These dates are passed on to CDEC ... actual return dates can be
slightly different
"""
logger.info(stations)
if end is None:
end = dt.datetime.now()
endfile = 9999
Expand Down
21 changes: 18 additions & 3 deletions dms_datastore/populate_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@
"cdec": cdec_download,
}

def _quarantine_file(fname, quarantine_dir="quarantine"):
    """Copy a problematic file into a quarantine directory.

    The original file is left in place; only a copy is made.

    Parameters
    ----------
    fname : str
        Path of the file to copy.

    quarantine_dir : str
        Directory that receives the copy. It is created (including any
        missing parent directories) when it does not exist yet.
    """
    # Use the caller-supplied directory for both creation and copy so a
    # non-default quarantine_dir is actually honored. exist_ok=True avoids
    # a separate existence check followed by a racy create.
    os.makedirs(quarantine_dir, exist_ok=True)
    shutil.copy(fname, quarantine_dir)


def revise_filename_syears(pat, force=True, outfile="rename.txt"):
"""Revise start year of files matching pat to the first year of valid data
Expand Down Expand Up @@ -135,6 +140,7 @@ def revise_filename_syear_eyear(pat, force=True, outfile="rename.txt"):
Name of file to log failures
"""
return
if SAFEGUARD:
raise NotImplementedError("populate repo functions not ready to use")
logger.info(f"Beginning revise_filename_syear_eyear for pattern: {pat}")
Expand All @@ -157,9 +163,7 @@ def revise_filename_syear_eyear(pat, force=True, outfile="rename.txt"):
bad.append(fname + " (small,deleted)")
logger.info(f"Small file {fname} caused read exception. Deleted during rename")
else:
if not os.path.exists("quarantine"):
os.makedirs("quarantine")
shutil.copy(fname,"quarantine")
quarantine_file(fname,"quarantine")
bad.append(fname + " (not small, not deleted)")
logger.info(f"non-small file {fname} caused read exception. Not deleted during rename")
continue
Expand Down Expand Up @@ -397,16 +401,27 @@ def populate(dest, all_agencies=None, varlist=None, partial_update=False):
# It looks like big files, and this is possible, but many will be truncated because of limited
# instrument lifetimes ... so 1980-2019 will come out as 1984-2007 or something like that.
if agency == "dwr_des":

for var in varlist:
logger.info(
f"Calling populate_repo with agency {agency} variable: {var}"
)
if not partial_update:
# Pulls in data in two 20 year blocks, which helps with query length limits
# Pulls in data in two 20 year blocks, which helps with query length limits
populate_repo(
agency,
var,
dest,
pd.Timestamp(1980, 1, 1),
pd.Timestamp(1999, 12, 31, 23, 59),
ignore_existing=ignore_existing,
)
populate_repo(
agency,
var,
dest,
pd.Timestamp(2000, 1, 1),
pd.Timestamp(2019, 12, 31, 23, 59),
ignore_existing=ignore_existing,
)
Expand Down
4 changes: 4 additions & 0 deletions dms_datastore/quarantine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import os
import os.path
import shutil

12 changes: 11 additions & 1 deletion dms_datastore/usgs_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@
from dms_datastore.write_ts import *
from dms_datastore.filename import interpret_fname,meta_to_filename

def _quarantine_file(fname, quarantine_dir="quarantine"):
    """Set aside a copy of a file that could not be processed.

    The source file itself is not moved or deleted.

    Parameters
    ----------
    fname : str
        Path of the file to copy.

    quarantine_dir : str
        Destination directory for the copy. Created, along with any
        missing parents, if it is not already present.
    """
    # Create and copy into the requested directory rather than a fixed
    # "quarantine" literal; exist_ok=True makes repeated calls harmless.
    os.makedirs(quarantine_dir, exist_ok=True)
    shutil.copy(fname, quarantine_dir)


def usgs_scan_series(fname):
""" Scans file and returns a list of time series, parameters and description for each series in the file
Expand Down Expand Up @@ -99,7 +105,11 @@ def usgs_multivariate(pat,outfile):
if ts.shape[1] != 1:
message = f"usgs_meta: file {fname} Columns {ts.columns}"
logger.debug(message)
series = usgs_scan_series(fname) # Extract list of series in file
try:
series = usgs_scan_series(fname) # Extract list of series in file
except:
_quarantine_file(fname)
logger.debug(f"Could not scan USGS file for variables: {fname}")
for s in series:
(ats_id,aparam,adescr) = s
out.write(message+"\n")
Expand Down

0 comments on commit ca4ae7f

Please sign in to comment.