import logging
import click
+from functools import lru_cache

logging.basicConfig(level=logging.ERROR)
VARTYPES = [
@@ -105,7 +106,11 @@ def download(self, remotefile, dir=None):
            dir = self.CIMIS_DOWNLOAD_DIR
        localfile = os.path.join(dir, str.split(remotefile, "/")[-1])
        self.ensure_dir(os.path.dirname(localfile))
-        self.sftp.get(remotefile, localfile)
+        try:
+            self.sftp.get(remotefile, localfile)
+        except Exception as ex:
+            logging.error(f"Error downloading {remotefile}: {ex}")
+            raise ex
        return localfile

    def download_zipped(self, year, hourly=True):
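The new download error handling logs and re-raises with `raise ex`. A bare `raise` inside the `except` block is the more common idiom because it leaves the original traceback untouched; a minimal sketch of that variant (the standalone `fetch` helper is hypothetical, standing in for `CIMIS.download`, and `sftp` is any SFTP-style client exposing a `.get()` method):

```python
import logging

def fetch(sftp, remotefile, localfile):
    """Hypothetical helper mirroring the new error handling in CIMIS.download."""
    try:
        sftp.get(remotefile, localfile)
    except Exception:
        logging.error(f"Error downloading {remotefile}", exc_info=True)
        raise  # a bare raise preserves the original traceback
    return localfile
```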
@@ -152,6 +157,7 @@ def download_unzipped(self, year, stations, hourly=True):
            except Exception as ex:
                logging.warning(f"Error downloading {interval} station {station}: {ex}")

+    @lru_cache(maxsize=128)
    def get_columns_for_year(self, y, hourly=True):
        if y >= 2014:
            units_file = self.download("/pub2/readme-ftp-Revised5units.txt")
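`lru_cache` on an instance method caches on the tuple `(self, args)`, so repeated calls such as `get_columns_for_year(2015)` hit the FTP site only once per process; the trade-off is that the cache keeps a reference to the instance alive for the life of the process. A small self-contained sketch of the behaviour (the `ColumnsDemo` class and its fake column list are illustrative, not part of this repo):

```python
from functools import lru_cache

class ColumnsDemo:
    def __init__(self):
        self.calls = 0  # counts how often the "download" actually runs

    @lru_cache(maxsize=128)
    def get_columns_for_year(self, y, hourly=True):
        self.calls += 1  # stands in for downloading and parsing the units file
        return [f"col_{y}_{i}" for i in range(3)]

demo = ColumnsDemo()
demo.get_columns_for_year(2015)
demo.get_columns_for_year(2015)  # served from the cache, no second call
assert demo.calls == 1
```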
@@ -234,6 +240,7 @@ def download_current_month(self, stations, hourly=True):
            except Exception as ex:
                logging.warning(f"Error downloading station {station}: {ex}")

+    @lru_cache(maxsize=128)
    def get_stations_info(
        self,
        file="/pub2/CIMIS Stations List (January20).xlsx",
@@ -443,7 +450,14 @@ def cache_to_pkl(self, dfstations):
        print(dfstations[dfstations["Station Number"].isin(failed_stations)])


-def download_all_data(hourly=True):
+def download_all_data(hourly=True, partial=False):
+    """
+    Download all CIMIS data from the FTP site. Each year is a separate file for hourly data.
+    Each month is a separate file for hourly data.
+
+    :param hourly: download hourly data (default is True)
+    :param partial: download only partial data (default is False); only downloads the last couple of years
+    """
    password = os.environ.get("CIMIS_PASSWORD", default="xxx")
    cx = CIMIS(password=password)
    if hourly:
@@ -456,9 +470,11 @@ def download_all_data(hourly=True):
    dfcat.to_csv("cimis_stations.csv", index="Station Number")
    current_year = pd.to_datetime("today").year
    active_stations = list(dfcat[dfcat["Status"] == "Active"]["Station Number"])
-    for year in range(min_year, current_year - 2):
-        print(f"Downloading zipped {interval} data for year", year)
-        cx.download_zipped(year, hourly)
+
+    if not partial:
+        for year in range(min_year, current_year - 2):
+            print(f"Downloading zipped {interval} data for year", year)
+            cx.download_zipped(year, hourly)

    for year in range(current_year - 2, current_year):
        print(f"Downloading unzipped {interval} data for year", year)
@@ -472,7 +488,7 @@ def download_all_data(hourly=True):
            dfs = cx.load_station(station, True, hourly)
            dfs.to_csv(f"cimis_{interval}_{station:03d}.csv", index="Date")
        except Exception as e:
-            logging.error(f"Error: {e}")
+            logging.error(f"Error loading station {station}: {e}")
            continue

@@ -492,7 +508,11 @@ def merge_with_existing(existing_dir, new_dir, hourly=True):
        dfn = pd.read_csv(file, index_col=0, parse_dates=True)
        if os.path.exists(existing_file):
            dfe = pd.read_csv(existing_file, index_col=0, parse_dates=True)
-            dfe.combine_first(dfn).to_csv(existing_file)
+            # Combine the two DataFrames and remove duplicate rows
+            combined = pd.concat([dfe, dfn]).drop_duplicates(
+                keep="last"
+            )  # keeps the last occurrence
+            combined.to_csv(existing_file)
        else:
            logging.warning(f"File {existing_file} does not exist so writing new file")
            dfn.to_csv(existing_file)
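One behavioural difference worth noting: `combine_first` aligned the two frames on their index, whereas `drop_duplicates(keep="last")` compares row values and ignores the index, so two rows with the same timestamp but different readings are both kept. If deduplication by timestamp is the intent, an index-based variant would look like the sketch below (the `ETo` column and dates are illustrative only):

```python
import pandas as pd

dfe = pd.DataFrame({"ETo": [0.10, 0.20]},
                   index=pd.to_datetime(["2024-01-01", "2024-01-02"]))
dfn = pd.DataFrame({"ETo": [0.25, 0.30]},
                   index=pd.to_datetime(["2024-01-02", "2024-01-03"]))

combined = pd.concat([dfe, dfn])
# dfn rows come last, so keep="last" prefers the newly downloaded values
combined = combined[~combined.index.duplicated(keep="last")].sort_index()
assert list(combined["ETo"]) == [0.10, 0.25, 0.30]
```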
@@ -503,7 +523,19 @@ def merge_with_existing(existing_dir, new_dir, hourly=True):
    "--hourly", type=bool, default=True, help="Download hourly data (default is True)"
)
@click.option("--existing_dir", default=None, help="Directory to merge new data into")
-def main(hourly, existing_dir=None):
+@click.option(
+    "--download",
+    type=bool,
+    default=True,
+    help="Download data (default is True)",
+)
+@click.option(
+    "--partial",
+    is_flag=True,
+    default=False,
+    help="Set partial download to True if provided (default is False)",
+)
+def main(hourly, existing_dir=None, download=True, partial=False):
    """
    Download CIMIS data
    --hourly: download hourly data (default is True)
@@ -512,6 +544,7 @@ def main(hourly, existing_dir=None):
    environment variable CIMIS_PASSWORD must be set to the password for the CIMIS FTP site

    """
-    download_all_data(hourly=hourly)
+    if download:
+        download_all_data(hourly=hourly, partial=partial)
    if existing_dir is not None:
        merge_with_existing(existing_dir, ".", hourly=hourly)
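With the new options, and assuming the module is invoked as `cimis.py` (the script name is not shown in this diff), a partial download would be started with `python cimis.py --hourly True --partial`, while `python cimis.py --download False --existing_dir ./old_data` would skip downloading and only merge into a hypothetical `./old_data` directory. Note the asymmetry: `--partial` is declared as a flag and is simply present or absent, whereas `--hourly` and `--download` are `type=bool` and take an explicit value such as `True` or `False`.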