###############################################################################
# Program  : wah_extract_functions.py
# Author   : Peter Uhe, based on original scripts by Neil Massey
# Modified : Sihan Li, to include vertically integrated variables & snapshots
# Date     : 09/09/16
# Purpose  : Functions to unpack w@h zip files and extract the
#            requested fields into separate netCDF files
###############################################################################
import sys, os, ast
import urllib, tempfile, zipfile, shutil
import numpy, numpy.ma
import math
import gzip
import glob
#from netcdf_file import netcdf_file # Note netcdf_file.py doesn't allow appending
#from scipy.io.netcdf import netcdf_file # Note Append option requires at least scipy-0.15.0
from netCDF4 import Dataset as netcdf_file # Note Can use netCDF4 for append option instead
from conv_rot_grid import rot2glob, glob2rot
import string
from datetime import datetime,timedelta
import numpy as np
#############################################################################
def um_datecode(syear,smon,zip_num):
    months=['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
    month_str=months[(smon+zip_num-1)%12]
    year=syear+(smon+zip_num-1)/12
    decade=(year-1800)/10
    year_code=str(year%10)
    if decade<10: dec_code=str(decade) # decade is 0 to 9
    else: dec_code=chr(decade-10+ord('a')) # decade needs to be converted to a char
    return dec_code+year_code+month_str
##############################################################################
# function to produce a timestamp from a given um datecode
# assumes codes of the form e.g. l5apr -> 2015-04
def um_to_timestamp(um_datecode):
    mon_map={'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}
    dec_code=um_datecode[0]
    try: dec=int(dec_code) # decade is 0 to 9
    except ValueError: dec=ord(dec_code)-ord('a')+10 # decade is a-z
    year=int(um_datecode[1])
    try:
        mon=mon_map[um_datecode[2:]]
    except KeyError:
        # Assume a datecode ending in '10' is a yearly file
        if um_datecode[3:]=='10':
            mon='yr'
        else:
            raise Exception('Unrecognised um datecode: '+um_datecode)
    return str(1800+dec*10+year)+'-'+str(mon).zfill(2)
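
# Illustrative check (not part of the original script): round trip between the
# two datecode helpers, assuming a run starting January 2015 whose third
# monthly zip covers April 2015.
def _example_um_datecodes():
    code = um_datecode(2015, 1, 3)
    assert code == 'l5apr'
    assert um_to_timestamp(code) == '2015-04'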
###############################################################################
def get_coord_string(coord_list):
    coord_string = ""
    for p in range(0,4):
        V = str(coord_list[p])
        if (p == 1 or p == 3):
            if V[0] == "-":
                coord_string += "S" + V[1:] + "_"
            else:
                coord_string += "N" + V[0:] + "_"
        else:
            if V[0] == "-":
                coord_string += "W" + V[1:] + "_"
            else:
                coord_string += "E" + V[0:] + "_"
    return coord_string[:-1]
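
# Illustrative check (not part of the original script): a region list is
# ordered [lon_l, lat_l, lon_r, lat_r], so a box from 10W,35N to 5E,60N becomes:
def _example_get_coord_string():
    assert get_coord_string([-10, 35, 5, 60]) == 'W10_N35_E5_N60'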
# Take the time frequency of the data in hours and output a human readable string
def time_freq_friendly(time_freq):
    if int(time_freq)==24:
        return 'daily'
    elif int(time_freq) == 720:
        return 'monthly'
    else:
        return str(time_freq)+'hrly'
###############################################################################
def get_output_field_name(field):
    fname = field[1]
    if field[2] != []:
        coord_string = get_coord_string(field[2])
        fname += "_" + coord_string
    if field[3] != 'all':
        fname += "_" + field[3]
    return fname

def get_output_field_name2(field):
    time_freq=time_freq_friendly(field[6])
    cell_method=field[7]
    fname = 'item'+str(field[1])
    fname += '_'+time_freq
    # if cell_method != 'mean':
    fname += '_' + cell_method
    if field[2] != []:
        coord_string = get_coord_string(field[2])
        fname += "_" + coord_string
    if field[3] != 'all':
        fname += "_" + field[3]
    return fname

# Just use the item number, time freq and meaning (plus any spatial processing if necessary)
def get_output_field_name3(field):
    time_freq=time_freq_friendly(field[6])
    cell_method=field[7]
    fname = 'item'+str(field[1])
    fname += '_'+time_freq
    fname += '_' + cell_method
    if field[3] != 'all':
        fname += "_" + field[3]
    return fname
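
# Illustrative note (inferred from the indexing used throughout this script;
# the values below are hypothetical): a field entry is laid out as
#   [file_stream, item_code, [lon_l, lat_l, lon_r, lat_r], process,
#    valid_min, valid_max, meaning_period_hrs, cell_method, vert_lev]
def _example_field_name():
    field = ['ga.pe', 3236, [-10, 35, 5, 60], 'all', 0.0, 400.0, 24, 'mean', '']
    # get_output_field_name3 ignores the region, so this gives:
    assert get_output_field_name3(field) == 'item3236_daily_mean'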
###############################################################################
def get_filename(taskpath, field, output_dir, zipstart, zipend, structure='std', zip_freq='month'):
    # Different components used in file name and path
    boinc=taskpath.split('/')[-1]
    # When splitting up the file name by underscore,
    # hadcm3 apps have one less component (e.g. compared to wah2_eu50)
    if boinc[:6] in ['hadcm3','hadam4']:
        stream_map={'ma':'atmos','ga':'atmos','ka':'atmos','ko':'ocean'}
        umid=boinc.split('_')[1]
        datecode=boinc.split('_')[2]
    else:
        stream_map={'ma':'atmos','ga':'region','ka':'atmos','ko':'ocean'}
        umid=boinc.split('_')[2]
        datecode=boinc.split('_')[3]
    # Get start year and month (old filenames don't contain the start month so assume 12)
    syear=int(datecode[:4])
    try:
        smon=int(datecode[4:6])
    except ValueError:
        smon = 12
    # Work out start and end date of output file
    datestart=datetime(int(syear),int(smon),1)
    if zip_freq=='month':
        date1 = add_months(datestart,zipstart-1)
        date2 = add_months(datestart,zipend-1)
    elif zip_freq=='year':
        date1=datestart.replace(year=int(syear)+zipstart-1)
        datetmp=add_months(date1,11) # First add 11 months for yearly output
        date2=datetmp.replace(year=datetmp.year+zipend-zipstart) # Then add the number of years
    else:
        raise Exception('Error, zip_freq must be month or year')
    date_range = str(date1.year)+'-'+str(date1.month).zfill(2)+'_'+str(date2.year)+'-'+str(date2.month).zfill(2)
    # Other components
    time_freq=time_freq_friendly(field[6])
    cell_method=field[7]
    varname = get_output_field_name3(field)
    model = stream_map[field[0][:2]]
    if field[2]!=[]:
        coord_string = '_'+ get_coord_string(field[2])
    else:
        coord_string = ''
    # construct file name
    if structure=='std':
        fname = os.path.join(output_dir,model+coord_string,varname,varname+'_'+umid+'_'+date_range+'.nc')
    elif structure=='startdate-dir':
        fname = os.path.join(output_dir,model+coord_string,varname,datecode,varname+'_'+umid+'_'+date_range+'.nc')
    else:
        raise Exception('Error, unknown directory structure: '+structure)
    return fname
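
# Illustrative check (not part of the original script; the task name, field
# values and output directory are hypothetical):
def _example_get_filename():
    field = ['ga.pe', 3236, [], 'all', 0.0, 400.0, 24, 'mean', '']
    fname = get_filename('wah2_eu50_a001_201512_1_123', field, '/out', 1, 1)
    assert fname == '/out/region/item3236_daily_mean/item3236_daily_mean_a001_2015-12_2015-12.nc'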
###############################################################################
def check_files_exist(taskpath, field_list, output_dir, zipstart, zipend, structure, zip_freq):
    # Loop over fields / variables
    for field in field_list:
        # If the file doesn't exist, return False
        fname=get_filename(taskpath,field,output_dir,zipstart,zipend,structure=structure,zip_freq=zip_freq)
        if not os.path.exists(fname):
            return False
    # We got to the end, all files must exist
    return True
###############################################################################
def get_idx(point, array):
    D = array[1] - array[0]
    S = array[0]
    I = int(0.5+(point - S) / D)
    return I
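
# Illustrative check (not part of the original script): get_idx rounds to the
# nearest index of an evenly spaced coordinate array.
def _example_get_idx():
    lons = [0.0, 0.5, 1.0, 1.5]
    assert get_idx(1.1, lons) == 2   # 1.1 is nearest to lons[2]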
###############################################################################
def subset_dimensions(in_dimensions, field, plon, plat):
    fs = field[2]
    # fs has elements [lon_l, lat_l, lon_r, lat_r]
    out_dims = []
    subset_dims = []
    # check whether we need to translate from global to rotated grid
    if plon == 0.0 and plat == 90.0:
        if fs != []:
            rlon_s = fs[0]
            rlon_e = fs[2]
            rlat_s = fs[1]
            rlat_e = fs[3]
    else:
        if fs != []:
            rlon_s, rlat_s = glob2rot(fs[0], fs[1], plon, plat)
            rlon_e, rlat_e = glob2rot(fs[2], fs[3], plon, plat)
    # get the lon / lat indices
    remap_data = False
    for d in in_dimensions:
        if "longitude" in d[0]:
            if fs == []:
                lon_idx_s = 0
                lon_idx_e = d[1].shape[0]
            else:
                lon_idx_s = get_idx(rlon_s, d[1])
                lon_idx_e = get_idx(rlon_e, d[1]) + 1
                print 'lon i',lon_idx_s,lon_idx_e
            if lon_idx_s < 0:
                remap_data = True
            if lon_idx_e > d[1].shape[0]:
                remap_data = True
            # if the data needs remapping
            if remap_data:
                d_len = d[1].shape[0]                    # get the length
                d_len_d2 = d_len / 2                     # get the length div 2
                lon_idx_s += d_len_d2                    # remap the start index
                lon_idx_e += d_len_d2                    # remap the end index
                d_new = numpy.zeros([d_len], 'f')        # create a new dimension variable
                d_new[0:d_len_d2] = d[1][d_len_d2:d_len] # copy the right hand half to the left hand
                d_new[d_len_d2:d_len] = d[1][0:d_len_d2] # copy the left hand half to the right hand
                d[1] = d_new                             # reassign
            # if processing the data then do not append this dimension to the output dimensions
            if field[3] == "all":
                out_dims.append([d[0], d[1][lon_idx_s:lon_idx_e]])
            subset_dims.append([d[0], d[1][lon_idx_s:lon_idx_e]])
        elif "latitude" in d[0]:
            if fs == []:
                lat_idx_s = 0
                lat_idx_e = d[1].shape[0]
            else:
                lat_idx_s = get_idx(rlat_s, d[1])
                lat_idx_e = get_idx(rlat_e, d[1]) + 1
            if lat_idx_e < lat_idx_s:
                temp = lat_idx_e
                lat_idx_e = lat_idx_s
                lat_idx_s = temp
            if lat_idx_s < 0:
                lat_idx_s = 0
            if lat_idx_e > d[1].shape[0]:
                lat_idx_e = d[1].shape[0]
            # if processing the data then do not append this dimension
            if field[3] == "all":
                out_dims.append([d[0], d[1][lat_idx_s:lat_idx_e]])
            subset_dims.append([d[0], d[1][lat_idx_s:lat_idx_e]])
        else:
            out_dims.append([d[0], d[1]])
            subset_dims.append([d[0], d[1]])
    # if processing the data then we need a "pt" dimension
    if field[3] != "all":
        out_dims.append(["pt", numpy.array([1.0],dtype=numpy.float32)])
    return out_dims, subset_dims, [lon_idx_s, lat_idx_s, lon_idx_e, lat_idx_e], remap_data
###############################################################################
def calc_grid_box_area(lon, lat, lon_d, lat_d, plon, plat):
    # calculate the area of a grid box. This is only an approximation as it
    # assumes the grid box is square, rather than the polygonal shape the
    # rotated boxes actually are.
    lon1_rad, lat1_rad = rot2glob(lon-lon_d, lat+lat_d, plon, plat)
    lon2_rad, lat2_rad = rot2glob(lon+lon_d, lat-lat_d, plon, plat)
    area = math.fabs(lon2_rad-lon1_rad) *\
           math.fabs(math.sin(lat2_rad) - math.sin(lat1_rad))
    return area
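
# Note (added for clarity): on the unit sphere the area of a lon-lat rectangle
# is exactly |lon2-lon1| * |sin(lat2)-sin(lat1)| with angles in radians, which
# is the expression used above. The approximation lies in treating the rotated
# grid box as such a rectangle after converting its corners to global coordinates.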
###############################################################################
def aamean(var_ma_data, plon, plat, subset_dims):
    # calculate the weights
    if plon == 0.0 and plat == 90.0:
        # just a regular grid so the weights are simply the cosine of the latitude
        for dim in subset_dims:
            if "latitude" in dim[0]:
                weights = numpy.cos(numpy.radians(dim[1]))
        lon_means = numpy.ma.mean(var_ma_data, axis=3)
        means = numpy.ma.average(lon_means, weights=weights, axis=2)
    else:
        # identify the latitude and longitude dimensions and get the coordinates
        for dim in subset_dims:
            if "latitude" in dim[0]:
                lats = dim[1]
            if "longitude" in dim[0]:
                lons = dim[1]
        # calculate the weights
        weights = numpy.zeros([lats.shape[0], lons.shape[0]], 'f')
        lon_d = 0.5 * (lons[1] - lons[0])
        lat_d = 0.5 * (lats[1] - lats[0])
        for j in range(0, lats.shape[0]):
            for i in range(0, lons.shape[0]):
                # weights are the area of the grid box
                weights[j,i] = calc_grid_box_area(lons[i], lats[j], lon_d, lat_d, plon, plat)
        # multiply through each timestep / level by the weights
        weighted_data = var_ma_data[:,:] * weights
        # calculate the means - sums first
        means = numpy.ma.sum(numpy.ma.sum(weighted_data, axis=3), axis=2)
        # calculate the means
        for t in range(0, var_ma_data.shape[0]):
            for z in range(0, var_ma_data.shape[1]):
                # only want to use the sum of the weights of the non missing values
                weights_masked = numpy.ma.masked_where(var_ma_data[t,z].mask, weights)
                # now calculate the sum of the weights
                sum_weights = weights_masked.sum()
                # Normalise the means by the sum of weights
                means[t,z] = means[t,z] / sum_weights
    return means
###############################################################################
def process_data(var_in_data, process, mv, plon, plat, subset_dims, valid_min, valid_max):
    # mask array before processing
    # mask is either the missing value or where the data is outside the
    # valid range
    var_ma_data = numpy.ma.masked_where((var_in_data == mv) | \
                                        (var_in_data < valid_min) | \
                                        (var_in_data > valid_max), var_in_data)
    # do the various processes based on what has been requested
    if process == "min":
        out_data = numpy.ma.min(numpy.ma.min(var_ma_data, axis=2), axis=2)
    elif process == "max":
        out_data = numpy.ma.max(numpy.ma.max(var_ma_data, axis=2), axis=2)
    elif process == "mean":
        out_data = aamean(var_ma_data, plon, plat, subset_dims)
    elif process == "sum":
        out_data = numpy.ma.sum(numpy.ma.sum(var_ma_data, axis=2), axis=2)
    if len(out_data.shape) != len(var_in_data.shape)-1:
        out_data = out_data.reshape(out_data.shape[0], out_data.shape[1], 1)
    return out_data
###############################################################################
# Process data: assumes var_in_data is already a masked array
def process_data2(var_in_data, process, plon, plat, subset_dims):
    # do the various processes based on what has been requested
    if process == "min":
        out_data = numpy.ma.min(numpy.ma.min(var_in_data, axis=2), axis=2)
    elif process == "max":
        out_data = numpy.ma.max(numpy.ma.max(var_in_data, axis=2), axis=2)
    elif process == "mean":
        out_data = aamean(var_in_data, plon, plat, subset_dims)
    elif process == "sum":
        out_data = numpy.ma.sum(numpy.ma.sum(var_in_data, axis=2), axis=2)
    if len(out_data.shape) != len(var_in_data.shape)-1:
        out_data = out_data.reshape(out_data.shape[0], out_data.shape[1], 1)
    return out_data
###############################################################################
def get_missing_value(nc_in_file):
    try:
        mv = getattr(nc_in_file,"missing_value")
    except AttributeError:
        mv = getattr(nc_in_file,"_FillValue")
    return mv
###############################################################################
def get_rotated_pole(nc_in_file, nc_in_var):
    # get the rotated pole longitude / latitude (for calculating weights)
    try:
        grid_map_name = getattr(nc_in_var,"grid_mapping")
        grid_map_var = nc_in_file.variables[grid_map_name]
        plon = getattr(grid_map_var,"grid_north_pole_longitude")
        plat = getattr(grid_map_var,"grid_north_pole_latitude")
    except (AttributeError, KeyError):
        # no grid mapping: assume a regular (unrotated) grid
        plon = 0.0
        plat = 90.0
    return plon, plat
##############################################################################
# Function select_var
# searches for a variable name in the netcdf file that matches the stash code,
# meaning period and time cell method
# Stash code is stash_section*1000 + stash_item
# meaning period is in units of hours (e.g. 24 is daily data, 720 is monthly data)
# cell method is e.g. 'mean', 'min', 'max'
#
def select_var(nc_file, stash_code, meaning_period, cell_method):
    for varname,var in nc_file.variables.iteritems():
        # check whether this variable carries stash attributes
        try:
            var_stash_code=int(var.stash_section)*1000+int(var.stash_item)
        except AttributeError:
            # Not a stash item
            continue
        if int(stash_code)==var_stash_code:
            # Get cell method
            tmp=var.cell_method
            dim,var_cell_method=tmp.split()
            if not dim=='time:':
                raise Exception('Error, expecting cell method for time, got: '+dim)
            # Get meaning period from time dimension
            time_dim=nc_file.variables[var.dimensions[0]]
            if 'meaning_period' in time_dim.ncattrs():
                var_meaning_period,units=time_dim.meaning_period.split()
                if not units=='hours':
                    raise Exception('Error, expecting meaning period in hours, got: '+units)
            else: # If there is no meaning period, just use the expected value
                var_meaning_period = meaning_period
            if int(meaning_period)==int(var_meaning_period) and cell_method==var_cell_method:
                return varname
        else:
            continue
    # Haven't found the correct variable:
    raise Exception('Error, could not find variable: stash code='+str(stash_code)+' with meaning period '+str(meaning_period)+' and cell_method '+cell_method)
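
# Illustrative usage (not part of the original script; the file handle and
# request values are hypothetical): stash section 3, item 236 requested as a
# daily mean would be looked up with
#   varname = select_var(nc_file, 3*1000 + 236, 24, 'mean')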
##############################################################################
# Function select_vars_stash
# searches for variable names in the netcdf file that match the stash code,
# meaning period, time cell method and vertical level
# Stash code is stash_section*1000 + stash_item
# meaning period is in units of hours (e.g. 24 is daily data, 720 is monthly data)
# cell method is e.g. 'mean', 'min', 'max'
#
def select_vars_stash(nc_file, stash_code, meaning_period, cell_method, vert_lev):
    varnames=[]
    if vert_lev=='':
        vert_lev='any'
    for varname,var in nc_file.variables.iteritems():
        # check whether this variable carries stash attributes
        try:
            var_stash_code=int(var.stash_section)*1000+int(var.stash_item)
        except AttributeError:
            # Not a stash item
            continue
        # Check for matching stash code
        if int(stash_code)==var_stash_code:
            # Get cell method
            if 'cell_method' in var.ncattrs():
                tmp = var.cell_method.split()
                dim = tmp[0]
                var_cell_method = tmp[1]
                # Debug statement
                #print var.cell_method
                if not dim=='time:':
                    raise Exception('Error, expecting cell method for time, got: '+dim)
            else: # no cell method
                var_cell_method='inst'
            # Get meaning period from time dimension
            dims=var.dimensions
            time_dim=nc_file.variables[dims[0]]
            if 'meaning_period' in time_dim.ncattrs():
                var_meaning_period,units=time_dim.meaning_period.split()
                if not units=='hours':
                    raise Exception('Error, expecting meaning period in hours, got: '+units)
            else: # If there is no meaning period to check, just set the expected value
                var_meaning_period = meaning_period
            # Check if this variable matches the required meaning period and cell method
            if int(meaning_period)==int(var_meaning_period) and cell_method==var_cell_method:
                if vert_lev=='any': # Any level
                    varnames.append(varname)
                elif vert_lev==dims[1]: # vert_lev (field[8]) matches the vertical level dimension
                    varnames.append(varname)
        else:
            continue
    # Haven't found the correct variable:
    if varnames==[]:
        raise Exception('Error, could not find variable: stash code='+str(stash_code)+' with meaning period '+str(meaning_period)+' and cell_method '+cell_method+' and vert_lev '+vert_lev)
    else:
        return varnames
##############################################################################
# Function select_vars_field
# searches for variable names in the netcdf file that match the field code,
# meaning period and time cell method
# meaning period is in units of hours (e.g. 24 is daily data, 720 is monthly data)
# cell method is e.g. 'mean', 'min', 'max'
#
def select_vars_field(nc_file, field_name, meaning_period, cell_method, vert_lev):
    varnames=[]
    if vert_lev=='':
        vert_lev='any'
    for varname,var in nc_file.variables.iteritems():
        # look at fields matching field_name e.g. field16, field16_1 etc.
        if varname[:len(field_name)]==field_name:
            # Get cell method
            tmp=var.cell_method
            dim,var_cell_method=tmp.split()
            if not dim=='time:':
                raise Exception('Error, expecting cell method for time, got: '+dim)
            # Get meaning period from time dimension
            time_dim=nc_file.variables[var.dimensions[0]]
            tmp=time_dim.meaning_period
            var_meaning_period,units=tmp.split()
            if not units=='hours':
                raise Exception('Error, expecting meaning period in hours, got: '+units)
            # Check if this variable matches the required meaning period and cell method
            if int(meaning_period)==int(var_meaning_period) and cell_method==var_cell_method:
                if vert_lev=='any': # Any level
                    varnames.append(varname)
                elif vert_lev==var.dimensions[1]: # check vert_lev matches the vertical level in the file
                    varnames.append(varname)
        else:
            continue
    # Haven't found the correct variable:
    if varnames==[]:
        raise Exception('Error, could not find variable: field name='+field_name+' with meaning period '+str(meaning_period)+' and cell_method '+cell_method+' and vert_lev '+vert_lev)
    else:
        return varnames
###############################################################################
# main function to read in raw netcdf files and write to single variable files
def process_netcdf(in_ncf, out_name, field, append, zip_freq='month'):
    try:
        item_code = field[1]
        process = field[3]
        v_min = field[4]
        v_max = field[5]
        meaning_period = field[6]
        cell_method = field[7]
        vert_lev = field[8]
        in_ncf_end=os.path.basename(in_ncf)
        umid,datestamp=in_ncf_end[:-3].split(field[0])
        # open the input netCDF file
        nc_in_file = netcdf_file(in_ncf,'r')
        # get the variable from the input (choose first variable from possible variables)
        in_vars = select_vars_stash(nc_in_file,item_code,meaning_period,cell_method,vert_lev)
        in_vars.sort()
        print in_vars,
        if len(in_vars)>1:
            print 'Warning, multiple variables found matching request, choosing first variable',in_ncf_end,
        in_var=in_vars[0]
        nc_in_var = nc_in_file.variables[in_var]
        in_dimensions = []
        # Get the dimensions from the input netcdf
        for d in nc_in_var.dimensions:
            # get the input dimension and the data
            dim_in_var = nc_in_file.variables[d]
            dim_in_data = dim_in_var[:]
            in_dimensions.append([d, dim_in_data])
        # get the rotated pole definition
        plon, plat = get_rotated_pole(nc_in_file,nc_in_var)
        # subset the dimensions to create the out_dimensions
        out_dims, subset_dims, lon_lat_idxs, remap_data = subset_dimensions(in_dimensions, field, plon, plat)
        # if the longitude and latitude indexes are < 0 then we need to remap the data so that 0deg is
        # in the middle of the field, not at the beginning
        if remap_data:
            in_data = nc_in_var[:,:,lon_lat_idxs[1]:lon_lat_idxs[3],:]     # get the input data - can subset latitude early
            new_data = numpy.ma.zeros(in_data.shape, 'f')                  # create a new store
            d_len = in_data.shape[3]                                       # get the longitude length
            d_len_d2 = d_len / 2                                           # lon length div 2
            new_data[:,:,:,0:d_len_d2] = in_data[:,:,:,d_len_d2:d_len]     # copy right hand half to left hand
            new_data[:,:,:,d_len_d2:d_len] = in_data[:,:,:,0:d_len_d2]     # copy left hand half to right hand
            var_out_data = new_data[:,:,:,lon_lat_idxs[0]:lon_lat_idxs[2]] # get the subset data
        else:
            var_out_data = nc_in_var[:,:,lon_lat_idxs[1]:lon_lat_idxs[3], lon_lat_idxs[0]:lon_lat_idxs[2]]
        # Check we have the correct number of time values:
        # Note this assumes monthly or yearly output data (360 day calendar)
        ntime=nc_in_var.shape[0]
        if zip_freq=='month':
            factor = 720.  # hours in a month
        elif zip_freq=='year':
            factor = 8640. # hours in a year
        else:
            raise Exception('Error, zip_freq must be month or year')
        if ntime != factor / (meaning_period*1.0):
            raise Exception('Data has wrong number of times ('+str(ntime)+') for meaning period: '+str(meaning_period))
        # Check data is within range
        mv = get_missing_value(nc_in_var)
        if not numpy.all(numpy.isfinite(var_out_data)) or var_out_data.min()<v_min or var_out_data.max()>v_max:
            raise Exception('Data outside valid range ('+str(v_min)+' - '+str(v_max)+') in netcdf file: '+str(var_out_data.min())+','+str(var_out_data.max()))
        # if the data is going to be processed then do the processing here
        if process != "all":
            var_out_data = process_data2(var_out_data, process, plon, plat, subset_dims)
        # Sort out output file:
        out_var = get_output_field_name3(field)
        # create the output netCDF file if not appending
        if append:
            if not os.path.exists(out_name):
                raise Exception('Error, expecting to append to file but no file exists')
            else:
                nc_out_file = netcdf_file(out_name, "a")
        else:
            # check whether it exists
            if os.path.exists(out_name):
                # remove existing file to make sure this is clean
                os.remove(out_name)
            # Set up new file
            nc_out_file = netcdf_file(out_name, "w", format='NETCDF3_CLASSIC')
            nc_out_file.setncatts(nc_in_file.__dict__)
            # create the dimensions
            for d in out_dims:
                # create the output dimension and variable
                if d[0][:4]!='time':
                    dlen=d[1].shape[0]
                else:
                    dlen=0 # Time has unlimited dimension
                nc_out_file.createDimension(d[0],dlen)
                dim_out_var = nc_out_file.createVariable(d[0], d[1].dtype, (d[0],))
                # assign the output variable data and attributes from the input
                if d[0] in nc_in_file.variables.keys():
                    dim_in_var = nc_in_file.variables[d[0]]
                    dim_out_var.setncatts(dim_in_var.__dict__)
                elif d[0] == "pt":
                    # if it's the "pt" dimension then create an attribute indicating the domain of the
                    # mean-ed / max-ed / min-ed variable
                    dom_str = ""
                    if field[2] == []:
                        dom_str = "global "
                    else:
                        for i in range(0, 4):
                            dom_str += str(field[2][i]) + ", "
                    dim_out_var.domain = dom_str[:-2]
                dim_out_var[:] = d[1][:]
            # create the variable
            out_dim_names = [d[0] for d in out_dims]
            nc_out_var = nc_out_file.createVariable(out_var, var_out_data.dtype, out_dim_names)
            # assign the attributes
            nc_out_var.setncatts(nc_in_var.__dict__)
            # remove the grid mapping and coordinates from the dictionary if they exist and process is not all
            if process != "all":
                if "grid_mapping" in nc_out_var.ncattrs():
                    del nc_out_var.grid_mapping
                if "coordinates" in nc_out_var.ncattrs():
                    del nc_out_var.coordinates
                if "cell_method" in nc_out_var.ncattrs():
                    nc_out_var.cell_method += ", area: " + process + " "
            # check for rotated pole and copy variable if it exists
            if "grid_mapping" in nc_out_var.ncattrs() and len(out_dims) == 4:
                grid_map_name = getattr(nc_out_var,"grid_mapping")
                grid_map_var = nc_in_file.variables[grid_map_name]
                grid_map_out_var = nc_out_file.createVariable(grid_map_name, 'c', ())
                grid_map_out_var.setncatts(grid_map_var.__dict__)
                # get the global longitude / global latitude vars
                coords = getattr(nc_out_var,"coordinates").split(" ")
                global_lon_var = nc_in_file.variables[coords[0]]
                global_lat_var = nc_in_file.variables[coords[1]]
                global_lon_data = global_lon_var[lon_lat_idxs[1]:lon_lat_idxs[3], lon_lat_idxs[0]:lon_lat_idxs[2]]
                global_lat_data = global_lat_var[lon_lat_idxs[1]:lon_lat_idxs[3], lon_lat_idxs[0]:lon_lat_idxs[2]]
                # create the global latitude / global longitude variables
                out_global_lon_var = nc_out_file.createVariable(coords[0], global_lon_data.dtype, (out_dims[2][0], out_dims[3][0]))
                out_global_lon_var[:] = global_lon_data
                out_global_lat_var = nc_out_file.createVariable(coords[1], global_lat_data.dtype, (out_dims[2][0], out_dims[3][0]))
                out_global_lat_var[:] = global_lat_data
                out_global_lon_var.setncatts(global_lon_var.__dict__)
                out_global_lat_var.setncatts(global_lat_var.__dict__)
            # assign the data
            nc_out_var[:] = var_out_data
        # Append data
        if append:
            # find time dimension to append to
            for var in nc_out_file.variables.keys():
                if var[:4]=='time':
                    out_time=var
            for d in out_dims:
                if d[0][:4]=='time':
                    dim_out_var = nc_out_file.variables[out_time]
                    tlen=len(dim_out_var)
                    dim_out_var[tlen:] = d[1][:]
            nc_out_var = nc_out_file.variables[out_var]
            nc_out_var[tlen:,:]=var_out_data[:]
        nc_out_file.close()
        nc_in_file.close()
    except Exception as e:
        print 'Failed to create netcdf file' #,os.path.basename(out_name)
        print e
        # raise
        if os.path.exists(out_name):
            os.remove(out_name)
        return False
    return True
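
###############################################################################
# Illustrative driver sketch (not part of the original script): how the pieces
# above are typically combined for one task. The task path, directories and
# field definition are hypothetical.
def _example_process_task():
    taskpath = '/upload/wah2_eu50_a001_201512_1_123'
    field = ['ga.pe', 3236, [], 'all', 0.0, 400.0, 24, 'mean', '']
    netcdfs = extract_local(taskpath, [field], '/out', '/tmp/extract', 1, 12)
    if netcdfs is False:
        return False
    out_name = get_filename(taskpath, field, '/out', 1, 12)
    for i, in_ncf in enumerate(netcdfs[field[0]]):
        # create the output file from the first zip's data, then append the rest
        # (a real driver would first order the files by date)
        process_netcdf(in_ncf, out_name, field, i > 0)
    return True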
###############################################################################
# Add months to a datetime object
def add_months(date, months):
    yr = date.year
    mon = date.month
    new_month = ((mon-1)+months)%12 + 1
    new_year = yr + ((mon-1)+months)/12
    newdate = date.replace(year=new_year, month=new_month)
    return newdate
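
# Illustrative check (not part of the original script). Note the year offset
# relies on Python 2 floor division of integers.
def _example_add_months():
    assert add_months(datetime(2015,12,1), 1) == datetime(2016,1,1)
    assert add_months(datetime(2015,12,1), -1) == datetime(2015,11,1)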
###############################################################################
def extract_local(taskpath, field_list, output_dir, temp_dir, zipstart, zipend):
    boinc=taskpath.split('/')[-1]
    # Set up dictionary for lists of extracted files (keyed by file stream)
    extracted_netcdfs={}
    for field in field_list:
        extracted_netcdfs[field[0]]=[]
    # loop over zip files and extract the netcdf files
    try:
        for i in range(zipstart,zipend+1):
            path=os.path.join(taskpath,boinc+'_'+str(i)+'.zip')
            # open and extract contents of zip
            zf = zipfile.ZipFile(path,'r')
            zf.extractall(temp_dir)
            # list the zip contents
            zf_list = zf.namelist()
            # Loop over files in zip and check if we are going to use them
            for zf_file in zf_list:
                found=False
                fname=temp_dir + "/" + zf_file
                if not os.path.exists(fname):
                    raise Exception("file not extracted successfully")
                for field in field_list:
                    file_stream = field[0]
                    if file_stream in zf_file and zf_file[-3:]=='.nc':
                        if fname not in extracted_netcdfs[file_stream]:
                            extracted_netcdfs[file_stream].append(fname)
                        found=True
                if not found: # We are not using this file
                    os.remove(fname)
            zf.close()
    except Exception as e:
        print "Could not extract file: " + path
        print e
        # Clean up extracted files
        for nc_list in extracted_netcdfs.itervalues():
            for fname in nc_list:
                os.remove(fname)
        return False
    # Success
    return extracted_netcdfs
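
# Note (added for clarity): on success extract_local returns a dictionary
# mapping each requested file stream to the list of extracted netCDF paths,
# e.g. (hypothetical names) {'ga.pe': ['/tmp/extract/..._ga.pe_l5dec.nc', ...]};
# on failure it returns False.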
###############################################################################
def extract_url(taskurl, field_list, output_dir, temp_dir, zipstart, zipend):
    boinc=taskurl.split('/')[-1]
    # Set up dictionary for lists of extracted files (keyed by file stream)
    extracted_netcdfs={}
    for field in field_list:
        extracted_netcdfs[field[0]]=[]
    # loop over zip files and extract the netcdf files
    zf_fh = None
    try:
        for i in range(zipstart,zipend+1):
            url=os.path.join(taskurl,boinc+'_'+str(i)+'.zip')
            # print url
            # check whether this has already been processed # TODO, change this to check each zip rather than folder!
            # Download zip and extract files
            zf_fh = tempfile.NamedTemporaryFile(mode='w', delete=False, dir=temp_dir)
            urlret = urllib.urlretrieve(url, zf_fh.name)
            if urlret[1].gettype()!='application/zip':
                raise Exception('Error downloading url: '+url)
            zf = zipfile.ZipFile(zf_fh.name,'r')
            zf.extractall(temp_dir)
            # list the zip contents
            zf_list = zf.namelist()
            # Loop over files in zip and check if we are going to use them
            for zf_file in zf_list:
                found=False
                fname=temp_dir + "/" + zf_file
                if not os.path.exists(fname):
                    raise Exception("file not extracted successfully")
                for field in field_list:
                    file_stream = field[0]
                    if file_stream in zf_file and zf_file[-3:]=='.nc':
                        if fname not in extracted_netcdfs[file_stream]:
                            extracted_netcdfs[file_stream].append(fname)
                        found=True
                if not found: # We are not using this file
                    os.remove(fname)
            # Remove downloaded zipfile and clean up file handles
            zf.close()
            os.remove(zf_fh.name)
            zf_fh.close()
    except Exception as e:
        print "Could not extract url: " + url
        print e
        # Clean up extracted files
        for nc_list in extracted_netcdfs.itervalues():
            for fname in nc_list:
                os.remove(fname)
        # Remove downloaded zipfile and clean up file handle
        if zf_fh is not None:
            if os.path.exists(zf_fh.name):
                os.remove(zf_fh.name)
            if not zf_fh.closed:
                zf_fh.close()
        # raise
        return False
    # Success
    # print base_path, extracted_netcdfs
    return extracted_netcdfs
###############################################################################
def read_urls(urls_file):
    fh = gzip.open(urls_file, 'r')
    urls = fh.readlines()
    fh.close()
    urls = map(string.strip, urls)
    return urls
#####################################################
def compress_netcdf(fname):
    os.system('nccopy -d 2 -s '+fname+' '+fname+'_compressed')
    shutil.move(fname+'_compressed',fname)
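
# Illustrative sketch (not part of the original script): the same compression
# step with basic error checking, assuming the netCDF 'nccopy' utility is on
# PATH. '-d 2' requests deflate level 2 and '-s' enables byte shuffling.
def compress_netcdf_checked(fname):
    import subprocess
    tmpname = fname + '_compressed'
    ret = subprocess.call(['nccopy', '-d', '2', '-s', fname, tmpname])
    if ret != 0:
        raise Exception('nccopy failed for ' + fname)
    shutil.move(tmpname, fname)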