rjb_msc_paper_code/filepaths.py at main · rowanjb/rjb_msc_paper_code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
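#identifies gridT, gridU, gridV, gridW, icemod, and icebergs files that can't be opened (the ones that
#give "all nan slice" errors) and leaves them out of the saved filepath lists; no files are deleted
#it's slow but you only need to run it once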
#Rowan Brown
#May 8, 2023

import xarray as xr
import os

def _is_readable(filepath):
    """Return True if xarray can open ``filepath``, otherwise False.

    The dataset is closed again straight away so that scanning a whole run
    does not leave one open file handle per output file.
    """
    try:
        with xr.open_dataset(filepath):
            return True
    except Exception:
        # Deliberately not a bare ``except``: Ctrl-C (KeyboardInterrupt) and
        # SystemExit must stop the scan rather than flag a good file as bad.
        return False


def filepaths(run):
    """Save lists of the readable model output files of one run as txt files.

    Sometimes--rarely--the files get corrupted! Esp. true after Graham (HPC) crashed in early '24.
    Filepaths are saved in text files for security reasons.
    All model output files here refer to copies on Graham.

    Every gridT, gridU, gridV, gridW, icemod, and icebergs file of the run is opened
    with xarray. If any of them fails to open, all seven file types sharing that
    filename prefix are left out of the saved lists. gridB files are listed and
    filtered but are not themselves test-opened. Nothing is deleted from disk.

    Note that sometimes this isn't even enough, and you might have missing or corrupted data
    for certain edge cases, such as when trying to access vooxy or vodic data. This would
    require you to test for that edge case in the code below.

    Parameters
    ----------
    run : str
        Name of the run, e.g., 'EPM151' or 'EPM155' etc.

    Returns
    -------
    None
        One file per file type, '<run>_<type>_filepaths_jul2025.txt', is written
        to '../filepaths/', with one filepath per line.

    Raises
    ------
    IndexError
        If '../filepaths/ANHA4_graham_output.txt' is empty.
    OSError
        If that txt file or the run's output directory cannot be read.
    """

    # Directory of filepath txt files
    fp_dir = '../filepaths/'

    # Directory of ANHA4 output files (first line of the txt file)
    with open(fp_dir + 'ANHA4_graham_output.txt') as f:
        ANHA4_graham_output_dir = f.readlines()[0].strip()

    # Directory of the run's nemo output files on graham
    nemo_output_dir = ANHA4_graham_output_dir + '/ANHA4-' + run + '-S/'

    # The directory is only listed once; every file type is picked out of this listing
    all_files = os.listdir(nemo_output_dir)

    # File types, in the order in which their txt files are written
    file_types = ['gridT', 'gridU', 'gridV', 'gridB', 'gridW', 'icebergs', 'icemod']

    # Sorted lists of filepaths, keyed by file type
    paths = {
        file_type: sorted(
            nemo_output_dir + name for name in all_files if name.endswith(file_type + '.nc')
        )
        for file_type in file_types
    }

    # File types that get test-opened, in this order (gridB is not tested)
    tested_types = ['gridT', 'gridU', 'gridV', 'gridW', 'icemod', 'icebergs']

    # Testing if the files are read-able
    bad_files = []  # Initializing list of bad filepath prefixes
    for file_type in tested_types:
        suffix_len = len(file_type + '.nc')
        for filepath in paths[file_type]:
            if not _is_readable(filepath):
                # Saving the filepath without its '<type>.nc' ending, so the
                # companion files of the other types can be identified too
                bad_files.append(filepath[:-suffix_len])
                print(file_type + ': ' + filepath)

    # Removing duplicates from the list (keeping the order of first appearance)
    bad_files = list(dict.fromkeys(bad_files))
    for bad_file in bad_files:
        print(bad_file + ' is a bad file')

    # Removing bad filepaths; filtering (instead of list.remove) means a
    # companion file that doesn't exist for some type is simply skipped
    bad_prefixes = set(bad_files)
    for file_type in file_types:
        suffix_len = len(file_type + '.nc')
        paths[file_type] = [p for p in paths[file_type] if p[:-suffix_len] not in bad_prefixes]

    # Saving the filepaths as txt files
    for file_type in file_types:
        with open(fp_dir + run + '_' + file_type + '_filepaths_jul2025.txt', 'w') as output:
            output.writelines(p + '\n' for p in paths[file_type])

if __name__ == '__main__':
    # Build and save the filepath lists for each model run in turn
    run_names = ('EPM151', 'EPM152', 'EPM155', 'EPM156', 'EPM157', 'EPM158')
    for run in run_names:
        filepaths(run)