diff --git a/ISSUE-457-TOOLS/COMPACT_ERRORS.py b/ISSUE-457-TOOLS/COMPACT_ERRORS.py
new file mode 100644
index 000000000..bcc6f412e
--- /dev/null
+++ b/ISSUE-457-TOOLS/COMPACT_ERRORS.py
@@ -0,0 +1,46 @@
+import re
+import argparse
+import numpy as np
+
+
+def do_the_work(file_name):
+    with open(file_name, "r") as fh:
+        in_text = fh.readlines()
+
+    err_dict = {}
+    for line in in_text:
+        line = line.strip()
+        if line:
+            if line.startswith("/home"):
+                version = re.search(r"(?<=names/)\d{1,2}(?=/src)", line)
+                version = version.group()
+            else:
+                line = re.sub(r"Line \d+? : ", "", line)
+                if line in err_dict.keys():
+                    err_dict[line].append(version)
+                else:
+                    err_dict[line] = [version]
+    out_dict = {}
+    for line, version_list in err_dict.items():
+        v0 = int(version_list[0])
+        version_string = ", ".join(version_list)
+        text = f"{version_string} | {line}"
+        if v0 in out_dict:
+            out_dict[v0].append(text)
+        else:
+            out_dict[v0] = [text]
+    for v0 in range(1, 84):
+        text_list = out_dict.pop(v0, "")
+        for text in text_list:
+            print(text)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="compact_errors",
+        description=("\nCompact error lists to show in which version each error occurs.")
+    )
+    parser.add_argument("-f", "--file_name", type=str,
+                        help="Name of input error file")
+    args = parser.parse_args()
+
+    do_the_work(args.file_name)
diff --git a/ISSUE-457-TOOLS/LIST_ERRORS.py b/ISSUE-457-TOOLS/LIST_ERRORS.py
new file mode 100644
index 000000000..27bc3c41e
--- /dev/null
+++ b/ISSUE-457-TOOLS/LIST_ERRORS.py
@@ -0,0 +1,154 @@
+import os
+import re
+import argparse
+import urllib.request
+from io import BytesIO
+
+from cfunits import Units as cfUnits
+from cf_units import Unit as uuUnits
+from lxml import etree
+
+
+
+PATH0 = "../"
+BASE_PATH = PATH0 + "cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
+
+def parse_xml(xml_raw):
+    try:
+        xml_tree = etree.parse(BytesIO(xml_raw))
+    except etree.XMLSyntaxError:
+        print(f"{':'*100}\n{xml_raw[:1000]}")
+        raise
+    return xml_tree
+
+
+def get_schema(xml_tree):
+    root = xml_tree.getroot()
+    xsd_uri = root.values()[0]
+    link = urllib.request.urlopen(xsd_uri)
+    xsd_raw = link.read()
+    xsd_tree = parse_xml(xsd_raw)
+    schema = etree.XMLSchema(xsd_tree)
+    return schema
+
+
+def find_xml_errors(xml_tree, schema, xml_raw):
+    try:
+        schema.assertValid(xml_tree)
+        print(" ---- Valid and Well-formed")
+    except etree.DocumentInvalid:
+        xml_list = xml_raw.split(b"\n")
+        for error in schema.error_log:
+            for element in ["description", "canonical_units"]:
+                if f"( {element}" in error.message:
+                    std_name = xml_list[error.line - 1].split(b'"')[1].decode("utf-8")
+                    print(f"Line {error.line} : Standard name entry for '{std_name}' has no <{element}>")
+                    break
+            else:
+                print(f"Line {error.line} : {error.message}")
+
+
+def check_units(can_units, std_name):
+    uu = cfUnits(can_units)
+    if not uu.isvalid:
+        print(f"Canonical units '{can_units}' is not accepted by CF-UNITS for '{std_name}'")
+    else:
+        try:
+            uu = uuUnits(can_units)
+            if " -" in can_units:
+                try:
+                    uu = uuUnits(can_units.replace(" -", "-"))
+                    print(f"Canonical units '{can_units}' has a spurious space for '{std_name}'")
+                except ValueError:
+                    print(f"Canonical unit '{can_units}' is really weird for '{std_name}'")
+            elif "/" in can_units:
+                print(f"Canonical units '{can_units}' used '/' for '{std_name}'")
+        except ValueError:
+            print(f"Canonical unit '{can_units}' is a special CF unit for '{std_name}'")
+
+
+def find_missing_and_duplicates(xml_raw, old_entry_list, old_alias_list):
+    def _extract_entries(xml_raw):
+        entry_list = []
+        for entry in re.finditer(rb'<entry id=.+?>.+?</entry>', xml_raw, re.S):
+            e = re.search(rb'(?<=\").+?(?=\")', entry.group())
+            std_name = e.group().decode("utf-8")
+            entry_list.append(std_name)
+            can_units = re.search(rb'(?<=_units>).+?(?=', xml_raw, re.S):
+            alias_from = re.search(rb'(?<=\").+?(?=\")', alias.group())
+            alias_to = re.search(rb'(?<=entry_id>).+?(?= 0:
+        version_list = range(1, args.version + 1)
+    else:
+        version_list = [args.version]
+
+    entry_list = []
+    alias_list = []
+    for version in version_list:
+        try:
+            if version != 38:
+                print("\n")
+                entry_list, alias_list = do_the_work(
+                    version, args.severity, entry_list, alias_list
+                )
+        except:
+            break
+    print()
diff --git a/ISSUE-457-TOOLS/STEP_1-2.py b/ISSUE-457-TOOLS/STEP_1-2.py
new file mode 100644
index 000000000..801bd9573
--- /dev/null
+++ b/ISSUE-457-TOOLS/STEP_1-2.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+import re
+from datetime import datetime, UTC
+from pathlib import Path
+
+MY_PATH = "../"
+BASE_PATH = MY_PATH + "cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
+# NEW_XSD = b"../../schema-files/cf-standard-name-table-2.0.xsd"
+NEW_XSD = (b"https://raw.githubusercontent.com/larsbarring/cf-convention.github.io/"
+           b"test-all-issue-457/Data/schema-files/cf-standard-name-table-2.0.xsd")
+
+def fix_v1_datetime(xml_raw):
+    txt1 = b">1</version_number>\n"
+    txt2 = txt1 + b"  <last_modified>2002-04-02T12:00:00Z</last_modified>\n"
+    xml_raw = xml_raw.replace(txt1, txt2)
+    print("ADDED : DATETIME in version 1")
+    return xml_raw
+
+
+def fix_v71_datetime(xml_raw):
+    if b"2020-02-04T12:00Z" in xml_raw:
+        xml_raw = xml_raw.replace(b"2020-02-04T12:00Z", b"2020-02-04T12:00:00Z")
+        print("FIXED : DATETIME in version 71")
+    return xml_raw
+
+def fix_v12_duplicate_entry(xml_raw):
+    pat = rb'\n *<entry id="sea_surface_height_above_reference_ellipsoid">.+?</entry> *?(?=\n)'
+    xml_raw = re.sub(pat, b"", xml_raw, 1, re.S)
+    print("FIXED : Removed first duplicate of 'sea_surface_height_above_reference_ellipsoid'")
+    return xml_raw
+
+
+def add_modified_date(xml_raw):
+    time_stamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8")
+    modified = b"last_modified"
+    modified_start = b"<" + modified + b">"
+    modified_end = modified_start.replace(b"<", b"</")
+    modified_element = modified_start + time_stamp + modified_end
+    inst_text = b"<institution>"
+    n = len(inst_text)
+    inst = re.search((b"\n( *)" + inst_text), xml_raw)
+    spaces = inst.group()[1: -n]
+    position = inst.span()[0]
+    xml_raw = xml_raw[:position] + b"\n" + spaces + modified_element + xml_raw[position:]
+    print("ADDED : MODIFIED DATE")
+    return xml_raw
+
+
+def do_the_work(version):
+    xml_original = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
+    xml_saved = xml_original.replace("-table", "-table__SAVED")
+
+    my_file = Path(xml_saved)
+    if my_file.is_file():
+        # work on original files that are already saved
+        with open(xml_saved, "rb") as fh:
+            xml_raw = fh.read()
+        print(f"READING SAVED ORIGINAL FILE: {xml_original}")
+    else:
+        # work on original files that have not yet been saved
+        with open(xml_original, "rb") as fh:
+            xml_raw = fh.read()
+        # then save the original before changing the original
+        with open(xml_saved, "wb") as fh:
+            fh.write(xml_raw)
+        print(f"READING AND SAVING ORIGINAL FILE: {xml_original}")
+
+    if xml_raw[:6] != b"<?xml ":
+        xml_raw = b'<?xml version="1.0"?>\n' + xml_raw
+        print("ADDED : '<?xml version=\"1.0\"?>'")
+    for old_xsd in [b"CFStandardNameTable-1.0.xsd",
+                    b"CFStandardNameTable-1.1.xsd",
+                    b"cf-standard-name-table-1.1.xsd"]:
+        if old_xsd in xml_raw:
+            xml_raw = xml_raw.replace(old_xsd, NEW_XSD)
+            print(f"CHANGED : XSD FILE NAME {old_xsd.decode('utf-8')} --> {NEW_XSD.decode('utf-8')}")
+
+    if version == 1:
+        xml_raw = fix_v1_datetime(xml_raw)
+    elif version == 12:
+        xml_raw = fix_v12_duplicate_entry(xml_raw)
+    elif version == 71:
+        xml_raw = fix_v71_datetime(xml_raw)
+
+    xml_raw = xml_raw.replace(b"last_modified", b"first_published_date")
+    print("CHANGED : 'last_modified' --> 'first_published_date'")
+
+    xml_raw = add_modified_date(xml_raw)
+
+    with open(xml_original, "wb") as fh:
+        fh.write(xml_raw)
+
+
+if __name__ == "__main__":
+    for version in range(1, 100):
+        try:
+            if version != 38:
+                print("\n")
+                do_the_work(version)
+        except:
+            break
diff --git a/ISSUE-457-TOOLS/STEP_3-4.py b/ISSUE-457-TOOLS/STEP_3-4.py
new file mode 100644
index 000000000..3e6e21059
--- /dev/null
+++ b/ISSUE-457-TOOLS/STEP_3-4.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+
+import re
+from datetime import datetime, UTC
+from pathlib import Path
+
+
+MY_PATH = "../"
+BASE_PATH = MY_PATH + "cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
+NL = b"\n"
+
+
+def find_duplicate_aliases(xml_raw):
+    pat = (rb' +? *?\n')
+    result = [r for r in re.finditer(pat, xml_raw, re.S)]
+    if len(result) > 1:
+        collected_entries = []
+        for k, r in enumerate(result):
+            lines = r.group().splitlines()
+            for s in lines:
+                if b"<entry_id>" in s and s not in collected_entries:
+                    collected_entries.append(s)
+        new_alias = []
+        for line in result[0].group().splitlines():
+            if b"entry_id" in line:
+                new_alias.extend(collected_entries)
+            elif line:
+                new_alias.append(line)
+        _ = [print(f' {line.decode("utf-8")}') for line in new_alias]
+        result_0 = NL.join(new_alias)
+        for r in reversed(result[1:]):
+            span = r.span()
+            xml_raw = xml_raw[: span[0]] + xml_raw[span[1]: ]
+        span = result[0].span()
+        xml_raw = xml_raw[: span[0]] + NL + NL.join(new_alias) + NL + xml_raw[span[1]: ]
+    else:
+        xml_raw = ""
+    return xml_raw
+
+
+def add_conventions(xml_raw):
+    pat = rb"\n +?<version_number>\d+?</version_number>"
+    old_elem = re.search(pat, xml_raw)
+    old_elem = old_elem.group()
+    version = re.search(rb"\d{1,3}", old_elem)
+    version = version.group()
+    new_elem = old_elem + b"\n  <conventions>CF-StandardNameTable-" + version + b"</conventions>"
+    xml_raw = xml_raw.replace(old_elem, new_elem)
+    return xml_raw
+
+
+def update_last_modified(xml_raw):
+    time_stamp = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ").encode("utf-8")
+    pat = rb"<last_modified>.+?Z</last_modified>"
+    new = rb"<last_modified>" + time_stamp + rb"</last_modified>"
+    xml_raw = re.sub(pat, new, xml_raw)
+    return xml_raw
+
+
+def do_the_work(version):
+    xml_file = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
+    with open(xml_file, "rb") as fh:
+        xml_raw = fh.read()
+    print(f"..../{xml_file[58:]}")
+
+    duplicate_aliases = find_duplicate_aliases(xml_raw)
+    for std_name in duplicate_aliases:
+        result = fix_duplicate_aliases(xml_raw, std_name)
+        if result:
+            xml_raw = result
+        else:
+            print(" No change")
+
+    xml_raw = add_conventions(xml_raw)
+
+    xml_raw = update_last_modified(xml_raw)
+    with open(xml_file, "wb") as fh:
+        fh.write(xml_raw)
+
+
+if __name__ == "__main__":
+    # update_schema()
+    for version in range(1, 100):
+        try:
+            if version != 38:
+                print("\n")
+                do_the_work(version)
+        except:
+            break
diff --git a/ISSUE-457-TOOLS/STEP_5u.py b/ISSUE-457-TOOLS/STEP_5u.py
new file mode 100644
index 000000000..2302cc373
--- /dev/null
+++ b/ISSUE-457-TOOLS/STEP_5u.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+
+import re
+from datetime import datetime, UTC
+from pathlib import Path
+
+
+MY_PATH = "../"
+BASE_PATH = MY_PATH + "cf-conventions/cf-convention.github.io/Data/cf-standard-names/"
+
+
+def prettify(line):
+    fmt = {
+        "": (0, 1),
+        "": (2, 1),
+        "": (2, 1),
+        "": (2, 1),
+        "": (2, 1),
+        "": (2, 1),
+        "": (2, 3),
+        "": (4, 1),
+        "": (4, 1),
+        "": (4, 1),
+        "": (4, 1),
+        "": (2, 2),
+        "": (2, 2),
+        "": (0, 1),
+    }
+    line2 = line.strip()
+    for k in fmt.keys():
+        if k in line2:
+            indent, newlines = fmt[k]
+            pretty = ' '*indent + line2 + '\n'*newlines
+            return pretty
+    return line2
+
+def cleanup(in_xml):
+    in_xml = in_xml.replace("\n>", ">")
+    return in_xml
+
+
+def extract_header(in_xml):
+    header = re.search(".+?(?=.+?', in_xml, re.S):
+        entry1 = entry.group()
+        entry1 = entry1.split("\n")
+        entry1 = "\n".join([t.strip() for t in entry1 if t])
+        e = re.search(r'(?<=\").+?(?=\")', entry1)
+        std_name = e.group()
+        entry2 = f'<entry id="{std_name}">\n'
+        for t in tags:
+            payload = re.search(rf"(?<=\<{t}>).*?(?=</{t}>)", entry1, re.S)
+            if payload:
+                p = payload.group()
+                if t == "description":
+                    p = re.sub(" *\n *", " ", p, flags=re.S)
+                entry2 += f"<{t}>{p}</{t}>\n"
+            elif t in ["canonical_units", "description"]:
+                entry2 += f"<{t}></{t}>\n"
+                print(f"ADDED: {std_name} '{t}'")
+        entry2 += " </entry>\n\n"
+        entry_dict[std_name] = entry2
+    out_xml = "\n".join([entry_dict[k] for k in maybe_sorted(entry_dict)])
+    return out_xml
+
+
+def extract_aliases(in_xml):
+    alias_dict = {}
+    for alias in re.finditer(r'<alias id=.+?</alias>', in_xml, re.S):
+        alias1 = alias.group()
+        a = re.search(r'(?<=\").+?(?=\")', alias1)
+        std_name = a.group()
+        alias1 = "\n".join([a.strip() for a in alias1.split("\n") if a])
+        alias_dict[std_name] = alias1
+    out_xml = "\n".join([alias_dict[k] for k in maybe_sorted(alias_dict)])
+    return out_xml
+
+
+def do_the_work(version):
+    xml_file = f"{BASE_PATH}{version}/src/cf-standard-name-table.xml"
+    with open(xml_file, "r") as fh:
+        in_xml = fh.read()
+    print(xml_file[58:])
+
+    in_xml = cleanup(in_xml)
+    out_xml = extract_header(in_xml)
+    out_xml += extract_entries(in_xml)
+    out_xml += extract_aliases(in_xml)
+    out_xml += "</standard_name_table>\n\n"
+    out_xml = out_xml.split("\n")
+
+    with open(xml_file, "w") as fh:
+        for line in out_xml:
+            fh.write(prettify(line))
+
+    #for line in out_xml:   # [:100]:
+    #    #print(f"1§{line}§")
+    #    print(prettify(line), end="")
+    #raise RuntimeError
+
+if __name__ == "__main__":
+    for version in range(1, 100):
+        try:
+            if version != 38:
+                print("\n")
+                do_the_work(version)
+        except:
+            break
diff --git a/ISSUE-457-TOOLS/restore_files b/ISSUE-457-TOOLS/restore_files
new file mode 100755
index 000000000..3857c93ee
--- /dev/null
+++ b/ISSUE-457-TOOLS/restore_files
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+dir=/home/a001257/CODE/cf-conventions/cf-convention.github.io/Data/cf-standard-names
+for d in $(seq 1 84); do
+    if [[ "$d" != "38" ]]; then
+        f=$(ls $dir/$d/src/*__SAVED.xml)
+        mv $f ${f/__SAVED/}
+        # rm $dir/$d/src/*
+    fi
+done