diff --git a/cdlparser.py b/cdlparser.py index 11ad19b..7f1d518 100644 --- a/cdlparser.py +++ b/cdlparser.py @@ -67,15 +67,24 @@ Creator: Phil Bentley """ +from __future__ import print_function + __version_info__ = (0, 0, 8, 'beta', 0) __version__ = "%d.%d.%d-%s" % __version_info__[0:4] +import codecs import sys, os, logging, types +import six +import re import ply.lex as lex from ply.lex import TOKEN import ply.yacc as yacc import netCDF4 as nc4 import numpy as np +from functools import reduce + +if not six.PY2: + long = int # default fill values for netCDF-3 data types (as defined in netcdf.h include file) NC_FILL_BYTE = np.int8(-127) @@ -171,9 +180,8 @@ def parse_file(self, cdlfile, ncfile=None) : :returns: A handle to a netCDF4.Dataset object. """ self.cdlfile = cdlfile - f = open(cdlfile) - data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory? - f.close() + with codecs.open(cdlfile, encoding="utf-8") as f: + data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory? return self.parse_text(data, ncfile=ncfile) def parse_text(self, cdltext, ncfile=None) : @@ -188,7 +196,8 @@ def parse_text(self, cdltext, ncfile=None) : Alternatively, this can be done immediately upon completion of parsing by setting the close_on_completion keyword argument to True when instantiating the CDLParser instance. - :param cdltext: String containing the CDL text to parse. + :param cdltext: String containing the CDL text to parse. Must be unicode str if containing + unicode. :param ncfile: Optional pathname of the netCDF file to receive output. :returns: A handle to a netCDF4.Dataset object. """ @@ -376,6 +385,7 @@ def t_DOUBLE_CONST(self, t) : def t_SHORT_CONST(self, t) : r'[+-]?([0-9]+|0[xX][0-9a-fA-F]+)[sS]' #r'[+-]?[0-9]+[sS]|0[xX][0-9a-fA-F]+[sS]' # original regex in ncgen3.l file + t.value = fix_octal(t.value) try : int_val = int(eval(t.value[:-1])) except : @@ -391,6 +401,7 @@ def t_SHORT_CONST(self, t) : def t_BYTE_CONST(self, t) : #r'[+-]?[0-9]+[Bb]' # modified regex #r'[+-]?[0-9]*[0-9][Bb]' # original regex in ncgen3.l file + t.value = fix_octal(t.value) try : if t.value[0] == "'" : int_val = ord(eval(t.value)) @@ -411,6 +422,7 @@ def t_INT_CONST(self, t) : r'[+-]?([1-9][0-9]*|0[xX]?[0-9a-fA-F]+|0)' # [Ll] suffix has been deprecated #r'[+-]?([1-9][0-9]*|0)[lL]?' # original regex for decimal integers in ncgen3.l file #r'0[xX]?[0-9a-fA-F]+[lL]?' # original regex for octal or hex integers in ncgen3.l file + t.value = fix_octal(t.value) try : long_val = long(eval(t.value)) except : @@ -418,7 +430,7 @@ def t_INT_CONST(self, t) : raise CDLContentError(errmsg) if long_val < XDR_INT_MIN or long_val > XDR_INT_MAX : errmsg = "Integer constant outside valid range (%d -> %d): %s" \ - % (XDR_INT_MIN, XDR_INT_MAX, int_val) + % (XDR_INT_MIN, XDR_INT_MAX, long_val) raise CDLContentError(errmsg) else : t.value = np.int32(long_val) @@ -472,7 +484,7 @@ def p_dimdecl(self, p) : | dimd EQUALS DOUBLE_CONST | dimd EQUALS NC_UNLIMITED_K""" dimname = "" - if isinstance(p[3], basestring) : + if isinstance(p[3], six.string_types) : if p[3] == "unlimited" : if self.rec_dimname : raise CDLContentError("Only one UNLIMITED dimension is allowed.") @@ -639,7 +651,7 @@ def p_datadecl(self, p) : try : self.write_var_data(var, arr) self.logger.info("Wrote %d data value(s) for variable %s" % (len(arr), p[1])) - except Exception, exc : + except Exception as exc : self.logger.error(str(exc)) raise @@ -772,7 +784,7 @@ def write_var_data(self, var, arr) : arrlen = len(arr) varlen = var.size if is_charvar and var.ndim > 0 : - varlen /= var.shape[-1] + varlen = varlen // var.shape[-1] reclen = 0 self.logger.debug("Length of passed-in data array = %d" % arrlen) if varlen : self.logger.debug("Expected length of variable = %d" % varlen) @@ -782,7 +794,7 @@ def write_var_data(self, var, arr) : if is_recvar : rec_dimlen = len(self.ncdataset.dimensions[self.rec_dimname]) if rec_dimlen > 0 : # record dimension has been set to non-zero - reclen = varlen / rec_dimlen + reclen = varlen // rec_dimlen else : # record dimension is still equal to zero varlen = arrlen reclen = 1 @@ -806,7 +818,7 @@ def write_var_data(self, var, arr) : put_char_data(var, arr, reclen) else : put_numeric_data(var, arr, reclen) - except Exception, exc : + except Exception as exc : errmsg = "Error attempting to write data array for variable %s\n" % var._name errmsg += "Exception details are as follows:\n%s" % str(exc) raise CDLContentError(errmsg) @@ -814,12 +826,12 @@ def write_var_data(self, var, arr) : def _lextest(self, data) : """private method - for test purposes only""" self.lexer.input(data) - print "-----" + print("-----") while 1 : t = self.lexer.token() if not t : break - print "type: %-15s\tvalue: %s" % (t.type, t.value) - print "-----" + print("type: %-15s\tvalue: %s" % (t.type, t.value)) + print("-----") #--------------------------------------------------------------------------------------------------- def put_numeric_data(var, arr, reclen=0) : @@ -827,7 +839,7 @@ def put_numeric_data(var, arr, reclen=0) : """Write numeric data array to netcdf variable.""" nparr = np.array(arr, dtype=var.dtype) shape = list(var.shape) - if reclen : shape[0] = len(arr) / reclen + if reclen : shape[0] = len(arr) // reclen nparr.shape = shape var[:] = nparr @@ -838,7 +850,7 @@ def put_char_data(var, arr, reclen=0) : maxlen = var.shape[-1] if var.ndim > 0 else 1 nparr = str_list_to_char_arr(arr, maxlen) shape = list(var.shape) - if reclen : shape[0] = len(arr) / reclen + if reclen : shape[0] = len(arr) // reclen nparr.shape = shape var[:] = nparr @@ -892,14 +904,43 @@ def deescapify(name) : i += 1 return newname + +# Regex for finding escape sequences +ESCAPE_SEQUENCE_RE = re.compile(r''' + ( \\U........ # 8-digit hex escapes + | \\u.... # 4-digit hex escapes + | \\x.. # 2-digit hex escapes + | \\[0-7]{1,3} # Octal escapes + | \\N\{[^}]+\} # Unicode characters by name + | \\[\\'"abfnrtv] # Single-character escapes + )''', re.UNICODE | re.VERBOSE) + #--------------------------------------------------------------------------------------------------- def expand_escapes(tstring) : #--------------------------------------------------------------------------------------------------- """ - A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply - uses the built-in string.decode() method. + Function to convert escapes to actual (unicode) characters. Fulfills the same purpose as + expand_escapes() in ncgen3/escapes.c or unescape() in ncgen/escapes.c. + Input string containing unicode must be a unicode string. + https://stackoverflow.com/a/24519338/2196270 + """ + def decode_match(match): + return codecs.decode(match.group(0), 'unicode-escape') + + return ESCAPE_SEQUENCE_RE.sub(decode_match, tstring) + +#--------------------------------------------------------------------------------------------------- +def fix_octal(octal_str) : +#--------------------------------------------------------------------------------------------------- + """ + Fixes anything octal, including +/- prefix and letter suffix to use "0o" """ - return tstring.decode('string_escape') + m = re.match(r"([+-]?)0(\d+.*)", octal_str) + if m: + # Make octal python 3 compatible + return m.group(1) + "0o" + m.group(2) + else: + return octal_str #--------------------------------------------------------------------------------------------------- def get_default_fill_value(datatype) : @@ -926,7 +967,7 @@ def main() : """Rudimentary main function - primarily for testing purposes at this point in time.""" debug = 0 if len(sys.argv) < 2 : - print "usage: python cdlparser.py cdlfile [keyword=value, ...]" + print("usage: python cdlparser.py cdlfile [keyword=value, ...]") sys.exit(1) cdlfile = sys.argv[1] kwargs = {} diff --git a/test/test_charvars.py b/test/test_charvars.py index e2e5788..ca95da5 100644 --- a/test/test_charvars.py +++ b/test/test_charvars.py @@ -50,7 +50,7 @@ def tearDown(self) : def test_scalar_variables(self) : var = self.dataset.variables['letter'] - self.assertTrue(var[:] == "X") + self.assertTrue(var[:] == b"X") def test_non_scalar_variables(self) : var = self.dataset.variables['regcodes'] diff --git a/test/test_constants.py b/test/test_constants.py index 639c932..41c6991 100644 --- a/test/test_constants.py +++ b/test/test_constants.py @@ -17,6 +17,9 @@ def setUp(self) : variables: float var1(dim1) ; var1:att1 = "dummy attribute" ; + // FillValue necessary to enable masking in NETCDF3_CLASSIC right now. + // See https://github.com/Unidata/netcdf4-python/issues/725. + var1:_FillValue = 9.9692099683868690e+36; // global attributes :c1 = "foo" ; // with spaces :c2="bar" ; // w/o spaces @@ -111,13 +114,13 @@ def test_double_array(self) : def test_dimensions(self) : self.assertTrue(len(self.dataset.dimensions) == 1) - self.assertTrue(self.dataset.dimensions.keys()[0] == "dim1") + self.assertTrue('dim1' in self.dataset.dimensions.keys()) dim = self.dataset.dimensions['dim1'] self.assertTrue(len(dim) == 3) def test_variables(self) : self.assertTrue(len(self.dataset.variables) == 1) - self.assertTrue(self.dataset.variables.keys()[0] == "var1") + self.assertTrue("var1" in self.dataset.variables.keys()) var = self.dataset.variables['var1'] self.assertTrue(var.att1 == "dummy attribute") data = var[:]