79 changes: 60 additions & 19 deletions cdlparser.py
@@ -67,15 +67,24 @@

Creator: Phil Bentley
"""
from __future__ import print_function

__version_info__ = (0, 0, 8, 'beta', 0)
__version__ = "%d.%d.%d-%s" % __version_info__[0:4]

import codecs
import sys, os, logging, types
import six
import re
import ply.lex as lex
from ply.lex import TOKEN
import ply.yacc as yacc
import netCDF4 as nc4
import numpy as np
from functools import reduce

if not six.PY2:
long = int
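
Reviewer note (illustration only, not part of the diff): the alias above lets code written against Python 2's `long` run unchanged on Python 3. A minimal sketch:

```python
import six

if not six.PY2:
    long = int            # same alias as in the module

big = long(2 ** 40)       # Python 2: a long instance; Python 3: a plain int
print(isinstance(big, long))   # True on both interpreters
```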

# default fill values for netCDF-3 data types (as defined in netcdf.h include file)
NC_FILL_BYTE = np.int8(-127)
@@ -171,9 +180,8 @@ def parse_file(self, cdlfile, ncfile=None) :
:returns: A handle to a netCDF4.Dataset object.
"""
self.cdlfile = cdlfile
f = open(cdlfile)
data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory?
f.close()
with codecs.open(cdlfile, encoding="utf-8") as f:
data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory?
return self.parse_text(data, ncfile=ncfile)
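
Reviewer note: reading the file through codecs.open with an explicit encoding yields decoded (unicode) text on both Python 2 and 3, which is what parse_text() now expects. A minimal sketch of the new read path, with a hypothetical file name:

```python
import codecs

# "example.cdl" is a made-up path; any UTF-8 encoded CDL file works the same way.
with codecs.open("example.cdl", encoding="utf-8") as f:
    data = f.read()       # unicode on Python 2, str on Python 3

# `data` is then handed to parse_text(), so escape/character handling sees
# real unicode rather than raw bytes.
```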

def parse_text(self, cdltext, ncfile=None) :
@@ -188,7 +196,8 @@ def parse_text(self, cdltext, ncfile=None) :
Alternatively, this can be done immediately upon completion of parsing by setting the
close_on_completion keyword argument to True when instantiating the CDLParser instance.

:param cdltext: String containing the CDL text to parse.
:param cdltext: String containing the CDL text to parse. Must be a unicode string if it
    contains non-ASCII characters.
:param ncfile: Optional pathname of the netCDF file to receive output.
:returns: A handle to a netCDF4.Dataset object.
"""
@@ -376,6 +385,7 @@ def t_DOUBLE_CONST(self, t) :
def t_SHORT_CONST(self, t) :
r'[+-]?([0-9]+|0[xX][0-9a-fA-F]+)[sS]'
#r'[+-]?[0-9]+[sS]|0[xX][0-9a-fA-F]+[sS]' # original regex in ncgen3.l file
t.value = fix_octal(t.value)
try :
int_val = int(eval(t.value[:-1]))
except :
@@ -391,6 +401,7 @@ def t_SHORT_CONST(self, t) :
def t_BYTE_CONST(self, t) :
#r'[+-]?[0-9]+[Bb]' # modified regex
#r'[+-]?[0-9]*[0-9][Bb]' # original regex in ncgen3.l file
t.value = fix_octal(t.value)
try :
if t.value[0] == "'" :
int_val = ord(eval(t.value))
@@ -411,14 +422,15 @@ def t_INT_CONST(self, t) :
r'[+-]?([1-9][0-9]*|0[xX]?[0-9a-fA-F]+|0)' # [Ll] suffix has been deprecated
#r'[+-]?([1-9][0-9]*|0)[lL]?' # original regex for decimal integers in ncgen3.l file
#r'0[xX]?[0-9a-fA-F]+[lL]?' # original regex for octal or hex integers in ncgen3.l file
t.value = fix_octal(t.value)
try :
long_val = long(eval(t.value))
except :
errmsg = "Bad integer constant: %s" % t.value
raise CDLContentError(errmsg)
if long_val < XDR_INT_MIN or long_val > XDR_INT_MAX :
errmsg = "Integer constant outside valid range (%d -> %d): %s" \
% (XDR_INT_MIN, XDR_INT_MAX, int_val)
% (XDR_INT_MIN, XDR_INT_MAX, long_val)
raise CDLContentError(errmsg)
else :
t.value = np.int32(long_val)
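
Reviewer note: the rule now normalises octal literals via fix_octal(), evaluates the text as an integer and rejects anything outside the 32-bit XDR range; the corrected message also reports long_val instead of the undefined int_val. A standalone sketch of that check, using int(text, 0) as a simpler stand-in for the eval() call and the usual 32-bit bounds:

```python
import numpy as np

XDR_INT_MIN, XDR_INT_MAX = -2147483648, 2147483647

def check_int_const(text):
    value = int(text, 0)                 # handles decimal, hex and "0o" octal
    if not (XDR_INT_MIN <= value <= XDR_INT_MAX):
        raise ValueError("Integer constant outside valid range: %s" % text)
    return np.int32(value)

print(check_int_const("0o777"))          # 511
print(check_int_const("-2147483648"))    # lower bound is still accepted
```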
@@ -472,7 +484,7 @@ def p_dimdecl(self, p) :
| dimd EQUALS DOUBLE_CONST
| dimd EQUALS NC_UNLIMITED_K"""
dimname = ""
if isinstance(p[3], basestring) :
if isinstance(p[3], six.string_types) :
if p[3] == "unlimited" :
if self.rec_dimname :
raise CDLContentError("Only one UNLIMITED dimension is allowed.")
@@ -639,7 +651,7 @@ def p_datadecl(self, p) :
try :
self.write_var_data(var, arr)
self.logger.info("Wrote %d data value(s) for variable %s" % (len(arr), p[1]))
except Exception, exc :
except Exception as exc :
self.logger.error(str(exc))
raise

@@ -772,7 +784,7 @@ def write_var_data(self, var, arr) :
arrlen = len(arr)
varlen = var.size
if is_charvar and var.ndim > 0 :
varlen /= var.shape[-1]
varlen = varlen // var.shape[-1]
reclen = 0
self.logger.debug("Length of passed-in data array = %d" % arrlen)
if varlen : self.logger.debug("Expected length of variable = %d" % varlen)
@@ -782,7 +794,7 @@ def write_var_data(self, var, arr) :
if is_recvar :
rec_dimlen = len(self.ncdataset.dimensions[self.rec_dimname])
if rec_dimlen > 0 : # record dimension has been set to non-zero
reclen = varlen / rec_dimlen
reclen = varlen // rec_dimlen
else : # record dimension is still equal to zero
varlen = arrlen
reclen = 1
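
Reviewer note: the switch from / to // matters because under Python 3 true division would turn reclen into a float and break the later reshape. A toy illustration:

```python
varlen, rec_dimlen = 12, 4
reclen = varlen // rec_dimlen    # 3 (int); 12 / 4 would give 3.0 on Python 3
assert isinstance(reclen, int)
```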
@@ -806,28 +818,28 @@ def write_var_data(self, var, arr) :
put_char_data(var, arr, reclen)
else :
put_numeric_data(var, arr, reclen)
except Exception, exc :
except Exception as exc :
errmsg = "Error attempting to write data array for variable %s\n" % var._name
errmsg += "Exception details are as follows:\n%s" % str(exc)
raise CDLContentError(errmsg)

def _lextest(self, data) :
"""private method - for test purposes only"""
self.lexer.input(data)
print "-----"
print("-----")
while 1 :
t = self.lexer.token()
if not t : break
print "type: %-15s\tvalue: %s" % (t.type, t.value)
print "-----"
print("type: %-15s\tvalue: %s" % (t.type, t.value))
print("-----")

#---------------------------------------------------------------------------------------------------
def put_numeric_data(var, arr, reclen=0) :
#---------------------------------------------------------------------------------------------------
"""Write numeric data array to netcdf variable."""
nparr = np.array(arr, dtype=var.dtype)
shape = list(var.shape)
if reclen : shape[0] = len(arr) / reclen
if reclen : shape[0] = len(arr) // reclen
nparr.shape = shape
var[:] = nparr
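
Reviewer note: the same floor-division fix applies here when sizing the unlimited dimension from the data. The reshape logic in isolation, using plain numpy and made-up sizes:

```python
import numpy as np

arr = list(range(12))                 # 12 values parsed from the data section
reclen = 4                            # values per record
shape = [len(arr) // reclen, 4]       # unlimited dimension sized from the data
nparr = np.array(arr, dtype=np.int32)
nparr.shape = shape                   # 3 records of 4 values each
```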

@@ -838,7 +850,7 @@ def put_char_data(var, arr, reclen=0) :
maxlen = var.shape[-1] if var.ndim > 0 else 1
nparr = str_list_to_char_arr(arr, maxlen)
shape = list(var.shape)
if reclen : shape[0] = len(arr) / reclen
if reclen : shape[0] = len(arr) // reclen
nparr.shape = shape
var[:] = nparr

@@ -892,14 +904,43 @@ def deescapify(name) :
i += 1
return newname


# Regex for finding escape sequences
ESCAPE_SEQUENCE_RE = re.compile(r'''
( \\U........ # 8-digit hex escapes
| \\u.... # 4-digit hex escapes
| \\x.. # 2-digit hex escapes
| \\[0-7]{1,3} # Octal escapes
| \\N\{[^}]+\} # Unicode characters by name
| \\[\\'"abfnrtv] # Single-character escapes
)''', re.UNICODE | re.VERBOSE)

#---------------------------------------------------------------------------------------------------
def expand_escapes(tstring) :
#---------------------------------------------------------------------------------------------------
"""
A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply
uses the built-in string.decode() method.
Function to convert escapes to actual (unicode) characters. Fulfills the same purpose as
expand_escapes() in ncgen3/escapes.c or unescape() in ncgen/escapes.c.
Input string containing unicode must be a unicode string.
https://stackoverflow.com/a/24519338/2196270
"""
def decode_match(match):
return codecs.decode(match.group(0), 'unicode-escape')

return ESCAPE_SEQUENCE_RE.sub(decode_match, tstring)
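
Reviewer note (illustrative inputs only): with the regex and helper above, escape sequences embedded in CDL string constants are expanded to the corresponding characters, for example:

```python
print(expand_escapes(r"line one\nline two"))    # '\n' becomes a real newline
print(expand_escapes(r"temperature (\xb0C)"))   # '\xb0' becomes the degree sign
```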

#---------------------------------------------------------------------------------------------------
def fix_octal(octal_str) :
#---------------------------------------------------------------------------------------------------
"""
Rewrite an octal literal to use the Python 3 "0o" prefix, preserving any +/- sign and any trailing type-suffix letter.
"""
return tstring.decode('string_escape')
m = re.match(r"([+-]?)0(\d+.*)", octal_str)
if m:
# Make octal python 3 compatible
return m.group(1) + "0o" + m.group(2)
else:
return octal_str
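
Reviewer note: expected behaviour of fix_octal on a few illustrative literals:

```python
print(fix_octal("0777"))    # '0o777'  -> valid Python 3 octal literal
print(fix_octal("-017s"))   # '-0o17s' -> sign and type suffix are preserved
print(fix_octal("0x1F"))    # '0x1F'   -> hex literals pass through unchanged
print(fix_octal("123"))     # '123'    -> plain decimals are untouched
```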

#---------------------------------------------------------------------------------------------------
def get_default_fill_value(datatype) :
@@ -926,7 +967,7 @@ def main() :
"""Rudimentary main function - primarily for testing purposes at this point in time."""
debug = 0
if len(sys.argv) < 2 :
print "usage: python cdlparser.py cdlfile [keyword=value, ...]"
print("usage: python cdlparser.py cdlfile [keyword=value, ...]")
sys.exit(1)
cdlfile = sys.argv[1]
kwargs = {}
2 changes: 1 addition & 1 deletion test/test_charvars.py
@@ -50,7 +50,7 @@ def tearDown(self) :

def test_scalar_variables(self) :
var = self.dataset.variables['letter']
self.assertTrue(var[:] == "X")
self.assertTrue(var[:] == b"X")
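
Reviewer note: on Python 3 netCDF4 hands character data back as bytes, so the assertion has to compare against a bytes literal. A trivial sketch of the distinction:

```python
value = b"X"          # typical scalar NC_CHAR value as read back under Python 3
assert value == b"X"  # the updated assertion passes
assert value != "X"   # comparing against a unicode literal would not match
```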

def test_non_scalar_variables(self) :
var = self.dataset.variables['regcodes']
7 changes: 5 additions & 2 deletions test/test_constants.py
@@ -17,6 +17,9 @@ def setUp(self) :
variables:
float var1(dim1) ;
var1:att1 = "dummy attribute" ;
// FillValue necessary to enable masking in NETCDF3_CLASSIC right now.
// See https://github.com/Unidata/netcdf4-python/issues/725.
var1:_FillValue = 9.9692099683868690e+36;
// global attributes
:c1 = "foo" ; // with spaces
:c2="bar" ; // w/o spaces
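
Reviewer note (rough sketch; the .nc file name is made up): without an explicit _FillValue, unwritten values in a NETCDF3_CLASSIC file were not coming back masked (see the linked netcdf4-python issue); declaring the fill value makes the read-back behave as the tests expect:

```python
import netCDF4 as nc4

ds = nc4.Dataset("tmp_fill.nc", "w", format="NETCDF3_CLASSIC")
ds.createDimension("dim1", 3)
var = ds.createVariable("var1", "f4", ("dim1",), fill_value=9.969209968386869e+36)
var[0] = 1.0                      # indices 1 and 2 are never written
print(ds.variables["var1"][:])    # unwritten entries read back as masked
ds.close()
```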
@@ -111,13 +114,13 @@ def test_double_array(self) :

def test_dimensions(self) :
self.assertTrue(len(self.dataset.dimensions) == 1)
self.assertTrue(self.dataset.dimensions.keys()[0] == "dim1")
self.assertTrue('dim1' in self.dataset.dimensions.keys())
Owner comment: Yep, use of 'in' is the better idiom. Plus, the keys() method is redundant in both the original and new statements.

dim = self.dataset.dimensions['dim1']
self.assertTrue(len(dim) == 3)
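
As a follow-up to the comment above, the membership test works directly on the mapping, so `.keys()` can be dropped as well (standalone sketch):

```python
dimensions = {"dim1": 3}                 # stand-in for dataset.dimensions
assert "dim1" in dimensions              # preferred idiom
assert "dim1" in dimensions.keys()       # also works, but .keys() adds nothing
```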

def test_variables(self) :
self.assertTrue(len(self.dataset.variables) == 1)
self.assertTrue(self.dataset.variables.keys()[0] == "var1")
self.assertTrue("var1" in self.dataset.variables.keys())
Owner comment: Yep, use of 'in' is the better idiom. Plus, the keys() method is redundant in both the original and new statements.

var = self.dataset.variables['var1']
self.assertTrue(var.att1 == "dummy attribute")
data = var[:]