271 corrupt cdf generation from imap code (#287)

bryan-harter · web-flow · commit 12288f88346b · 2025-01-13T12:56:12.000-07:00
* Fixing a bug that would allow corrupt CDF files to be generated

* Converting strings to numbers if necessary

* Making older versions of numpy append strings easier
diff --git a/cdflib/cdfwrite.py b/cdflib/cdfwrite.py
@@ -870,30 +870,33 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
                 if items == 2:
                     dataType = self._datatype_token(entry[1])
 
+            # Handle user setting datatype
             if dataType > 0:
                 # CDF data type defined in entry
                 data = entry[0]
                 if self._checklistofNums(data):
-                    # All are numbers
+                    # Data needs no pre-processing and is good to go
                     if hasattr(data, "__len__") and not isinstance(data, str):
                         numElems = len(data)
                     else:
                         numElems = 1
                 else:
-                    # Then string(s) -- either in CDF_type or epoch in string(s)
+                    # Data needs some sort of pre-processing to proceed
                     if dataType == self.CDF_CHAR or dataType == self.CDF_UCHAR:
                         if hasattr(data, "__len__") and not isinstance(data, str):
+                            # Reformat strings
                             items = len(data)
                             odata = data
                             data = ""
                             for x in range(0, items):
                                 if x > 0:
                                     data += "\\N "
-                                    data += odata[x]
+                                    data += str(odata[x])
                                 else:
-                                    data = odata[x]
+                                    data = str(odata[x])
                         numElems = len(data)
                     elif dataType == self.CDF_EPOCH or dataType == self.CDF_EPOCH16 or dataType == self.CDF_TIME_TT2000:
+                        # Convert data to CDF time
                         cvalue = []
                         if hasattr(data, "__len__") and not isinstance(data, str):
                             numElems = len(data)
@@ -903,7 +906,22 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
                         else:
                             data = cdfepoch.CDFepoch.parse(data)
                             numElems = 1
-            else:
+                    elif isinstance(data, str):
+                        # One possibility is that the user wants to convert a string to a number
+                        numElems = 1
+                        data = np.array(float(data))
+                    else:
+                        # Last resort: assume data is a sequence of numeric strings and convert every item to a float
+                        try:
+                            numElems = 1
+                            data = np.array([float(item) for item in data])
+                        except Exception:
+                            logger.warning(
+                                f"Cannot determine how to convert {str(data)} to specified type of {dataType}. Ignoring the specified datatype, and continuing."
+                            )
+                            dataType = 0  # only on failure: drop the requested type so the untyped handling below takes over
+
+            if dataType == 0:
                 # No data type defined...
                 data = entry
                 if hasattr(data, "__len__") and not isinstance(data, str):
@@ -913,9 +931,9 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
                         for x in range(0, len(entry)):
                             if x > 0:
                                 data += "\\N "
-                                data += entry[x]
+                                data += str(entry[x])
                             else:
-                                data = entry[x]
+                                data = str(entry[x])
                     numElems = len(data)
                 else:
                     numElems, dataType = self._datatype_define(entry)
@@ -1750,7 +1768,7 @@ def _write_aedr(
             value_size = 1
             cdata = "\x00".encode()
         else:
-            value_size = len(cdata)
+            value_size = recs * self._datatype_size(dataType, numElems)
         block_size = value_size + 56
         aedr = bytearray(block_size)
         aedr[0:8] = struct.pack(">q", block_size)
@@ -2304,6 +2322,17 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
                         odata += adata.ljust(num_elems, "\x00")
                 recs = int((size * size2) / num_values)
                 return recs, odata.encode()
+            elif all(isinstance(item, str) for item in indata):
+                # Every item is a string: attempt to convert them to a numpy array of floats
+                try:
+                    return self._numpy_to_bytes(data_type, num_values, num_elems, np.array([float(item) for item in indata]))
+                except Exception:
+                    # Conversion failed: do the best we can and create bytes from the joined, NUL-padded string.
+                    # It will probably come out to be gibberish
+                    outdata = ("".join(indata)).ljust(num_elems, "\x00").encode()
+                    recs = int(len(outdata) / recSize)
+                    return recs, outdata
+
             else:
                 try:
                     return self._numpy_to_bytes(data_type, num_values, num_elems, np.array(indata))
@@ -2366,8 +2395,19 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
                 return recs, odata.encode()
             else:
                 return self._numpy_to_bytes(data_type, num_values, num_elems, indata)
-        elif isinstance(indata, str):
+        elif isinstance(indata, str) and (data_type == self.CDF_CHAR or data_type == self.CDF_UCHAR):
+            # Character types: just convert the (NUL-padded) string directly to bytes
+            return 1, indata.ljust(num_elems, "\x00").encode()
+        elif isinstance(indata, str) and data_type not in (self.CDF_CHAR, self.CDF_UCHAR):
+            # Non-character type: try to convert the single string to a numerical type.
+            try:
+                return self._numpy_to_bytes(data_type, num_values, num_elems, np.array([float(indata)]))
+            except Exception:
+                # Conversion failed: do the best we can and create bytes from the NUL-padded string.
+                # It will probably come out to be gibberish
+                outdata = indata.ljust(num_elems, "\x00").encode()
+                recs = int(len(outdata) / recSize)
+                return recs, outdata
         else:
             try:
                 # Try converting the data to numpy
@@ -2398,7 +2438,7 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
                     else:
                         return recs, struct.pack(form, indata)
                 except struct.error:
-                    raise ValueError("Unable to convert data to CDF format, data " "object cannot be of type string.")
+                    raise ValueError("Unable to convert data to CDF format, data object cannot be of type string.")
 
     def _num_values(self, zVar: bool, varNum: int) -> int:
         """
diff --git a/tests/test_cdfwrite.py b/tests/test_cdfwrite.py
@@ -5,6 +5,7 @@
 import pytest
 
 from cdflib import cdfread, cdfwrite
+from cdflib.xarray import cdf_to_xarray
 
 R = Path(__file__).parent
 fnbasic = "testing.cdf"
@@ -639,3 +640,45 @@ def test_convert_data_error(tmp_path):
     with pytest.raises(ValueError):
         # Data from list of strings with dimension "epoch"
         cdf._convert_data(51, 1, 1, indata)
+
+
+def test_string_input_but_number_type(tmp_path):
+    # Regression test: this small example used to create a corrupted CDF file,
+    # because FILLVAL was given as a single string while its declared type is "CDF_DOUBLE".
+    cdf = cdfwrite.CDF(tmp_path / "test.cdf")
+    var_data = np.random.rand(5, 3) * 30
+    var_spec = {
+        "Variable": "temperature",
+        "Data_Type": 45,
+        "Num_Elements": 1,
+        "Rec_Vary": False,
+        "Dim_Sizes": [5, 3],
+        "Compress": 0,
+    }
+    var_att_dict = {"FILLVAL": [np.str_("12"), "CDF_DOUBLE"]}
+    cdf.write_var(var_spec, var_attrs=var_att_dict, var_data=var_data)
+    cdf.close()
+
+    # Reading the file back in used to raise an error; the test passes if this call completes
+    cdf_to_xarray(tmp_path / "test.cdf")
+
+
+def test_array_string_input_but_number_type(tmp_path):
+    # Regression test: this small example used to create a corrupted CDF file,
+    # because FILLVAL was given as an array of strings while its declared type is "CDF_DOUBLE".
+    cdf = cdfwrite.CDF(tmp_path / "test.cdf")
+    var_data = np.random.rand(5, 3) * 30
+    var_spec = {
+        "Variable": "temperature",
+        "Data_Type": 45,
+        "Num_Elements": 1,
+        "Rec_Vary": False,
+        "Dim_Sizes": [5, 3],
+        "Compress": 0,
+    }
+    var_att_dict = {"FILLVAL": [np.array([np.str_("12"), np.str_("13")]), "CDF_DOUBLE"]}
+    cdf.write_var(var_spec, var_attrs=var_att_dict, var_data=var_data)
+    cdf.close()
+
+    # Reading the file back in used to raise an error; the test passes if this call completes
+    cdf_to_xarray(tmp_path / "test.cdf")