Skip to content

Commit 12288f8

Browse files
authored
271 corrupt cdf generation from imap code (#287)
* Fixing a bug that would allow corrupt CDF files to be generated * Converting strings to numbers if necessary * Making older versions of numpy append strings easier
1 parent 5d8ae33 commit 12288f8

2 files changed

Lines changed: 93 additions & 10 deletions

File tree

cdflib/cdfwrite.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -870,30 +870,33 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
870870
if items == 2:
871871
dataType = self._datatype_token(entry[1])
872872

873+
# Handle user setting datatype
873874
if dataType > 0:
874875
# CDF data type defined in entry
875876
data = entry[0]
876877
if self._checklistofNums(data):
877-
# All are numbers
878+
# Data needs no pre-processing and is good to go
878879
if hasattr(data, "__len__") and not isinstance(data, str):
879880
numElems = len(data)
880881
else:
881882
numElems = 1
882883
else:
883-
# Then string(s) -- either in CDF_type or epoch in string(s)
884+
# Data needs some sort of pre-processing to proceed
884885
if dataType == self.CDF_CHAR or dataType == self.CDF_UCHAR:
885886
if hasattr(data, "__len__") and not isinstance(data, str):
887+
# Reformat strings
886888
items = len(data)
887889
odata = data
888890
data = ""
889891
for x in range(0, items):
890892
if x > 0:
891893
data += "\\N "
892-
data += odata[x]
894+
data += str(odata[x])
893895
else:
894-
data = odata[x]
896+
data = str(odata[x])
895897
numElems = len(data)
896898
elif dataType == self.CDF_EPOCH or dataType == self.CDF_EPOCH16 or dataType == self.CDF_TIME_TT2000:
899+
# Convert data to CDF time
897900
cvalue = []
898901
if hasattr(data, "__len__") and not isinstance(data, str):
899902
numElems = len(data)
@@ -903,7 +906,22 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
903906
else:
904907
data = cdfepoch.CDFepoch.parse(data)
905908
numElems = 1
906-
else:
909+
elif isinstance(data, str):
910+
# One possibility is that the user wants to convert a string to a number
911+
numElems = 1
912+
data = np.array(float(data))
913+
else:
914+
# The final possibility I can think of is that the user wants to convert a list of strings to a list of numbers
915+
try:
916+
numElems = 1
917+
data = np.array([float(item) for item in data])
918+
except:
919+
logger.warning(
920+
f"Cannot determine how to convert {str(data)} to specified type of {dataType}. Ignoring the specified datatype, and continuing."
921+
)
922+
dataType = 0
923+
924+
if dataType == 0:
907925
# No data type defined...
908926
data = entry
909927
if hasattr(data, "__len__") and not isinstance(data, str):
@@ -913,9 +931,9 @@ def _write_var_attrs(self, f: io.BufferedWriter, varNum: int, var_attrs: Dict[st
913931
for x in range(0, len(entry)):
914932
if x > 0:
915933
data += "\\N "
916-
data += entry[x]
934+
data += str(entry[x])
917935
else:
918-
data = entry[x]
936+
data = str(entry[x])
919937
numElems = len(data)
920938
else:
921939
numElems, dataType = self._datatype_define(entry)
@@ -1750,7 +1768,7 @@ def _write_aedr(
17501768
value_size = 1
17511769
cdata = "\x00".encode()
17521770
else:
1753-
value_size = len(cdata)
1771+
value_size = recs * self._datatype_size(dataType, numElems)
17541772
block_size = value_size + 56
17551773
aedr = bytearray(block_size)
17561774
aedr[0:8] = struct.pack(">q", block_size)
@@ -2304,6 +2322,17 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
23042322
odata += adata.ljust(num_elems, "\x00")
23052323
recs = int((size * size2) / num_values)
23062324
return recs, odata.encode()
2325+
elif all(isinstance(item, str) for item in indata):
2326+
# Attempt to convert to a numpy array of numbers
2327+
try:
2328+
return self._numpy_to_bytes(data_type, num_values, num_elems, np.array([float(item) for item in indata]))
2329+
except:
2330+
# Do the best we can, create bytes from the string.
2331+
# It will probably come out to be gibberish
2332+
outdata = ("".join(indata)).ljust(num_elems, "\x00").encode()
2333+
recs = int(len(outdata) / recSize)
2334+
return recs, outdata
2335+
23072336
else:
23082337
try:
23092338
return self._numpy_to_bytes(data_type, num_values, num_elems, np.array(indata))
@@ -2366,8 +2395,19 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
23662395
return recs, odata.encode()
23672396
else:
23682397
return self._numpy_to_bytes(data_type, num_values, num_elems, indata)
2369-
elif isinstance(indata, str):
2398+
elif isinstance(indata, str) and (data_type == self.CDF_CHAR or data_type == self.CDF_UCHAR):
2399+
# Just convert the string directly to bytes
23702400
return 1, indata.ljust(num_elems, "\x00").encode()
2401+
elif isinstance(indata, str) and data_type != self.CDF_CHAR and data_type == self.CDF_UCHAR:
2402+
# Try to convert the single string to a numerical type.
2403+
try:
2404+
return self._numpy_to_bytes(data_type, num_values, num_elems, np.array([float(indata)]))
2405+
except:
2406+
# Do the best we can, create bytes from the string.
2407+
# It will probably come out to be gibberish
2408+
outdata = indata.ljust(num_elems, "\x00").encode()
2409+
recs = int(len(outdata) / recSize)
2410+
return recs, outdata
23712411
else:
23722412
try:
23732413
# Try converting the data to numpy
@@ -2398,7 +2438,7 @@ def _convert_data(self, data_type: int, num_elems: int, num_values: int, indata:
23982438
else:
23992439
return recs, struct.pack(form, indata)
24002440
except struct.error:
2401-
raise ValueError("Unable to convert data to CDF format, data " "object cannot be of type string.")
2441+
raise ValueError("Unable to convert data to CDF format, data object cannot be of type string.")
24022442

24032443
def _num_values(self, zVar: bool, varNum: int) -> int:
24042444
"""

tests/test_cdfwrite.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import pytest
66

77
from cdflib import cdfread, cdfwrite
8+
from cdflib.xarray import cdf_to_xarray
89

910
R = Path(__file__).parent
1011
fnbasic = "testing.cdf"
@@ -639,3 +640,45 @@ def test_convert_data_error(tmp_path):
639640
with pytest.raises(ValueError):
640641
# Data from list of strings with dimension "epoch"
641642
cdf._convert_data(51, 1, 1, indata)
643+
644+
645+
def test_string_input_but_number_type(tmp_path):
646+
# This small example used to create a corrupted CDF file.
647+
# The FILLVAL was input as a string, even though it was declared to be a double
648+
cdf = cdfwrite.CDF(tmp_path / "test.cdf")
649+
var_data = np.random.rand(5, 3) * 30
650+
var_spec = {
651+
"Variable": "temperature",
652+
"Data_Type": 45,
653+
"Num_Elements": 1,
654+
"Rec_Vary": False,
655+
"Dim_Sizes": [5, 3],
656+
"Compress": 0,
657+
}
658+
var_att_dict = {"FILLVAL": [np.str_("12"), "CDF_DOUBLE"]}
659+
cdf.write_var(var_spec, var_attrs=var_att_dict, var_data=var_data)
660+
cdf.close()
661+
662+
# Reading it back in used to cause an error
663+
cdf_to_xarray(tmp_path / "test.cdf")
664+
665+
666+
def test_array_string_input_but_number_type(tmp_path):
667+
# This small example used to create a corrupted CDF file.
668+
# The FILLVAL was input as an array of strings, even though it was declared to be a double
669+
cdf = cdfwrite.CDF(tmp_path / "test.cdf")
670+
var_data = np.random.rand(5, 3) * 30
671+
var_spec = {
672+
"Variable": "temperature",
673+
"Data_Type": 45,
674+
"Num_Elements": 1,
675+
"Rec_Vary": False,
676+
"Dim_Sizes": [5, 3],
677+
"Compress": 0,
678+
}
679+
var_att_dict = {"FILLVAL": [np.array([np.str_("12"), np.str_("13")]), "CDF_DOUBLE"]}
680+
cdf.write_var(var_spec, var_attrs=var_att_dict, var_data=var_data)
681+
cdf.close()
682+
683+
# Reading it back in used to cause an error
684+
cdf_to_xarray(tmp_path / "test.cdf")

0 commit comments

Comments
 (0)