Skip to content

Commit 5af55e0

Browse files
authored
BUG: Missing value code not recognised for Stata format version 105 and earlier (#59325)
* BUG: Missing value code not recognised for Stata format version 105 and earlier
* Move definition of the old missing value constant for the double type out of the loop
1 parent 0e0814b commit 5af55e0

File tree

11 files changed

+38
-11
lines changed

11 files changed

+38
-11
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,7 @@ I/O
584584
- Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`)
585585
- Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`)
586586
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
587+
- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
587588

588589
Period
589590
^^^^^^

pandas/io/stata.py

+9
Original file line numberDiff line numberDiff line change
@@ -1817,10 +1817,19 @@ def read(
18171817
return data
18181818

18191819
def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
1820+
# missing code for double was different in version 105 and prior
1821+
old_missingdouble = float.fromhex("0x1.0p333")
1822+
18201823
# Check for missing values, and replace if found
18211824
replacements = {}
18221825
for i in range(len(data.columns)):
18231826
fmt = self._typlist[i]
1827+
# recode instances of the old missing code to the currently used value
1828+
if self._format_version <= 105 and fmt == "d":
1829+
data.iloc[:, i] = data.iloc[:, i].replace(
1830+
old_missingdouble, self.MISSING_VALUES["d"]
1831+
)
1832+
18241833
if self._format_version <= 111:
18251834
if fmt not in self.OLD_VALID_RANGE:
18261835
continue
362 Bytes
Binary file not shown.
364 Bytes
Binary file not shown.
363 Bytes
Binary file not shown.
409 Bytes
Binary file not shown.
362 Bytes
Binary file not shown.
364 Bytes
Binary file not shown.
363 Bytes
Binary file not shown.
409 Bytes
Binary file not shown.

pandas/tests/io/test_stata.py

+28-11
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,9 @@ def test_read_index_col_none(self, version, temp_file):
120120
expected["a"] = expected["a"].astype(np.int32)
121121
tm.assert_frame_equal(read_df, expected, check_index_type=True)
122122

123-
# Note this test starts at format version 108 as the missing code for double
124-
# was different prior to this (see GH 58149) and would therefore fail
125-
@pytest.mark.parametrize("version", [108, 110, 111, 113, 114, 115, 117, 118, 119])
123+
@pytest.mark.parametrize(
124+
"version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119]
125+
)
126126
def test_read_dta1(self, version, datapath):
127127
file = datapath("io", "data", "stata", f"stata1_{version}.dta")
128128
parsed = self.read_dta(file)
@@ -918,8 +918,8 @@ def test_missing_value_generator(self, temp_file):
918918
)
919919
assert val.string == ".z"
920920

921-
@pytest.mark.parametrize("file", ["stata8_113", "stata8_115", "stata8_117"])
922-
def test_missing_value_conversion(self, file, datapath):
921+
@pytest.mark.parametrize("version", [113, 115, 117])
922+
def test_missing_value_conversion(self, version, datapath):
923923
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
924924
smv = StataMissingValue(101)
925925
keys = sorted(smv.MISSING_VALUES.keys())
@@ -930,14 +930,13 @@ def test_missing_value_conversion(self, file, datapath):
930930
expected = DataFrame(data, columns=columns)
931931

932932
parsed = read_stata(
933-
datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
933+
datapath("io", "data", "stata", f"stata8_{version}.dta"),
934+
convert_missing=True,
934935
)
935936
tm.assert_frame_equal(parsed, expected)
936937

937-
# Note this test starts at format version 108 as the missing code for double
938-
# was different prior to this (see GH 58149) and would therefore fail
939-
@pytest.mark.parametrize("file", ["stata8_108", "stata8_110", "stata8_111"])
940-
def test_missing_value_conversion_compat(self, file, datapath):
938+
@pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
939+
def test_missing_value_conversion_compat(self, version, datapath):
941940
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
942941
smv = StataMissingValue(101)
943942
keys = sorted(smv.MISSING_VALUES.keys())
@@ -947,7 +946,25 @@ def test_missing_value_conversion_compat(self, file, datapath):
947946
expected = DataFrame(data, columns=columns)
948947

949948
parsed = read_stata(
950-
datapath("io", "data", "stata", f"{file}.dta"), convert_missing=True
949+
datapath("io", "data", "stata", f"stata8_{version}.dta"),
950+
convert_missing=True,
951+
)
952+
tm.assert_frame_equal(parsed, expected)
953+
954+
# The byte type was not supported prior to the 104 format
955+
@pytest.mark.parametrize("version", [102, 103])
956+
def test_missing_value_conversion_compat_nobyte(self, version, datapath):
957+
columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
958+
smv = StataMissingValue(101)
959+
keys = sorted(smv.MISSING_VALUES.keys())
960+
data = []
961+
row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]]
962+
data.append(row)
963+
expected = DataFrame(data, columns=columns)
964+
965+
parsed = read_stata(
966+
datapath("io", "data", "stata", f"stata8_{version}.dta"),
967+
convert_missing=True,
951968
)
952969
tm.assert_frame_equal(parsed, expected)
953970

0 commit comments

Comments
 (0)