From 4e7dd180455e58149fbc1ef65a4f815db9e6a5a4 Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Sat, 20 Sep 2025 21:54:30 +0700 Subject: [PATCH 1/4] Add test --- .../tests/io/parser/common/test_chunksize.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 75ec96409bdd0..b29a4c9ce5688 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -381,3 +381,24 @@ def test_chunksize_second_block_shorter(all_parsers): for i, result in enumerate(result_chunks): tm.assert_frame_equal(result, expected_frames[i]) + + +def test_chunksize_skip_bad_line_with_bad_line_first_in_the_chunk(all_parsers): + parser = all_parsers + data = "a,b\n1,2\n3\n4,5,extra\n6,7" + + if parser.engine == "pyarrow": + msg = "The 'chunksize' option is not supported with the 'pyarrow' engine" + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=2, on_bad_lines="skip") + return + + result_chunks = parser.read_csv(StringIO(data), chunksize=2, on_bad_lines="skip") + + expected_frames = [ + DataFrame({"a": [1, 3], "b": [2, np.nan]}), + DataFrame({"a": [6], "b": [7]}, index=[2]), + ] + + for i, result in enumerate(result_chunks): + tm.assert_frame_equal(result, expected_frames[i]) From 6247580616a58f92795d05f19d9c3f835ca98bb4 Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Sat, 20 Sep 2025 21:56:18 +0700 Subject: [PATCH 2/4] Add github issue --- pandas/tests/io/parser/common/test_chunksize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index b29a4c9ce5688..045444cc2ca85 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -384,6 +384,7 @@ def test_chunksize_second_block_shorter(all_parsers): def 
test_chunksize_skip_bad_line_with_bad_line_first_in_the_chunk(all_parsers): + # GH#61973 parser = all_parsers data = "a,b\n1,2\n3\n4,5,extra\n6,7" From 396e6decea7eaf2b99634a79a16c36692323e034 Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Sat, 20 Sep 2025 21:57:44 +0700 Subject: [PATCH 3/4] Fix. Add the failing case --- pandas/_libs/src/parser/tokenizer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 61e96fc835e4d..ebd427e5d20c3 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -413,8 +413,10 @@ static int end_line(parser_t *self) { return 0; } - if (!(self->lines <= self->header_end + 1) && (fields > ex_fields) && - !(self->usecols)) { + bool past_headers = (self->lines > self->header_end + 1) || + (self->lines == 1 && self->header_end == 0); + + if (past_headers && (fields > ex_fields) && !(self->usecols)) { // increment file line count self->file_lines++; From 16aaa1a305cc7d421310fc9aa5d1b23af9492ffe Mon Sep 17 00:00:00 2001 From: Quang Nguyen Date: Sat, 20 Sep 2025 22:03:29 +0700 Subject: [PATCH 4/4] add whatsnew entry --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 45bf83c3c030d..b1297e9b3fa77 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1021,6 +1021,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. 
(:issue:`59303`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) +- Bug in :meth:`read_csv` with ``on_bad_lines="skip"`` and ``chunksize`` where the bad line would not be skipped if it was the first line in the chunk (:issue:`61973`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)