From f9550c3f7207f946a45bbccd1814b12b136aae72 Mon Sep 17 00:00:00 2001 From: Paul Wankadia Date: Mon, 19 Feb 2024 16:23:29 +0000 Subject: [PATCH] Fix more ancient bugs around Latin-1 handling. It turned out that case folding assumed UTF-8 mode, so we would fold, say, 0xD1 to 0xF1 even in Latin-1 mode. Fixes #477. Change-Id: I73aa5c8e33ee0c6041c54e3a7268635915960f64 Reviewed-on: https://code-review.googlesource.com/c/re2/+/62714 Reviewed-by: Alex Chernyakhovsky Reviewed-by: Paul Wankadia --- re2/parse.cc | 60 ++++++++++++++++++++++++++++----------- re2/testing/dump.cc | 20 +++++++++---- re2/testing/parse_test.cc | 25 +++++++++++++++- 3 files changed, 82 insertions(+), 23 deletions(-) diff --git a/re2/parse.cc b/re2/parse.cc index a02791747..2558b2a2e 100644 --- a/re2/parse.cc +++ b/re2/parse.cc @@ -337,6 +337,20 @@ Rune CycleFoldRune(Rune r) { return ApplyFold(f, r); } +// Add lo-hi to the class, along with their fold-equivalent characters. +static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) { + while (lo <= hi) { + cc->AddRange(lo, lo); + if ('A' <= lo && lo <= 'Z') { + cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a'); + } + if ('a' <= lo && lo <= 'z') { + cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A'); + } + lo++; + } +} + // Add lo-hi to the class, along with their fold-equivalent characters. // If lo-hi is already in the class, assume that the fold-equivalent // chars are there too, so there's no work to do. @@ -394,17 +408,26 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { // Pushes the literal rune r onto the stack. bool Regexp::ParseState::PushLiteral(Rune r) { // Do case folding if needed. - if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { - Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); - re->ccb_ = new CharClassBuilder; - Rune r1 = r; - do { - if (!(flags_ & NeverNL) || r != '\n') { - re->ccb_->AddRange(r, r); - } - r = CycleFoldRune(r); - } while (r != r1); - return PushRegexp(re); + if (flags_ & FoldCase) { + if (flags_ & Latin1 && (('A' <= r && r <= 'Z') || + ('a' <= r && r <= 'z'))) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + AddFoldedRangeLatin1(re->ccb_, r, r); + return PushRegexp(re); + } + if (!(flags_ & Latin1) && CycleFoldRune(r) != r) { + Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); + re->ccb_ = new CharClassBuilder; + Rune r1 = r; + do { + if (!(flags_ & NeverNL) || r != '\n') { + re->ccb_->AddRange(r, r); + } + r = CycleFoldRune(r); + } while (r != r1); + return PushRegexp(re); + } } // Exclude newline if applicable. @@ -1176,7 +1199,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, if (re->op() == kRegexpCharClass) { CharClass* cc = re->cc(); for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) - ccb.AddRange(it->lo, it->hi); + ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags()); } else if (re->op() == kRegexpLiteral) { if (re->parse_flags() & Regexp::FoldCase) { // AddFoldedRange() can terminate prematurely if the character class @@ -1195,7 +1218,7 @@ void FactorAlternationImpl::Round3(Regexp** sub, int nsub, } re->Decref(); } - Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase); splices->emplace_back(re, sub + start, i - start); } @@ -1623,10 +1646,15 @@ void CharClassBuilder::AddRangeFlags( } // If folding case, add fold-equivalent characters too. - if (parse_flags & Regexp::FoldCase) - AddFoldedRange(this, lo, hi, 0); - else + if (parse_flags & Regexp::FoldCase) { + if (parse_flags & Regexp::Latin1) { + AddFoldedRangeLatin1(this, lo, hi); + } else { + AddFoldedRange(this, lo, hi, 0); + } + } else { AddRange(lo, hi); + } } // Look for a group with the given name. diff --git a/re2/testing/dump.cc b/re2/testing/dump.cc index 5cddd2334..9e3c94a69 100644 --- a/re2/testing/dump.cc +++ b/re2/testing/dump.cc @@ -96,17 +96,25 @@ static void DumpRegexpAppending(Regexp* re, std::string* s) { break; case kRegexpLiteral: { Rune r = re->rune(); - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } break; } case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) { Rune r = re->runes()[i]; - char buf[UTFmax+1]; - buf[runetochar(buf, &r)] = 0; - s->append(buf); + if (re->parse_flags() & Regexp::Latin1) { + s->push_back(r); + } else { + char buf[UTFmax+1]; + buf[runetochar(buf, &r)] = 0; + s->append(buf); + } } break; case kRegexpConcat: diff --git a/re2/testing/parse_test.cc b/re2/testing/parse_test.cc index 7684b62a4..95294d5ff 100644 --- a/re2/testing/parse_test.cc +++ b/re2/testing/parse_test.cc @@ -225,6 +225,29 @@ static Test tests[] = { // Bug in Regexp::ToString() that emitted [^], which // would (obviously) fail to parse when fed back in. { "[\\s\\S]", "cc{0-0x10ffff}" }, + + // As per https://github.com/google/re2/issues/477, + // there were long-standing bugs involving Latin-1. + // Here, we exercise it WITHOUT case folding... + { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 }, + { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 }, + { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 }, + // Here, we exercise it WITH case folding... + // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2 + // should fold to 0xF1 and 0xF2, respectively. + { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, + { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase }, }; bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) { @@ -492,7 +515,7 @@ TEST(TestToString, EquivalentParse) { // << " t=" << t << " regexp=" << tests[i].regexp; // Test that if we parse the new regexp we get the same structure. - Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status); + Regexp* nre = Regexp::Parse(t, f, &status); ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text(); std::string ss = nre->Dump(); std::string tt = nre->ToString();