From 95a84ad6559f4180039dac0dc07dcd1b4d7756ee Mon Sep 17 00:00:00 2001 From: Taus Date: Fri, 19 Sep 2025 15:06:46 +0000 Subject: [PATCH 1/2] Python: Fix false positive for unmatchable dollar/caret Our previous modelling did not account for the fact that a lookahead can potentially extend all the way to the end of the input (and similarly, that a lookbehind can extend all the way to the beginning). To fix this, I extended `firstPart` and `lastPart` to handle lookbehinds and lookaheads correctly, and added some test cases (all of which yield no new results). Fixes #20429. --- .../semmle/python/regexp/RegexTreeView.qll | 8 +-- .../python/regexp/internal/ParseRegExp.qll | 70 ++++++++++++------- ...atchable-dollar-and-caret-in-assertions.md | 5 ++ .../query-tests/Expressions/Regex/test.py | 10 ++- 4 files changed, 64 insertions(+), 29 deletions(-) create mode 100644 python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md diff --git a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll index a2952a5680bc..897c97bb7831 100644 --- a/python/ql/lib/semmle/python/regexp/RegexTreeView.qll +++ b/python/ql/lib/semmle/python/regexp/RegexTreeView.qll @@ -964,7 +964,7 @@ module Impl implements RegexTreeViewSig { * ``` */ class RegExpPositiveLookahead extends RegExpLookahead { - RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end) } + RegExpPositiveLookahead() { re.positiveLookaheadAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpPositiveLookahead" } } @@ -979,7 +979,7 @@ module Impl implements RegexTreeViewSig { * ``` */ additional class RegExpNegativeLookahead extends RegExpLookahead { - RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end) } + RegExpNegativeLookahead() { re.negativeLookaheadAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpNegativeLookahead" } } @@ -1006,7 +1006,7 @@ module Impl implements RegexTreeViewSig { * ``` */ class RegExpPositiveLookbehind extends RegExpLookbehind { - RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end) } + RegExpPositiveLookbehind() { re.positiveLookbehindAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpPositiveLookbehind" } } @@ -1021,7 +1021,7 @@ module Impl implements RegexTreeViewSig { * ``` */ additional class RegExpNegativeLookbehind extends RegExpLookbehind { - RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end) } + RegExpNegativeLookbehind() { re.negativeLookbehindAssertionGroup(start, end, _, _) } override string getPrimaryQLClass() { result = "RegExpNegativeLookbehind" } } diff --git a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll index 7e23554e0589..d91c4bbd78c0 100644 --- a/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll +++ b/python/ql/lib/semmle/python/regexp/internal/ParseRegExp.qll @@ -554,9 +554,9 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookaheadAssertionGroup(start, end) + this.positiveLookaheadAssertionGroup(start, end, _, _) or - this.positiveLookbehindAssertionGroup(start, end) + this.positiveLookbehindAssertionGroup(start, end, _, _) } /** Holds if an empty group is found between `start` and `end`. */ @@ -572,7 +572,7 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookaheadAssertionGroup(start, end) + this.positiveLookaheadAssertionGroup(start, end, _, _) } private predicate emptyMatchAtEndGroup(int start, int end) { @@ -580,7 +580,7 @@ class RegExp extends Expr instanceof StringLiteral { or this.negativeAssertionGroup(start, end) or - this.positiveLookbehindAssertionGroup(start, end) + this.positiveLookbehindAssertionGroup(start, end, _, _) } private predicate negativeAssertionGroup(int start, int end) { @@ -593,32 +593,40 @@ class RegExp extends Expr instanceof StringLiteral { ) } - /** Holds if a negative lookahead is found between `start` and `end` */ - predicate negativeLookaheadAssertionGroup(int start, int end) { - exists(int in_start | this.negative_lookahead_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a negative lookahead is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate negativeLookaheadAssertionGroup(int start, int end, int in_start, int in_end) { + this.negative_lookahead_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a negative lookbehind is found between `start` and `end` */ - predicate negativeLookbehindAssertionGroup(int start, int end) { - exists(int in_start | this.negative_lookbehind_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a negative lookbehind is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate negativeLookbehindAssertionGroup(int start, int end, int in_start, int in_end) { + this.negative_lookbehind_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a positive lookahead is found between `start` and `end` */ - predicate positiveLookaheadAssertionGroup(int start, int end) { - exists(int in_start | this.lookahead_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a positive lookahead is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate positiveLookaheadAssertionGroup(int start, int end, int in_start, int in_end) { + this.lookahead_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } - /** Holds if a positive lookbehind is found between `start` and `end` */ - predicate positiveLookbehindAssertionGroup(int start, int end) { - exists(int in_start | this.lookbehind_assertion_start(start, in_start) | - this.groupContents(start, end, in_start, _) - ) + /** + * Holds if a positive lookbehind is found between `start` and `end`, with contents + * between `in_start` and `in_end`. + */ + predicate positiveLookbehindAssertionGroup(int start, int end, int in_start, int in_end) { + this.lookbehind_assertion_start(start, in_start) and + this.groupContents(start, end, in_start, in_end) } private predicate group_start(int start, int end) { @@ -1049,6 +1057,13 @@ class RegExp extends Expr instanceof StringLiteral { or this.alternationOption(x, y, start, end) ) + or + // Lookbehind assertions can potentially match the start of the string + ( + this.positiveLookbehindAssertionGroup(_, _, start, _) or + this.negativeLookbehindAssertionGroup(_, _, start, _) + ) and + this.item(start, end) } /** A part of the regex that may match the end of the string. */ @@ -1074,6 +1089,13 @@ class RegExp extends Expr instanceof StringLiteral { or this.alternationOption(x, y, start, end) ) + or + // Lookahead assertions can potentially match the end of the string + ( + this.positiveLookaheadAssertionGroup(_, _, _, end) or + this.negativeLookaheadAssertionGroup(_, _, _, end) + ) and + this.item(start, end) } /** diff --git a/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md b/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md new file mode 100644 index 000000000000..cf63dd9ed4da --- /dev/null +++ b/python/ql/src/change-notes/2025-09-19-fix-unmatchable-dollar-and-caret-in-assertions.md @@ -0,0 +1,5 @@ +--- +category: minorAnalysis +--- + +- The queries that check for unmatchable `$` and `^` in regular expressions did not account correctly for occurrences inside lookahead and lookbehind assertions. These occurrences are now handled correctly, eliminating this source of false positives. diff --git a/python/ql/test/query-tests/Expressions/Regex/test.py b/python/ql/test/query-tests/Expressions/Regex/test.py index 6dadbccb4b65..717663e335c5 100644 --- a/python/ql/test/query-tests/Expressions/Regex/test.py +++ b/python/ql/test/query-tests/Expressions/Regex/test.py @@ -150,4 +150,12 @@ re.compile(r"[\u0000-\uFFFF]") #Allow unicode names -re.compile(r"[\N{degree sign}\N{EM DASH}]") \ No newline at end of file +re.compile(r"[\N{degree sign}\N{EM DASH}]") + +#Lookahead assertions. None of these are unmatchable dollars: +re.compile(r"^(?=a$)[ab]") +re.compile(r"^(?!a$)[ab]") + +#Lookbehind assertions. None of these are unmatchable carets: +re.compile(r"(?<=^a)a") +re.compile(r"(? Date: Fri, 19 Sep 2025 15:39:12 +0000 Subject: [PATCH 2/2] Python: Update test output --- python/ql/test/library-tests/regex/FirstLast.expected | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/ql/test/library-tests/regex/FirstLast.expected b/python/ql/test/library-tests/regex/FirstLast.expected index b187033ee22d..0abf9c790c24 100644 --- a/python/ql/test/library-tests/regex/FirstLast.expected +++ b/python/ql/test/library-tests/regex/FirstLast.expected @@ -4,6 +4,7 @@ | (?!not-this)^[A-Z_]+$ | first | 12 | 13 | | (?!not-this)^[A-Z_]+$ | first | 13 | 19 | | (?!not-this)^[A-Z_]+$ | first | 13 | 20 | +| (?!not-this)^[A-Z_]+$ | last | 3 | 11 | | (?!not-this)^[A-Z_]+$ | last | 13 | 19 | | (?!not-this)^[A-Z_]+$ | last | 13 | 20 | | (?!not-this)^[A-Z_]+$ | last | 20 | 21 | @@ -101,6 +102,7 @@ | ^[A-Z_]+$(?