diff --git a/docs/solr-escaping-definition.md b/docs/solr-escaping-definition.md new file mode 100644 index 00000000..02527ac8 --- /dev/null +++ b/docs/solr-escaping-definition.md @@ -0,0 +1,164 @@ +# Solr Escaping Definition + +## Purpose + +This document defines where, when, and how escaping must occur when building Solr/Lucene queries. +Follow this document to avoid producing invalid queries, broken boolean logic, or double-escaping bugs. + +## Core Rule (Read This First) + +Escaping is context-dependent and MUST happen at the final embedding step only. + +❌ Never escape raw user input +❌ Never escape tokens +❌ Never escape before semantics are known +❌ Escaping early destroys intent. + +✅ Escape only when inserting a value into field:( … ) + +## Query Construction Pipeline + +The query builder has four distinct layers. Each layer has a strict responsibility. + +1. Raw input + ↓ +2. Normalization & tokenization + ↓ +3. Semantic expansion + ↓ +4. Context-aware escaping + +### Layer Responsibilities + +1. Raw Input + +Example: `nature, and history` + + - Allowed: + - Trimming + - Unicode normalization + - Validation (balanced quotes, no junk) + - Forbidden: + - Escaping + - Quoting + - Backslashes + +2. Tokenization & Normalization + +**Input**: `nature, and history` +**Tokens**: `["nature,", "and", "history"]` + + - Allowed: + - Splitting into tokens + - Lowercasing (optional) + - Forbidden: + - Escaping tokens + - Detecting boolean meaning + - Adding quotes + +3. Semantic Expansion (NO ESCAPING) + +The semantic structure defines intent, not syntax. + +``` $values = [ + 'onephrase' => '"nature, and history"', + 'and' => 'nature AND and AND history', + 'or' => 'nature OR and OR history', + 'asis' => 'nature, and history', // raw input as tge yser typed it. + 'compressed' => 'nature,andhistory', + 'exactmatcher' => 'natureandhistory', + 'emstartswith' => 'natureandhistory*' +]; +``` + + - Allowed: + - Adding AND / OR + - Adding quotes for phrases + - Adding * for prefixes + - Forbidden: + - Escaping special characters + - Adding backslashes + - Lucene syntax manipulation + +4. Context-Aware Escaping (MANDATORY) + +Escaping happens only here, inside `__buildQueryString()` in `Solr.php`. + +Each semantic value MUST use the correct escaper. + +**Semantic Type Escaper Purpose** +onephrase escapePhrase() Quoted phrase +and, or escapeBoolean() Boolean expression +emstartswith escapePrefix() Prefix query +everything else escapeTerm() Literal term + +## Escaper Contracts + +`escapeTerm(string $input)` + +- Escapes Lucene special characters +- DOES NOT add quotes +- DOES NOT interpret boolean logic + +✅ Correct for: exact terms, IDs, compressed strings +❌ Wrong for: phrases see `escapePhrase()` + +`escapePhrase(string $input)` + +- Removes outer quotes if present +- Escapes inner content +- Re-adds exactly one pair of quotes + +**Input**: `"nature, and history"` +**Output**: `"nature\, and history"` + +❌ Must never double-quote +❌ Must never accept pre-escaped strings + +`escapeBoolean(string $expr)` + +- Preserves AND/OR operators +- Escapes only operands +- MUST NOT quote operands automatically + +**Input**: `nature AND and AND history` +**Output**: `nature AND and AND history` + +(operators preserved, literals escaped as needed) + +❌ Must not introduce "and" +❌ Must not escape operators + +`escapePrefix(string $prefix)` + +- Removes trailing * +- Escapes base term +- Re-adds * + +**Input**: `natureandhistory*` +**Output**: `natureandhistory*` + +❌ Must never allow leading wildcards + +## Explicit Anti-Patterns (DO NOT DO THESE) + +❌ Escaping inside tokenization +❌ Escaping $lookfor directly +❌ Escaping tokens before implode() +❌ Quoting inside escapeBoolean() +❌ Passing pre-escaped strings into escapers +❌ Escaping twice “just to be safe” + +## Maintenance Checklist (Before Any Change) + +Before modifying escaping logic, confirm: + +- Am I escaping only at embedding time? +- Does this escaper handle exactly one semantic role? +- Could this cause double-quoting? +- Could this break boolean operators? +- Do existing tests still pass? +- If unsure → STOP and re-read this document. + +## Summary +Semantics first. Escaping last. Always. \ No newline at end of file diff --git a/playwright/test/fast/bugfix/search-quotes.spec.js b/playwright/test/fast/bugfix/search-quotes.spec.js index d160364a..2e65c2c5 100644 --- a/playwright/test/fast/bugfix/search-quotes.spec.js +++ b/playwright/test/fast/bugfix/search-quotes.spec.js @@ -1,15 +1,5 @@ import { test, expect } from '@playwright/test'; -test('smartquote_problem_talking_catalog', async ({ page, baseURL }) => { - const warning = 'There was a problem talking to the catalog.' - - await page.goto('/Search/Home?lookfor=smart'); // no smart quote, no problem, no warning - await expect(page.getByText(warning)).not.toBeVisible(); - - await page.goto('/Search/Home?lookfor=“smart'); // smart quote, problem, warning - await expect(page.getByText(warning)).toBeVisible(); -}); - test('unbalanced_quotes', async ({ page, baseURL }) => { /* If an odd number of quotes are given in the search string, we remove the last one @@ -34,5 +24,10 @@ test('unbalanced_quotes', async ({ page, baseURL }) => { await page.goto('/Search/Home?lookfor="norfolk""'); // 3 quotes, unbalanced: await expect(page.getByText(warning)).toBeVisible(); // expect warn await expect(page.getByText(chaucer)).toBeVisible(); // results visible - // etc + + // Unbalanced funcy quotes + await page.goto('/Search/Home?lookfor=“norfolk'); // 1 funcy quote, unbalanced: + await expect(page.getByText(warning)).toBeVisible(); // expect warn + await expect(page.getByText(chaucer)).toBeVisible(); // results visible + }); diff --git a/services/Search/Home.php b/services/Search/Home.php index 624e6efc..6e631202 100644 --- a/services/Search/Home.php +++ b/services/Search/Home.php @@ -79,6 +79,7 @@ function launch() } $this->search(); } else { + // Otherwie, display the home page $interface->setPageTitle('Search Home'); $interface->assign('isTheHomePage', true); diff --git a/services/Search/SearchStructure.php b/services/Search/SearchStructure.php index 21c7b24f..0578251b 100644 --- a/services/Search/SearchStructure.php +++ b/services/Search/SearchStructure.php @@ -84,6 +84,7 @@ static function fromHash($hash) $c = __CLASS__; $obj = new $c(true); $obj->_fillFromHash($hash); + return $obj; } @@ -230,6 +231,7 @@ private function _fillFromHash($hash) } elseif (count($ss) == 1) { $index = $ss[0][0]; + if (isset($this->force_standard[$index]) && $this->force_standard[$index]) { $this->use_dismax = false; } @@ -925,6 +927,9 @@ static function displayStrip($v) } return "Between $start and $end"; } else { + // Remove square brackets and double quote from facet display. + // Dec 2025 Note: this is just a UI string, so this replacement should be unnecessary. + // return $v; return preg_replace('/[\[\]\"]/', '', $v); } @@ -1023,6 +1028,7 @@ function facetDisplayName($facet) // Take from http://www.toao.net/48-replacing-smart-quotes-and-em-dashes-in-mysql + // TODO: That function is never used function convert_smart_quotes($text) { // First, replace UTF-8 characters. @@ -1038,14 +1044,37 @@ function convert_smart_quotes($text) return $text; } + /** + * Normalize Unicode quotes to ASCII equivalents. + */ + private function normalizeQuotes(string $str): string + { + $map = [ + // Double quotes + '“' => '"', '”' => '"', '„' => '"', '‟' => '"', + + // Single quotes + '‘' => "'", '’' => "'", '‚' => "'", '‛' => "'", + ]; + + return strtr($str, $map); + } + # Detects an odd number of double quotes and removes the last one. function fix_unbalanced_quotes($str) { + // Normalize smart / Unicode quotes first + $str = $this->normalizeQuotes($str); + + //Count ASCII double quotes if (substr_count($str, '"', 0) % 2 != 0) { - $pos = strrpos($str, "\"", -1); - # So we can inform the user - $this->fixedUnbalancedQuotes = true; - $str = substr_replace($str, '', $pos, 1); + # Find the last occurrence of a quote and remove it + $pos = strrpos($str, '"'); + if ($pos !== false) { + # So we can inform the user + $this->fixedUnbalancedQuotes = true; + $str = substr_replace($str, '', $pos, 1); + } } return $str; } diff --git a/sys/Solr.php b/sys/Solr.php index 55147505..402322e2 100755 --- a/sys/Solr.php +++ b/sys/Solr.php @@ -116,8 +116,6 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) { $action = $ss->action; - - // Add the pagination $args[] = array('start', $start); @@ -129,13 +127,16 @@ function simplesearch($ss, $start = 0, $limit = null, $raw = false) { $args = array_merge($args, $this->spellcheckComponents($ss)); } + // print_r("----- Solr query ------: " . json_encode($args, JSON_UNESCAPED_UNICODE)); + // $raw is always false, so rawSolrSearch is never used if ($raw) { return $this->rawSolrSearch($args, $action); } // Otherwise... $rv = $this->solrSearch($args, $action); + return $rv; } @@ -230,7 +231,8 @@ function facetlist($ss, $fields, $sort = 'index', $skip = 0, $limit = 20) { $values = $body['facet_counts']['facet_fields'][$field]; $rv['values'][$field] = array(); - // skip the hidden ones + // Filter out facet values that match the hidden pattern defined on config.ini. + // e.g. skip hlbgeneral = "hlb_both:^U\.S\. National and" to hide "U.S. National..." facets foreach ($values as $valcnt) { if (isset($hide, $hide[$field])) { foreach ($hide[$field] as $regexp) { @@ -269,14 +271,17 @@ function searchArguments($ss) { * @param SearchStructure $ss A fille-in search structure * @return array An array of (key,value) duples for sending to Solr **/ - + // TODO: Remove this function that is never used because the field type is not defined in conf/dismaxsearchspecs.yaml + // TODO: Check by function used by this that could be removed too. function dismaxSearchArguments($ss) { $rv = array(); // Should just be on "lookfor" and "type" $tvb = isset($ss->search[0]) ? $ss->search[0] : array('all', '*:*'); $type = $tvb[0]; + // $value is the search string $value = $tvb[1]; + // If search is empty/whitespace-only, default to *:* (match-all) if (!preg_match('/\S/', $value)) { $value = '*:*'; } @@ -284,10 +289,10 @@ function dismaxSearchArguments($ss) { $allspecs = yaml_parse_file('conf/dismaxsearchspecs.yaml'); // If the type isn't set, back up to normal arguments + // Lianet's notes: $type is extracted from conf/dismaxsearchspecs.yaml so the function always return the args in searchArguments if (!isset($allspecs[$type])) { $args = $this->searchArguments($ss); - // print_r($args); return $args; } @@ -334,14 +339,36 @@ function standardSearchComponents($ss) { $searchComponents = array(); + // Lianet's notes: conf/searchspecs is the config used to build the Solr query. + // the specs looks like {"callnoletters":{"callnoletters":[["emstartswith",1]]}, ...} $specs = yaml_parse_file('conf/searchspecs.yaml'); $query = ''; - foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolen AND or OR + foreach ($ss->search as $tvb) { // Type, Value (keywords), Boolean AND or OR + + // $tvb looks like: ['title', 'nature, and history', 'AND'] $type = $tvb[0]; + + // TODO: Refactoring to call build_and_or_onephrase only once per search term, not once per type $values = $this->build_and_or_onephrase($tvb[1]); + // $values looks like: + /** + "onephrase":nature, and history, + "and":"nature, AND and AND history", + "or":"nature, OR and OR history", + "asis":"nature, and history", + "compressed":"nature,andhistory", + "exactmatcher":"natureandhistory", + "emstartswith":"natureandhistory*"}" + */ + $bool = isset($tvb[2]) ? $tvb[2] : false; + // Build the component for this type + // Some components are created by build_and_or_onephrase + // Other components are extracted from the specs file + // e.g. if (isset($specs[$type]) && $values) { + // $comp looks like: "(issn:(DS753\\ H827) OR isbn:(DS753\\ H827) OR lccn:(\"DS753 H827\") OR lccn:(DS753\\ H827) OR ctrlnum:(ds753h827) OR rptnum:(\"DS753 H827\") OR rptnum:(DS753\\ H827) OR rptnum:(ds753h827) OR oclc_search:(\"DS753 H827\") OR oclc_search:(ds753h827) OR sdrnum:(\"DS753 H827\") OR sdrnum:(ds753h827) OR ht_id:(\"DS753 H827\") OR ht_id:(ds753h827) OR ht_id:(DS753\\ H827) OR id:(\"DS753 H827\"))" $comp = '(' . $this->__buildQueryString($specs[$type], $values) . ')'; if ($bool) { $comp .= ' ' . $bool . ' '; @@ -356,21 +383,23 @@ function standardSearchComponents($ss) { } $query .= "id:(" . implode(' OR ', $ss->extraIDs()) . ')'; } - $ids = $this->tagIDs($ss); + // Check if the query has content, otherwise use *:* to match all if (preg_match('/\S/', $query)) { $searchComponents[] = array('q', $query); } else { $searchComponents[] = array('q', '*:*'); } - return $searchComponents; } /** Quote a filter value, skipping it if it starts with a '[' (and hence is assumed - * to be a range) + * to be a range). Detect date range + * Respect range queries ([ TO ]) + * Always quotes values + * Use lucene_escape_fq because stricter escaped is required **/ function quoteFilterValue($v) { @@ -378,7 +407,15 @@ function quoteFilterValue($v) { return $v; } else { - return '"' . $v . '"'; + // Escape internal quotes before wrapping + // input: He said "hello, the output: He said \"hello + // $escaped = str_replace('"', '\\"', $v); + // String ready to Solr "He said \"hello" + // examples: + // publishDateRange:"1870\-1879" + // NOT ht_rightscode:"tombstone" + // NOT deleted:"true" + return '"' . $this->lucene_escape_fq($v) . '"'; } } @@ -435,6 +472,7 @@ function filterComponents($ss) { $rv[] = implode(':', array($index, $val)); } } + // print_r("Filter Creator : " . json_encode($rv, JSON_UNESCAPED_UNICODE)); if (count($rv) == 0) { return array(); } @@ -615,6 +653,112 @@ function asGetIDsURL($args) { return $url_base . '?' . implode('&', $params); } + /** + * Escapes the characters must be escaped inside quoted Lucene text + * " → \" + * \ → \\ + * + * Assumes: + * - Input is NOT wrapped in quotes + * - Output will be wrapped by the caller + * This function must never add quotes. + */ + public function escapeLuceneLiteral(string $input): string + { + // Escape only what Lucene requires inside a quoted phrase + return preg_replace('/(["\\\\])/', '\\\\$1', $input); + } + + /** + * Escape a term. + * + * A term is a single word. + * All characters that have a special meaning in a Solr query are escaped. + * It must never introduce quotes or phrases. + * Lucene's Standard Query Parser special characters are: + * + - && || ! ( ) { } [ ] ^ " ~ * ? : \ / + * Escape Lucene special characters for a literal term (NO operators) + * + * @see https://solr.apache.org/guide/the-standard-query-parser.html#escaping-special-characters + * + * @param string $input + * + * @return string + */ + public function escapeTerm(string $input): string + { + + // Escape backslash FIRST to avoid double-escaping + $s = str_replace('\\', '\\\\', $input); + + // Use regex to catch all specials, including spaces, in one go + // The characters are: + - && || ! ( ) { } [ ] ^ " ~ * ? : / + // Note: && and || are handled as single chars & and | here + $pattern = '/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\/\&\|])/'; + + return preg_replace($pattern, '\\\\$1', $s); + } + + /** + * Escape a phrase. + * + * A phrase is a group of words. + * Special characters will be escaped and the phrase will be surrounded by + * double quotes to group the input into a single phrase. + * Checking by double quotes can be removed because the input is always a phrase without quotes. + * Escapes " and \. Never escape booleans, commas, or operators inside a quoted phrase. + * Do mind that you cannot build a complete query first and then pass it to + * this method, the whole query will be escaped. You need to escape only the + * 'content' of your query. + * This function was inspired by Solarium: https://github.com/solariumphp/solarium/blob/d6a55d2c1cbaaa78edfbfd8f9b090978a6b6fd83/src/Core/Query/Helper.php#L93 + * @param string $input + * + * @return string + */ + public function escapePhrase(string $input): string + { + // Remove surrounding quotes if any + if (strlen($input) >= 2 && $input[0] === '"' && substr($input, -1) === '"') { + $input = substr($input, 1, -1); + } + + return '"' . $this->escapeLuceneLiteral($input) . '"'; + } + + /** + * Escape a boolean expression for Lucene boolean search. Use for: term1 AND term2 OR "phrase here" + * Use for: dramatic AND literature + * Operators AND, OR are preserved, everything else is escaped as literal (Terms). No accidental field injection + */ + public function escapeBoolean(string $expr): string { + // Split using AND, OR as delimiters (case-insensitive) + + $tokens = explode(' ', $expr); + $out = []; + foreach ($tokens as $t) { + if ($t === 'AND' || $t === 'OR') { + $out[] = $t; + } else { + $out[] = $this->escapeTerm($t); + } + } + + return implode(' ', $out); + } + + /** + * Escape a prefix query for Lucene prefix search. Use for: prefix* + * Prevents *table + * Escape only what Lucene requires inside a quoted phrase + * Preserves prefix semantics + * @return string The escaped prefix query + */ + public function escapePrefix(string $prefix): string { + + $base = substr($prefix, 0, -1); + return $this->escapeLuceneLiteral($base) . '*'; + } + /** * __buildQueryString -- internal method to build query string from search parameters @@ -629,14 +773,17 @@ function asGetIDsURL($args) { private function __buildQueryString($structure, $values, $joiner = "OR") { + $clauses = array(); foreach ($structure as $field => $clausearray) { - // is_numeric($field) is true iff we've got an un-hashed array, used for grouping + + // is_numeric($field) is true if we've got an un-hashed array, used for grouping if (is_numeric($field)) { // get the op (AND or OR) and weight from the first item $opweight = array_shift($clausearray); $op = $opweight[0]; $weight = $opweight[1]; + $sstring = implode('', array('(', Solr::__buildQueryString($clausearray, $values, $op), ')')); if (isset($weight) && $weight > 0) { $sstring .= '^' . $weight; @@ -649,8 +796,12 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { foreach ($clausearray as $valweight) { - $val = $valweight[0]; - $weight = $valweight[1]; + $val = $valweight[0]; // e.g. onephrase, and, or, emstartswith, compressed, exactmatcher, lcnormalized, stdnum + $weight = $valweight[1]; // e.g. Null, 100, 60 + + + // lcnormalized and stdnum are not fields generated by build_and_or_onephrase, + // they will be added here and they are already normalized if (!isset($values[$val])) { if ($val == 'lcnormalized') { $LC = LCCallNumberNormalizer::singleton(); @@ -664,18 +815,62 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { } if ($val == 'stdnum') { + // Extract standard number from asis input + // Strips leading 0s. Captures digits, dashes, dots: 978-0-123-45678-9 + // e.g. 0000978-0-12-345678-9 → 978-0-12-345678-9 if (preg_match('/^\s*0*([\d\-\.]+[xX]?).*$/', $values['asis'], $match)) { $stdnum = $match[1]; -// $stdnum = preg_replace('/[\.\-]/', '', $stdnum); $stdnum = Normalize::stdnum($stdnum); $values[$val] = $stdnum; } } } + if (!isset($values[$val]) || ($values[$val] == "")) { continue; } - $sstring = $field . ':(' . $values[$val] . ')'; + + // Lianet's notes: Escapes based on semantic role. It is safe embedding in field:value syntax + switch ($val) { + + case 'lcnormalized': + $escaped_value = $values[$val]; + break; + case 'stdnum': + // 978-0-123-45678-9 + $escaped_value = $values[$val]; + break; + case 'onephrase': + // "\"dramatic literature, comprehending critical\"" + $escaped_value = $this->escapePhrase($values[$val]); + break; + case 'and': + case 'or': + // and -- dramatic AND literature, AND comprehending AND critical + // or -- dramatic OR literature, OR comprehending OR critical + // \"dramatic AND literature\, AND comprehending AND critical\" + // See how $values[$var] looks war AND and AND peace AND war AND and AND rest + + $escaped_value = $this->escapeBoolean($values[$val]); + break; + case 'emstartswith': + // dramaticliteraturecomprehendingcritical* + $escaped_value = $this->escapePrefix($values[$val]); + break; + case 'compressed': + // dramaticliteraturecomprehendingcritical (no spaces) + $escaped_value = $this->escapeTerm($values[$val]); + break; + default: + // exactmatcher - dramaticliteraturecomprehendingcritical + $escaped_value = $this->escapeTerm($values[$val]); + } + + if ($escaped_value === '""' || $escaped_value === '') { + continue; + } + + $sstring = $field . ':(' . $escaped_value . ')'; if (isset($weight) && $weight > 0) { $sstring .= '^' . $weight; } @@ -688,7 +883,7 @@ private function __buildQueryString($structure, $values, $joiner = "OR") { /** - * Turn solr output into a record structure (which shouuld probably be its own class...) + * Turn solr output into a record structure (which should probably be its own class...) * * @param string $result The XML returned by solr * @param string $xslfile The path of the XSL file to use to convert the data @@ -700,6 +895,7 @@ function _process($result, $xslfile = 'xsl/solr-convert.xsl') { global $configArray; if (is_string($result) && preg_match('/^filter((array)$record['availability']); } } - - // print_r($res); - return $res; } @@ -757,9 +950,13 @@ function _process($result, $xslfile = 'xsl/solr-convert.xsl') { * @return array Tokenized array * @access public */ + // TODO: refactor tokenizer to a single-pass parser instead of regex public function tokenizeInput($input) { // Tokenize on spaces and quotes //preg_match_all('/"[^"]*"|[^ ]+/', $input, $words); + // /"[^"]*"[~[0-9]+]* --> to capture fuzzy searches like "hello world"~5 - matches a double-quoted string followed by ~ and a number + // "[^"]*" --> to capture exact phrases like "hello world" - matches a double-quoted string + // [^ ]+ --> to capture single words like hello - matches sequences of non-space characters preg_match_all('/"[^"]*"[~[0-9]+]*|"[^"]*"|[^ ]+/', $input, $words); $words = $words[0]; @@ -787,45 +984,255 @@ public function tokenizeInput($input) { // continue; // } $fixedwords[] = $word; -// error_log("Token is $word"); } - // return $newWords; return $fixedwords; } + + /** + * Remove leading wildcards from input + * Ensure wildcards are not at beginning of input + * Before using this function you should check if there is Use this wildcards to remove, otherwise it + * will remove always the first character + * Performance guard, not a security guard. Prevent expensive queries (*table, ?table) + + * @param string $input User's input string + * @return string Input string without leading wildcards + * @access public + */ + public function remove_first_character($input) { + return substr($input, 1); + } + + /** + * Remove parentheses from input + * Use this function if you want to remove parentheses from input + * It is used if there is unbalanced parentheses in the input + * Prevents Solr parser errors. Deletes all parentheses instead of fixing structure + + * @param string $input User's input string + * @return string Input string without parentheses + * @access public + */ + public function remove_parentheses($input) { + return str_replace(array('(', ')'), '', $input); + } + + /** + * Remove wrapping double quotes from a string, if present. + * + * Examples: + * - '"table"' → table + * - ' "table" ' → table + * - '"table name"' → table name + * - 'table "name"' → table "name" + * - '"table"name"' → table"name + * + * @param string $s + * @return string + */ + public function remove_quotes(string $s): string { + $s = trim($s); + + if (mb_strlen($s) >= 2 && $s[0] === '"' && substr($s, -1) === '"') { + return substr($s, 1, -1); + } + + return $s; + } + + /** + * Remove invalid caret (^) usage from input + * Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + * Use this function if there is invalid caret usage in the input + * @param string $input User's input string + * @return string Input string without invalid caret usage + * @access public + */ + public function remove_invalid_caret_usage($input) { + return str_replace('^', '', $input); + } + + /** + * If input matches the pattern: "phrase"*, + * return phrase* (quotes removed, wildcard preserved). + * Otherwise return null. + + * @param string $input User's input string + * @return string|null Unwrapped quoted wildcard or null + * @access public + */ + public function unwrapQuotedWildcard(string $input): ?string { + // Match: optional whitespace + "..." + * + optional whitespace + // ^\s* --> leading whitespace + // " --> opening quote + // ([^"]+) --> capture group for any characters except quotes (the phrase) + // " --> closing quote + // \* --> literal asterisk + // \s*$ --> trailing whitespace + if (preg_match('/^\s*"([^"]+)"\*\s*$/u', $input, $matches)) { + return $matches[1] . '*'; + } + + return null; + } + /** * Input Validater * - * Cleanes the input based on the Lucene Syntax rules. - * - * @param string $input User's input string - * @return string Fixed input + * Validate the user input for Solr queries. + * This function is effective if: + * It is used before building the query + * It runs before escaping + * It rejects invalid syntax instead of trying to fix it + * Escaping is done after validation + * This validator: + * - Rejects empty input or garbage-only input + * - Rejects meaningless single-character input (~, \) + * - Rejects leading wildcards (*, ?) + * - Validates balanced parentheses and quotes + * - Validates boost syntax (^number) + * - Validates fuzzy operators (~N) + * - Validates fielded queries (field:value) + * - Rejects empty boolean groups + * + * @param string $input Raw user input + * @return array{valid: bool, error?: string} Validation result * @access public */ + // TODO: Add the rule: Reject fuzzy operators like "~2" + // Lianet's notes: Verify if this function could be used to validate the Solr query public function validateInput($input) { + + // 1. Normalize + trim + $trimmed = trim($input); + + //print("Input --- : " . json_encode($trimmed, JSON_UNESCAPED_UNICODE)); + + // 2. Empty input + if ($trimmed === '') { + return ['valid' => false, 'error' => 'Empty query']; + } + + // 3. Strip garbage-only input ~~//^&$ (no letters or numbers) + if ($trimmed !== '' && !preg_match('/[\p{L}\p{N}]/u', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid garbage-only query']; + } + + // 4. Reject meaningless single-character input (~ or \) + if (mb_strlen($trimmed) === 1 && preg_match('/^[~\\\\]$/', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid single-character query']; + } + + // 5. No leading wildcard // Ensure wildcards are not at beginning of input - if ((substr($input, 0, 1) == '*') || - (substr($input, 0, 1) == '?')) { - return substr($input, 1); + // Performance guard, not a security guard. Prevent expensive queries (*table, ?table) + if ($trimmed[0] === '*' || $trimmed[0] === '?') { + return ['valid' => false, 'error' => 'Leading wildcard not allowed']; + } + + // 6. Balanced parentheses + // Ensure all parens match - parentheses balancing + // Prevents Solr parser errors. Deletes all parentheses instead of fixing structure + if (substr_count($trimmed, '(') !== substr_count($trimmed, ')')) { + return ['valid' => false, 'error' => 'Unbalanced parentheses']; + } + // 7. Balanced quotes + if (substr_count($trimmed, '"') % 2 !== 0) { + return ['valid' => false, 'error' => 'Unbalanced quotes']; } - // Ensure all parens match - $start = preg_match_all('/\(/', $input, $tmp); - $end = preg_match_all('/\)/', $input, $tmp); - if ($start != $end) { - return str_replace(array('(', ')'), '', $input); + // 8. Valid boost syntax (^number or ^number.number) + // Ensure ^ is used properly - Prevent invalid syntax as table^, table^abc + // Rejects invalid boosts (^, ^abc) + if (preg_match_all('/\^([^\s]+)/', $trimmed, $matches)) { + foreach ($matches[1] as $boost) { + if (!preg_match('/^[0-9]+(\.[0-9]+)?$/', $boost)) { + return ['valid' => false, 'error' => 'Invalid boost syntax']; + } + } + } + + // 9. Reject multiple boosts on the same term (table^2^3) + if (preg_match('/\^[0-9]+(\.[0-9]+)?\s*\^/', $trimmed)) { + return ['valid' => false, 'error' => 'Multiple boosts on same term']; } - // Ensure ^ is used properly - $cnt = preg_match_all('/\^/', $input, $tmp); - $matches = preg_match_all('/.+\^[0-9]/', $input, $tmp); + // 10. Reject dangling boost operator like table^ + if (preg_match('/\^\s*(\)|$)/', $trimmed)) { + return ['valid' => false, 'error' => 'Dangling boost operator']; + } + + // --------------------- + // 11. Fuzzy operator validation + // --------------------- - if (($cnt) && ($cnt !== $matches)) { - return str_replace('^', '', $input); + // Rules: + // - ~ must be followed by a non-negative integer + // - ~ cannot be doubled (~~) + // - ~ must not be followed by letters + // - ~ must not appear inside field: without a term + // - standalone ~2 IS allowed (syntactically valid Lucene) + + // 11.1 Reject repeated fuzzy operators like table~~2 + if (preg_match('/~~+/', $trimmed)) { + return ['valid' => false, 'error' => 'Repeated fuzzy operator']; } - return $input; + // 11.2 Reject fuzzy operators with non-numeric distance (table~abc, "foo"~x) + if (preg_match('/~(?!\d+\b)/', $trimmed)) { + return ['valid' => false, 'error' => 'Invalid fuzzy syntax']; + } + + // 11.3 Reject fielded fuzzy with no term: title:~2 + if (preg_match('/\b[\w\-]+:\s*~\d+\b/', $trimmed)) { + return ['valid' => false, 'error' => 'Fuzzy operator without term']; + } + + // --------------------- + // 12. Field validation + // --------------------- + + // 12.1 Reject empty field groups like title:( ) + if (preg_match('/\b[\w\-]+:\(\s*\)/', $trimmed)) { + return ['valid' => false, 'error' => 'Empty field group']; + } + + // 12.2 Reject empty field values like title: + if (preg_match('/\b[\w\-]+:\s*(\)|$)/', $trimmed)) { + return ['valid' => false, 'error' => 'Empty field value']; + } + + // 12.3. Fielded query validation (field:value) + // Rejects malformed field queries (title:, :table, title::table) + if (preg_match_all('/(\b[\w\-]+):/', $trimmed, $fields)) { + foreach ($fields[1] as $field) { + // Reject empty field names (shouldn't happen due to regex) + if ($field === '') { + return ['valid' => false, 'error' => 'Empty field name']; + } + } + } + + // 13. Reject dangling colons + if (preg_match('/(^|[^\\w]):|::/', $trimmed)) { + return ['valid' => false, 'error' => 'Malformed field:value syntax']; + } + + // 14. Reject empty boolean groups inside parentheses + // Rejects empty boolean groups ((AND), (OR NOT)) + if (preg_match('/\((\s*(AND|OR|NOT)\s*)+\)/i', $trimmed)) { + return ['valid' => false, 'error' => 'Empty boolean group']; + } + + // 15. Reject fielded queries ending with boolean operators + if (preg_match('/\b[\w\-]+:\([^)]*(AND|OR|NOT)\s*\)/i', $trimmed)) { + return ['valid' => false, 'error' => 'Incomplete boolean expression in field']; + } + + return ['valid' => true]; + } /** @@ -857,39 +1264,136 @@ function exactmatcherify($str) { return $str; } + /** + * Converts any invalid Lucene input into a safe, terms-only query. + * - Removes all Lucene syntax + * - Preserves letters and numbers only + * - Used only when intent is recoverable but syntax is unsafe + */ + public function sanitizeToTerms(string $input): string + { + // 1. Normalize whitespace + $s = trim($input); + + // 2. Replace all non-letter/non-number with space + // \p{L} = any Unicode letter + // \p{N} = any Unicode number + $s = preg_replace('/[^\p{L}\p{N}]+/u', ' ', $s); + + // 3. Collapse multiple spaces + $s = preg_replace('/\s+/u', ' ', $s); + + // 4. Trim again + $s = trim($s); + + return $s; + } /** * Build AND, OR, and Phrase queries * * Given a lookfor string, clean it up, tokenize it, and * return a structure that includes AND, OR, and Phrase - * queries. + * queries. lookfor could be single or multi-word * * @param string $lookfor User's search string * @return array $values Includes 'and', 'or', and 'onephrase' elements * @access public */ - + // Lianet's notes: Check if is necessary to remove illegal characters + // TODO: Refactoring this function to avoid the different output public function build_and_or_onephrase($lookfor = null) { $values = array(); - $illegal = array('.', '{', '}', '/', '!', ':', ';', '[', ']', '(', ')', '+ ', '&', '- '); - $lookfor = trim(str_replace($illegal, '', $lookfor)); + //$illegal = array('.', '{', '}', '/', '!', ':', ';', '[', ']', '(', ')', '+ ', '&', '- '); + //$lookfor = trim(str_replace($illegal, '', $lookfor)); + // Replace fancy quotes $lookfor = str_replace(array('“', '”'), '"', $lookfor); - // If it looks like "..."*, pull out the quotes - if (preg_match('/^\s*"(.*)"\*\s*$/', $lookfor, $match)) { - $em = $match[1]; - $lookfor = $em . '*'; - // $em = $this->exactmatcherify($em) . '*'; - // return array('exactmatcher' => $em, 'emstartswith' => $em, 'asis' => $lookfor); + // If it looks like "..."*, pull out the quotes + $unwrapped = $this->unwrapQuotedWildcard($lookfor); + if ($unwrapped !== null) { + $lookfor = $unwrapped; } - // Validate input - $lookfor = $this->validateInput($lookfor); + // Handles multiple issue in the query that make it invalid + // Each pass fix one issue + // $maxPasses is to prevent infinity loops + // Loop continue until query is valid or sanitized + $maxPasses = 3; + $pass = 0; + + do { + + $validation = $this->validateInput($lookfor); + + //print("Validation Result --- : " . json_encode($validation, JSON_UNESCAPED_UNICODE)); + + + if ($validation['valid']) { + break; + } + + // Considering the logic of updating the user input query as the application is doing now + + switch ($validation['error']) { + + // ------------------------- + // HARD FALLBACK (*:*) + // ------------------------- + case 'Empty query': + case 'Invalid garbage-only query': + case 'Invalid single-character query': + return false; + + // ------------------------- + // STRUCTURAL FIXES + // ------------------------- + + case 'Leading wildcard not allowed': + $lookfor = $this->remove_first_character($lookfor); + break; + case 'Unbalanced parentheses': + $lookfor = $this->remove_parentheses($lookfor); + break; + case 'Unbalanced quotes': + $lookfor = $this->remove_quotes($lookfor); + break; + + case 'Invalid boost syntax': + case 'Multiple boosts on same term': + case 'Dangling boost operator': + $lookfor = $this->remove_invalid_caret_usage($lookfor); + break; + + // ------------------------- + // SANITIZE TO TERMS-ONLY + // ------------------------- + case 'Repeated fuzzy operator': + case 'Invalid fuzzy syntax': + case 'Fuzzy operator without term': + case 'Empty field group': + case 'Empty field value': + case 'Empty field name': + case 'Malformed field:value syntax': + case 'Dangling colon': + $lookfor = $this->sanitizeToTerms($lookfor); + break; + + // ------------------------- + // SAFETY NET + // ------------------------- + default: + $lookfor = $this->sanitizeToTerms($lookfor); + break; + } + // print_r("SANITIZE Input string: " . $lookfor); + // print_r("Count pass" . $pass); + $pass++; + } while ($pass < $maxPasses); if (!preg_match('/\S/', $lookfor)) { return false; @@ -898,13 +1402,21 @@ public function build_and_or_onephrase($lookfor = null) { // Tokenize Input $tokenized = $this->tokenizeInput($lookfor); - $values['onephrase'] = '"' . preg_replace('/"/', '', implode(' ', $tokenized)) . '"'; + // Phrase search - "dramatic literature, comprehending critical" + $values['onephrase'] = preg_replace('/"/', '', implode(' ', $tokenized)); + // AND search - dramatic AND literature, AND comprehending AND critical $values['and'] = implode(' AND ', $tokenized); + // OR search - dramatic OR literature, OR comprehending OR critical $values['or'] = implode(' OR ', $tokenized); + // As-is search - dramatic literature, comprehending critical $values['asis'] = $lookfor; + // Compressed search - dramaticliterature,comprehendingcritical $values['compressed'] = preg_replace('/\s/', '', $lookfor); + // Exactmatcher search - dramaticliteraturecomprehendingcritical $values['exactmatcher'] = $this->exactmatcherify($lookfor); + // Exactmatcher startswith search - dramaticliteraturecomprehendingcritical* $values['emstartswith'] = $values['exactmatcher'] . '*'; + return $values; } @@ -918,7 +1430,7 @@ public function build_and_or_onephrase($lookfor = null) { **/ function solrSearch($args, $action = 'standard') { - $raw = $this->rawSolrSearch($args, $action); + $raw = $this->rawSolrSearch($args, $action); // This is the Solr output if (!PEAR::isError($raw)) { $processed = $this->_process($raw); @@ -956,6 +1468,8 @@ protected function set_proper_action($action, $args) { return $action; } } + // Ensure a non-NULL return from non-edismax cases + // return $action; } // Do we just want the IDs? Spit 'em out! @@ -1012,6 +1526,7 @@ function rawSolrSearch($args, $action = 'standard') { die(); } + # Finally, we can deal with the normal case return $this->solr_connection->send(); } @@ -1117,11 +1632,40 @@ function mltesc($str) { } - function lucene_escape($str) { - $pattern = '/(\+|-|&&|\|\||!|\(|\)|\{|}|\[|]|\^|"|~|\*|\?|:|\\\)/'; - $replace = '\\\$1'; - return preg_replace($pattern, $replace, $str); - } + /** + * Strict escape for filter, facets and ranges values and explicit field:value fragments. + + * Use for all fq values and any time you construct field:value with user data. + * This function: + * - Normalizes Unicode to NFC form + * - Removes control characters + * - Escapes backslash FIRST (critical ordering) + * - Escapes multi-char tokens (&&, ||) + * - Escapes all Lucene special characters: + - ! ( ) { } [ ] ^ " ~ * ? : / + * + * @param string $s Raw user input or value to escape + * @return string Safely escaped value for use in Solr fq or field:value + */ + public function lucene_escape_fq(string $s): string { + // Normalize Unicode to composed form (NFC) + if (function_exists('normalizer_normalize')) { + $s = normalizer_normalize($s, Normalizer::FORM_C) ?: $s; + } + + // Remove control characters (0x00-0x1F, 0x7F) + $s = preg_replace('/[\x00-\x1F\x7F]/u', '', $s); + + // Escape backslash FIRST to avoid double-escaping + $s = str_replace('\\', '\\\\', $s); + + // Use regex to catch all specials, including spaces, in one go + // The characters are: + - && || ! ( ) { } [ ] ^ " ~ * ? : / + // Note: && and || are handled as single chars & and | here + $pattern = '/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\/\&\|])/'; + + // '\\\\$1' → runtime '\\$1' → Lucene '\' + return preg_replace($pattern, '\\\\$1', $s); + } function getMoreLikeThis($record, $id, $max = 5) { global $configArray; @@ -1134,26 +1678,26 @@ function getMoreLikeThis($record, $id, $max = 5) { return null; } - $query = '(title:("' . $this->lucene_escape($this->mltesc($record['title'][0])) . '")^75'; + $query = '(title:("' . $this->lucene_escape_fq($this->mltesc($record['title'][0])) . '")^75'; if (isset($record['shorttitle'])) { - $query .= ' OR title:("' . $this->lucene_escape($this->mltesc($record['title'][0])) . '")^100'; + $query .= ' OR title:("' . $this->lucene_escape_fq($this->mltesc($record['title'][0])) . '")^100'; } if (isset($record['fulltopic'])) { foreach ($record['fulltopic'] as $topic) { - $query .= ' OR fulltopic:("' . $this->lucene_escape($this->mltesc($topic)) . '")^300'; + $query .= ' OR fulltopic:("' . $this->lucene_escape_fq($this->mltesc($topic)) . '")^300'; } } if (isset($record['language'])) { foreach ($record['language'] as $language) { - $query .= ' OR language:("' . $this->lucene_escape($this->mltesc($language)) . '")^30'; + $query .= ' OR language:("' . $this->lucene_escape_fq($this->mltesc($language)) . '")^30'; } } if (isset($record['author'])) { foreach ($record['author'] as $author) { - $query .= ' OR author:("' . $this->lucene_escape($this->mltesc($author)) . '")^75'; + $query .= ' OR author:("' . $this->lucene_escape_fq($this->mltesc($author)) . '")^75'; } } @@ -1172,7 +1716,7 @@ function getMoreLikeThis($record, $id, $max = 5) { $query .= ') NOT id:(' . $id . ')'; $ss = new SearchStructure(true); // create a "blank" ss with just the filter queries - + $args = array_merge(array(array('q', $query)), $this->filterComponents($ss)); return $this->solrSearch($args); } diff --git a/sys/SolrConnection.php b/sys/SolrConnection.php index 48c0e113..57b64ee9 100644 --- a/sys/SolrConnection.php +++ b/sys/SolrConnection.php @@ -66,6 +66,7 @@ public function send($args = []) { return json_decode($raw, true); } + // TODO: Detete it - Nothing uses it public function send_for_obj($args = []) { $raw = $this->_send($args); return json_decode($raw, false); diff --git a/test/SearchStructureTest.php b/test/SearchStructureTest.php index 15501860..8ec9f59c 100644 --- a/test/SearchStructureTest.php +++ b/test/SearchStructureTest.php @@ -25,7 +25,14 @@ public function test_searchtermsForDisplay(): void $_REQUEST['lookfor'] = ['']; $ss = new SearchStructure; $this->assertEquals(['All Fields: <b>'], $ss->searchtermsForDisplay()); - } + + # Check the function fix_unbalanced_quotes + $this->assertEquals('norfolk', $ss->fix_unbalanced_quotes('norfolk')); // 0 quotes, balanced: + $this->assertEquals('norfolk', $ss->fix_unbalanced_quotes('"norfolk')); // 1 quote, unbalanced: + $this->assertEquals('"norfolk"', $ss->fix_unbalanced_quotes('"norfolk"')); // 2 quotes, balanced: + $this->assertEquals('"norfolk"', $ss->fix_unbalanced_quotes('"norfolk""')); // 3 quotes, unbalanced: + + } } ?> diff --git a/test/SolrQueryTest/BuildAndOrOnePhraseTest.php b/test/SolrQueryTest/BuildAndOrOnePhraseTest.php new file mode 100644 index 00000000..f9fb0d58 --- /dev/null +++ b/test/SolrQueryTest/BuildAndOrOnePhraseTest.php @@ -0,0 +1,147 @@ +solr = new Solr('', ''); + } + + /** + * @covers Solr::build_and_or_onephrase + * ~ --> reject, then return False + */ + public function testRejectsSingleTilde() + { + $this->assertFalse( + $this->solr->build_and_or_onephrase('~'), + 'Single tilde should be rejected' + ); + } + + /** + * @covers Solr::build_and_or_onephrase + */ + public function testUnbalanceFuncyQuote(): void + { + $result = $this->solr->build_and_or_onephrase('“smart'); + + $this->assertIsArray($result); + $this->assertEquals('smart', $result['onephrase']); + $this->assertEquals('"smart', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * \\ --> reject, then return False + */ + public function testRejectsSingleBackslash() + { + $this->assertFalse( + $this->solr->build_and_or_onephrase('\\'), + 'Single backslash should be rejected' + ); + } + + /** + * @covers Solr::build_and_or_onephrase + * table~2 --> accepted fuzzy search, then create the query + */ + public function testAllowsFuzzyTerm() + { + $result = $this->solr->build_and_or_onephrase('table~2'); + + $this->assertIsArray($result); + $this->assertEquals('table~2', $result['onephrase']); + $this->assertEquals('table~2', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * "table"~2 --> accepted fuzzy search, then create the query + */ + public function testAllowsQuotedFuzzyPhrase() + { + $result = $this->solr->build_and_or_onephrase('"table"~2'); + + $this->assertIsArray($result); + $this->assertEquals('table~2', $result['onephrase']); + $this->assertEquals('"table"~2', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * "table" --> accepted one word query, then create the query + */ + public function testAllowsNormalTerm() + { + $result = $this->solr->build_and_or_onephrase('table'); + + $this->assertIsArray($result); + $this->assertEquals('table', $result['onephrase']); + $this->assertEquals('table', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + * * --> accepted wildcard search, then create the query + */ + public function testAllowsWildcardTerm() + { + $result = $this->solr->build_and_or_onephrase('table*'); + + $this->assertIsArray($result); + $this->assertEquals('table*', $result['onephrase']); + $this->assertEquals('table*', $result['asis']); + } + + /** + * @covers Solr::build_and_or_onephrase + */ + public function testMultipleValidationErrorsAreHandledIteratively(): void + { + + // Two errors: + // 1) Unbalanced parentheses + // 2) Dangling boost operator (^) + $input = '(title:(war^peace'; + + $result = $this->solr->build_and_or_onephrase($input); + + $this->assertIsArray($result); + + // Parentheses should be removed + $this->assertStringNotContainsString('(', $result['asis']); + $this->assertStringNotContainsString(')', $result['asis']); + + // Invalid boost should be removed + $this->assertStringNotContainsString('^', $result['asis']); + + // Core terms must survive + $this->assertStringContainsString('war', $result['asis']); + $this->assertStringContainsString('peace', $result['asis']); + + // Final result must now be valid + $validation = $this->solr->validateInput($result['asis']); + $this->assertTrue($validation['valid'], 'Final query should be valid after iterative fixes'); + } + +} +?> \ No newline at end of file diff --git a/test/SolrQueryTest/SolrEscapingTest.php b/test/SolrQueryTest/SolrEscapingTest.php new file mode 100644 index 00000000..85d5cced --- /dev/null +++ b/test/SolrQueryTest/SolrEscapingTest.php @@ -0,0 +1,231 @@ +lucene_escape_fq($input); + // Should be: \\ (escaped backslash) + \&\& (escaped &&) + $this->assertEquals('\\\\\\&\\&', $result); + } + + /** + * Test all 19 special characters individually + * @covers Solr::lucene_escape_fq + */ + public function test_all_special_chars(): void + { + $solr = new Solr('', ''); + + $specials = [ + ['\\', '\\\\'], + ['&&', '\\&\\&'], + ['||', '\\|\\|'], + ['+', '\\+'], + ['-', '\\-'], + ['!', '\\!'], + ['(', '\\('], + [')', '\\)'], + ['{', '\\{'], + ['}', '\\}'], + ['[', '\\['], + [']', '\\]'], + ['^', '\\^'], + ['"', '\\"'], + ['~', '\\~'], + ['*', '\\*'], + ['?', '\\?'], + [':', '\\:'], + ['/', '\\/'] + ]; + + foreach ($specials as [$input, $expected]) { + $this->assertEquals($expected, $solr->lucene_escape_fq($input), + "Failed to escape: $input"); + } + } + + /** + * Test production failure cases from error logs + * @covers Solr::lucene_escape_fq + */ + public function test_production_failure_cases(): void + { + $solr = new Solr('', ''); + + $cases = [ + // From actual error logs + ['\\', '\\\\'], + ['"\\', '\\"\\\\'], // Quote + backslash + ['C:\\Program Files', 'C\\:\\\\Program Files'], + ['foo~bar', 'foo\\~bar'], + ['{!term}', '\\{\\!term\\}'], + ]; + + foreach ($cases as [$input, $expected]) { + $this->assertEquals($expected, $solr->lucene_escape_fq($input), + "Failed production case: $input"); + } + } + + /** + * Test that empty string is preserved + * @covers Solr::lucene_escape_fq + */ + public function test_empty_string(): void + { + $solr = new Solr('', ''); + $this->assertEquals('', $solr->lucene_escape_fq('')); + } + + /** + * Test normal text is not modified + * @covers Solr::lucene_escape_fq + */ + public function test_normal_text_unchanged(): void + { + $solr = new Solr('', ''); + $this->assertEquals('hello world', $solr->lucene_escape_fq('hello world')); + $this->assertEquals('abc123', $solr->lucene_escape_fq('abc123')); + } + + /** + * Helper function to test phrase + fuzzy escaping + * @covers Solr::escapePhrase + */ + public function testPhraseWithFuzzyIsPreserved(): void + { + + $solr = new Solr('', ''); + + $this->assertEquals( + '"machine learning\\"~3"', + $solr->escapePhrase('machine learning"~3') + ); + } + + /** + * Helper function to test phrase + fuzzy escaping + * @covers Solr::escapePhrase + */ + public function testPhraseWithCharactersToEscape(): void + { + // Applying escapePhrase() to already-quoted input must produce exactly one quoted phrase. + $solr = new Solr('', ''); + + $input = '"nature, and history"'; + + $escapedOnce = $solr->escapePhrase($input); + $escapedTwice = $solr->escapePhrase($escapedOnce); + + $this->assertSame( + '"nature, and history"', + $escapedOnce, + 'escapePhrase should preserve a single quoted phrase' + ); + + $this->assertSame( + $escapedOnce, + $escapedTwice, + 'escapePhrase must be idempotent' + ); + + $input = '"nature, "and" history \ test"'; + + $escapedPhrase = $solr->escapePhrase($input); + + $this->assertSame( + '"nature, \"and\" history \\\\ test"', + $escapedPhrase, + 'escapePhrase should preserve a single quoted phrase and escape to avoid syntax error' + ); + + + } + + /** + * Helper function to test boolean operators within phrases preserved + * @covers Solr::escapeBoolean + */ + public function testBooleanEscaping(): void + { + // Boolean operators must remain unescaped; operands must never introduce Solr syntax errors. + $solr = new Solr('', ''); + $this->assertEquals( + 'dramatic AND literature', + $solr->escapeBoolean('dramatic AND literature') + ); + + $input = 'nature, AND and AND history'; + + $escaped = $solr->escapeBoolean($input); + + // Ensure no accidental escaping of operators + $this->assertStringContainsString(' AND ', $escaped); + + $this->assertSame( + 'nature, AND and AND history', + $escaped, + 'Boolean operators must be preserved and operands safely escaped' + ); + + // Ensure no quote imbalance + $this->assertSame( + 0, + substr_count($escaped, '"') % 2, + 'Escaped boolean expression must have balanced quotes' + ); + + // Hard safety check: no raw special chars that break Solr + $this->assertDoesNotMatchRegularExpression( + '/(?assertEquals( + 'machine*', + $solr->escapePrefix('machine*') + ); + } + + /** + * Helper function to test prefix preserved, not escaped away + * @covers Solr::escapeTerm + */ + public function testFieldInjectionBlocked() { + + $solr = new Solr('', ''); + $escaped = $solr->escapeTerm('title:evil'); + $this->assertEquals('title\\:evil', $escaped); + } + +} +?> \ No newline at end of file diff --git a/test/SolrQueryTest/SolrQueryFullPipelineTest.php b/test/SolrQueryTest/SolrQueryFullPipelineTest.php new file mode 100644 index 00000000..e7b8e48b --- /dev/null +++ b/test/SolrQueryTest/SolrQueryFullPipelineTest.php @@ -0,0 +1,99 @@ +assertNotEmpty($ss->search); + $this->assertEquals('title', $ss->search[0][0]); + $this->assertEquals("\\", $ss->search[0][1]); + + // Check cleaned_up_original_search + $this->assertNotEmpty($ss->cleaned_up_original_search); + $this->assertEquals('title', $ss->cleaned_up_original_search[0][0]); + $this->assertEquals('\\', $ss->cleaned_up_original_search[0][1]); + + // Verify backslash is preserved (not removed or double-escaped) + $this->assertStringContainsString('\\', $ss->search[0][1]); + + // ---------------test_backslash_through_full_pipeline--------------- + + // 1. Start with raw input + $rawInput = '\\'; + // fwrite(STDOUT, "1. Raw input: '$rawInput' (len=" . strlen($rawInput) . ")\n"); + + // 2. Through SearchStructure + $afterSS = $ss->search[0][1]; + // fwrite(STDOUT, "2. After SearchStructure: '$afterSS' (len=" . strlen($afterSS) . ")\n"); + + // 3. Through Solr (dismax) + + $solr = new Solr('', ''); + $args = $solr->dismaxSearchArguments($ss); + // args[0][0] is q + + // As \ is an invalid query, Solr should convert it to *:* + $this->assertEquals('q', $args[0][0], "Solr args field should be 'q'"); + $this->assertEquals('*:*', $args[0][1], "Solr args query should be '*:*' for invalid input"); + + $this->assertEquals('smart', $ss->fix_unbalanced_quotes('“smart')); // fancy quote are fixed in SearchStructure: + + $_REQUEST['lookfor'] = ['“smart']; // + $ss = new SearchStructure; + // Check that unbalanced fancy quote is fixed + $this->assertEquals('smart', $ss->search[0][1], "Unbalanced fancy quote are fixed in SearchStructure"); + + // dismaxSearchArguments receives the input query with unbalanced fancy quote + $solr = new Solr('', ''); + $args = $solr->dismaxSearchArguments($ss); + + $this->assertStringContainsString('smart', $args[0][1], "Solr.php receives fixed query without fancy quotes"); + // In $args[0][1] is expected the Solr query + // Required clauses (order-independent) + // Test example considering title query + $expectedClauses = [ + 'title_ab:(smart)^25000', // exactmatcher + 'title_a:(smart)^15000', // exactmatcher + 'titleProper:(smart*)^8000', // emstartswith + + 'titleProper:("smart")^1200', // onephrase field + + 'title_topProper:("smart")^600', // onephrase field + + 'series2:("smart")^500' // onephrase field + ]; + + foreach ($expectedClauses as $clause) { + $this->assertStringContainsString( + $clause, + $args[0][1], + "Missing expected Solr clause: {$clause}\n\nActual query:\n{$args[0][1]}" + ); + } + } + +} + +?> \ No newline at end of file diff --git a/test/SolrQueryTest/TokenizeInputTest.php b/test/SolrQueryTest/TokenizeInputTest.php new file mode 100644 index 00000000..5d0b0b53 --- /dev/null +++ b/test/SolrQueryTest/TokenizeInputTest.php @@ -0,0 +1,124 @@ +solr = new Solr('', ''); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesSimpleWords() + { + $tokens = $this->solr->tokenizeInput('table chair'); + + $this->assertSame( + ['table', 'chair'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesQuotedPhrase() + { + $tokens = $this->solr->tokenizeInput('"table chair"'); + + $this->assertSame( + ['"table chair"'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesFuzzyPhrase() + { + $tokens = $this->solr->tokenizeInput('"table chair"~2'); + + $this->assertSame( + ['"table chair"~2'], + $tokens, + 'Fuzzy phrase must be a single token' + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesUnquotedFuzzyTerm() + { + $tokens = $this->solr->tokenizeInput('table~2'); + + $this->assertSame( + ['table~2'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesMixedInput() + { + $tokens = $this->solr->tokenizeInput('table "chair leg"~3 desk'); + + $this->assertSame( + ['table', '"chair leg"~3', 'desk'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesBooleanOperators() + { + $tokens = $this->solr->tokenizeInput('table AND "chair leg"~2'); + + $this->assertSame( + ['table AND "chair leg"~2'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testTokenizesMultipleFuzzyPhrases() + { + $tokens = $this->solr->tokenizeInput('"table chair"~2 "wood table"~1'); + + $this->assertSame( + ['"table chair"~2', '"wood table"~1'], + $tokens + ); + } + + /** + * @covers Solr::tokenizeInput + */ + public function testDoesNotSplitQuotedPhraseWithSpaces() + { + $tokens = $this->solr->tokenizeInput('"a b c"~4'); + + $this->assertSame( + ['"a b c"~4'], + $tokens + ); + } +} +?> \ No newline at end of file diff --git a/test/SolrQueryTest/ValidateQueryInputTest.php b/test/SolrQueryTest/ValidateQueryInputTest.php new file mode 100644 index 00000000..9ae2f5f4 --- /dev/null +++ b/test/SolrQueryTest/ValidateQueryInputTest.php @@ -0,0 +1,352 @@ +solr = new Solr('', ''); + } + + /* ============================================================ + * remove_wildcards_add_beginning() + * ============================================================ + */ + + /** + * @covers Solr::remove_first_character + */ + public function testRemovesLeadingWildcard(): void + { + $this->assertSame( + 'table', + $this->solr->remove_first_character('*table') + ); + + $this->assertSame( + 'table', + $this->solr->remove_first_character('?table') + ); + } + + /** + * @covers Solr::remove_first_character + */ + public function testRemoveWildcardBlindlyRemovesFirstChar(): void + { + // This documents current behavior (important!) + $this->assertSame( + 'able', + $this->solr->remove_first_character('table') + ); + } + + /* ============================================================ + * remove_unbalanced_parentheses() + * ============================================================ + */ + + /** + * @covers Solr::remove_parentheses + */ + public function testRemovesAllParentheses(): void + { + $this->assertSame( + 'table test', + $this->solr->remove_parentheses('(table) (test)') + ); + } + + /** + * @covers Solr::remove_parentheses + */ + public function testRemovesNestedParentheses(): void + { + $this->assertSame( + 'table test', + $this->solr->remove_parentheses('((table) test)') + ); + } + + /* ============================================================ + * remove_invalid_caret_usage() + * ============================================================ + */ + + /** + * @covers Solr::remove_invalid_caret_usage + */ + public function testRemovesAllCarets(): void + { + $this->assertSame( + 'table2', + $this->solr->remove_invalid_caret_usage('table^2') + ); + + $this->assertSame( + 'tableabc', + $this->solr->remove_invalid_caret_usage('table^abc') + ); + } + + /* ============================================================ + * unwrapQuotedWildcard() + * ============================================================ + */ + + /** + * @covers Solr::unwrapQuotedWildcard + */ + public function testUnwrapQuotedWildcard(): void + { + $this->assertSame( + 'table*', + $this->solr->unwrapQuotedWildcard('"table"*') + ); + } + + /** + * @covers Solr::unwrapQuotedWildcard + */ + public function testUnwrapQuotedWildcardWithWhitespace(): void + { + $this->assertSame( + 'table*', + $this->solr->unwrapQuotedWildcard(' "table"* ') + ); + } + + /** + * @covers Solr::unwrapQuotedWildcard + */ + public function testUnwrapQuotedWildcardReturnsNullForInvalid(): void + { + $this->assertNull( + $this->solr->unwrapQuotedWildcard('"table"') + ); + + $this->assertNull( + $this->solr->unwrapQuotedWildcard('table*') + ); + + $this->assertNull( + $this->solr->unwrapQuotedWildcard('"table*"') + ); + } + + /* ============================================================ + * validateInput() + * ============================================================ + */ + + /** + * @covers Solr::validateInput + */ + + /** + * @covers Solr::validateInput + */ + public function testRejectsEmptyInput(): void + { + $result = $this->solr->validateInput(' '); + + $this->assertFalse($result['valid']); + $this->assertSame('Empty query', $result['error']); + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsGarbageOnlyInput(): void + { + $result = $this->solr->validateInput('~~~///'); + + $this->assertFalse($result['valid']); + $this->assertSame('Invalid garbage-only query', $result['error']); + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsInvalidSingleCharacter(): void + { + $this->assertFalse( + $this->solr->validateInput('~')['valid'] + ); + + $this->assertFalse( + $this->solr->validateInput('\\')['valid'] + ); + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsLeadingWildcard(): void + { + $result = $this->solr->validateInput('*table'); + + $this->assertFalse($result['valid']); + $this->assertSame('Leading wildcard not allowed', $result['error']); + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsUnbalancedParentheses(): void + { + $result = $this->solr->validateInput('(table'); + + $this->assertFalse($result['valid']); + $this->assertSame('Unbalanced parentheses', $result['error']); + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsInvalidBoostSyntax(): void + { + $this->assertFalse( + $this->solr->validateInput('table^')['valid'] + ); + + $this->assertFalse( + $this->solr->validateInput('table^abc')['valid'] + ); + } + + /** + * @covers Solr::validateInput + */ + public function testAcceptsValidBoostSyntax(): void + { + $this->assertTrue( + $this->solr->validateInput('table^2')['valid'] + ); + + $this->assertTrue( + $this->solr->validateInput('table^1.5')['valid'] + ); + } + + /** + * @covers Solr::validateInput + */ + public function testAcceptsValidQueries(): void + { + $validInputs = [ + 'table', + 'table test', + 'table AND test', + '(table test)', + 'title:"data science"', + 'table~2', + 'table~0.5', + '"data science"~2' + ]; + + foreach ($validInputs as $input) { + $this->assertTrue( + $this->solr->validateInput($input)['valid'], + "Expected valid input: {$input}" + ); + } + } + + /** + * @covers Solr::validateInput + */ + public function testRejectsInvalidFuzzyTerms(): void + { + $invalid = [ + 'table~~2', + 'table~abc', + 'table~2~', + '"table"~abc', + 'table^2^3', + 'table~' + ]; + + foreach ($invalid as $input) { + $result = $this->solr->validateInput($input); + $this->assertFalse( + $result['valid'], + "Invalid fuzzy syntax not allowed: {$input}" + ); + } + } + + /** ============================================================ + * Fielded query validation (field:value) + * ============================================================ + * This test is to check if the query input is valid. + * The Catalog search does not support this field syntax queries + * but as this query input is valid, we accept it. In general, the result of this query is empty or unexpected + + * @covers Solr::validateInput + */ + public function testAcceptsValidFieldedQueries(): void + { + $valid = [ + 'title:table', + 'author:smith', + 'title:"data science"', + 'title:table~2', + 'title:"machine learning"~3', + 'title:table^2', + 'title:table AND author:smith', + ]; + + foreach ($valid as $input) { + $this->assertTrue( + $this->solr->validateInput($input)['valid'], + "Expected valid fielded query: {$input}" + ); + } + } + + /** + * This test is to check if the query input is valid. + * The Catalog search does not support this field syntax queries + * When the query input is invalid a set of heuristics are apply to transform + * the invalid query into a valid one. + * @covers Solr::validateInput + */ + public function testRejectsInvalidFieldedQueries(): void + { + $invalid = [ + 'title:', + ':table', + 'title::table', + 'title:table^', + 'title:table^abc', + 'title:~2', + 'title:(table', + 'title:table)', + 'title:"table', + 'title:"table"~', + 'title:table~abc', + 'title:(table AND)' + ]; + + foreach ($invalid as $input) { + $result = $this->solr->validateInput($input); + $this->assertFalse( + $result['valid'], + "Expected invalid fielded query: {$input}" + ); + } + } + + + +} +?> \ No newline at end of file diff --git a/test/SolrTest.php b/test/SolrTest.php index 2cc70b1b..6928de43 100644 --- a/test/SolrTest.php +++ b/test/SolrTest.php @@ -23,6 +23,80 @@ public function test_exactmatcherify(): void $this->assertEquals('*?', $solr->exactmatcherify('!@#$%^&*()-=_+,.<>/?')); $this->assertEquals('日本', $solr->exactmatcherify('日本')); } + + /** + * @covers Solr::quoteFilterValue + */ + public function test_quoteFilterValue_escapes_internal_quotes(): void + { + $solr = new Solr('', ''); + + // Test normal value + $result = $solr->quoteFilterValue('Smith, John'); + $this->assertEquals('"Smith, John"', $result); + + // Test value with quotes + $result = $solr->quoteFilterValue('"Kao gu yu wen wu" bian ji bu'); + $this->assertEquals('"\\"Kao gu yu wen wu\\" bian ji bu"', $result); + + // Test value with backslash + $result = $solr->quoteFilterValue('\Kao gu yu wen wu bian ji bu'); + $this->assertEquals('"\\\\Kao gu yu wen wu bian ji bu"', $result); + + // Test date range (should not be quoted) + $result = $solr->quoteFilterValue('[1900 TO 2000]'); + $this->assertEquals('[1900 TO 2000]', $result); + } + + /** + * @covers Solr::quoteFilterValue + */ + public function testUnwrapQuotedWildcard(): void + { + $solr = new Solr('', ''); + + $this->assertSame('table*', $solr->unwrapQuotedWildcard('"table"*')); + $this->assertSame('machine learning*', $solr->unwrapQuotedWildcard('"machine learning"*')); + $this->assertNull($solr->unwrapQuotedWildcard('"table"')); + $this->assertNull($solr->unwrapQuotedWildcard('table*')); + $this->assertNull($solr->unwrapQuotedWildcard('"*"')); + } + + /** + * remove_quotes() + * @covers Solr::remove_quotes + */ + public function testRemoveQuotes(): void + { + $solr = new Solr('', ''); + + $this->assertSame( + 'table', + $solr->remove_quotes('"table"') + ); + + $this->assertSame( + 'table', + $solr->remove_quotes(' "table" ') + ); + + $this->assertSame( + 'table name', + $solr->remove_quotes('"table name"') + ); + + $this->assertSame( + 'table "name"', + $solr->remove_quotes('table "name"') + ); + + $this->assertSame( + 'table"name', + $solr->remove_quotes('"table"name"') + ); + } + + } ?>