-
Notifications
You must be signed in to change notification settings - Fork 637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for searching stopwords as keywords #9117
Changes from all commits
484720f
3b6a414
5d6f310
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
-- Remove the `tsvector_agg(tsvector)` aggregate. `IF EXISTS` keeps this statement from failing
-- when the aggregate has already been dropped (e.g. a partially applied revert).
DROP AGGREGATE IF EXISTS tsvector_agg (tsvector);
|
||
-- Trigger function that rebuilds `textsearchable_index_col` for the row being written.
-- The crate's keywords are joined into one comma-separated string and parsed with the
-- `pg_catalog.english` configuration. Weights: name = A, keywords = B, description = C,
-- readme = D.
CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
DECLARE
  keyword_list TEXT;
BEGIN
  -- NULL when the crate has no keywords; the coalesce below turns that into ''.
  keyword_list := (
    SELECT string_agg(keywords.keyword, ',')
    FROM crates_keywords
    INNER JOIN keywords
      ON keywords.id = crates_keywords.keyword_id
    WHERE crates_keywords.crate_id = NEW.id
  );

  NEW.textsearchable_index_col :=
    setweight(to_tsvector('pg_catalog.english', coalesce(NEW.name, '')), 'A')
    || setweight(to_tsvector('pg_catalog.english', coalesce(keyword_list, '')), 'B')
    || setweight(to_tsvector('pg_catalog.english', coalesce(NEW.description, '')), 'C')
    || setweight(to_tsvector('pg_catalog.english', coalesce(NEW.readme, '')), 'D');

  RETURN NEW;
END
$$ LANGUAGE plpgsql;
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
-- `tsvector_agg` folds any number of `tsvector` rows into a single `tsvector`. Starting from an
-- empty tsvector, each input row is appended with `pg_catalog.tsvector_concat`, i.e. tsvector
-- concatenation.
CREATE OR REPLACE AGGREGATE tsvector_agg (tsvector) (
  SFUNC = pg_catalog.tsvector_concat,
  STYPE = pg_catalog.tsvector,
  INITCOND = ''
);
-- e.g. | ||
-- WITH expected AS ( | ||
-- SELECT | ||
-- 'macro:1'::tsvector || 'any:1'::tsvector AS concat | ||
-- ), | ||
-- data as ( | ||
-- SELECT * | ||
-- FROM ( | ||
-- VALUES | ||
-- ('macro:1' :: tsvector), | ||
-- ('any:1' :: tsvector) | ||
-- ) k(tv) | ||
-- ) | ||
-- SELECT | ||
-- ( SELECT concat FROM expected ), | ||
-- ( SELECT tsvector_agg(tv) FROM data ) AS agg, | ||
-- ( SELECT concat FROM expected ) = ( | ||
-- SELECT tsvector_agg(tv) FROM data | ||
-- ) AS is_eq; | ||
-- | ||
-- EOF | ||
-- concat | agg | is_eq | ||
-- -------------------+-------------------+------- | ||
-- 'any':2 'macro':1 | 'any':2 'macro':1 | t | ||
-- (1 row) | ||
|
||
-- Trigger function that rebuilds `crates.textsearchable_index_col` for the row being written.
-- Keywords that the `english` configuration reduces to nothing (stop words) would otherwise be
-- lost, so they are stored verbatim by casting `keyword || ':1'` to tsvector.
-- Weights: name = A, keywords = B, description = C, readme = D.
CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
DECLARE
  keyword_vector tsvector;
BEGIN
  SELECT
    tsvector_agg(
      CASE
        -- An empty parse result means the keyword was dropped as a stop word.
        WHEN length(parsed.lexemes) = 0 THEN (parsed.keyword || ':1')::tsvector
        ELSE parsed.lexemes
      END
      -- Concatenation shifts lexeme positions, so aggregate in a fixed order.
      ORDER BY parsed.keyword
    )
  INTO keyword_vector
  FROM (
    SELECT
      keywords.keyword,
      to_tsvector('english', keywords.keyword) AS lexemes
    FROM crates_keywords
    INNER JOIN keywords
      ON keywords.id = crates_keywords.keyword_id
    WHERE crates_keywords.crate_id = NEW.id
  ) AS parsed;

  NEW.textsearchable_index_col :=
    setweight(to_tsvector('pg_catalog.english', coalesce(NEW.name, '')), 'A')
    || setweight(keyword_vector, 'B')
    || setweight(to_tsvector('pg_catalog.english', coalesce(NEW.description, '')), 'C')
    || setweight(to_tsvector('pg_catalog.english', coalesce(NEW.readme, '')), 'D');

  RETURN NEW;
END
$$ LANGUAGE plpgsql;
|
||
|
||
-- We could update those crates with the following sql | ||
-- | ||
-- WITH keywords_with_stopwords as ( | ||
-- SELECT crate_id, keyword | ||
-- FROM keywords INNER JOIN crates_keywords | ||
-- ON id = keyword_id | ||
-- WHERE length(to_tsvector('english', keyword)) = 0 | ||
-- ) | ||
-- UPDATE crates | ||
-- SET updated_at = updated_at | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we not have to set any other columns? 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, because we don't want to modify the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just to be sure, I assume you checked that this actually does not update the (I'm a bit confused by how the automatic update is currently implemented... 🫣) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, this is quite opaque as it relies on a trigger behind the scenes. SELECT event_object_table, action_order, trigger_name, event_manipulation
FROM information_schema.triggers
WHERE event_object_table = 'crates'
;
And yes, the results of the following, which updates regardless of whether the record exists or not, indicate that no date && psql cargo_registry <<EOF
WITH keywords_with_stopwords as (
SELECT crate_id, keyword
FROM keywords JOIN crates_keywords ON id = keyword_id
WHERE length(to_tsvector('english', keyword)) = 0
), upt as (
UPDATE crates
SET updated_at = updated_at
FROM keywords_with_stopwords
WHERE id = crate_id
-- AND NOT (keyword || ':B')::tsquery @@ textsearchable_index_col
returning id
)
SELECT min(updated_at), max(updated_at), count(*)
FROM crates
WHERE id in (SELECT * from upt)
;
EOF
Tue Jul 23 16:07:26 CST 2024
min | max | count
----------------------------+----------------------------+-------
2015-12-16 00:01:49.263868 | 2024-07-13 01:40:09.733953 | 396
(1 row) |
||
-- FROM keywords_with_stopwords | ||
-- WHERE id = crate_id AND NOT (keyword || ':B')::tsquery @@ textsearchable_index_col | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since our keyword doesn't contain spaces, it's safe to directly cast it to a |
||
-- ; |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,12 +3,13 @@ | |
use crate::auth::AuthCheck; | ||
use diesel::dsl::*; | ||
use diesel::sql_types::{Array, Bool, Text}; | ||
use diesel_full_text_search::configuration::TsConfigurationByName; | ||
use diesel_full_text_search::*; | ||
use once_cell::sync::OnceCell; | ||
|
||
use crate::controllers::cargo_prelude::*; | ||
use crate::controllers::helpers::Paginate; | ||
use crate::models::{Crate, CrateOwner, CrateVersions, OwnerKind, TopVersions, Version}; | ||
use crate::models::{Crate, CrateOwner, CrateVersions, Keyword, OwnerKind, TopVersions, Version}; | ||
use crate::schema::*; | ||
use crate::util::errors::bad_request; | ||
use crate::views::EncodableCrate; | ||
|
@@ -92,9 +93,25 @@ pub async fn search(app: AppState, req: Parts) -> AppResult<Json<Value>> { | |
query = query.order(Crate::with_name(q_string).desc()); | ||
|
||
if sort == "relevance" { | ||
let q = sql::<TsQuery>("plainto_tsquery('english', ") | ||
// If the query string is not a stop word, search using `plainto_tsquery(...)`. | ||
// Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword). | ||
// Otherwise, search using `null::tsquery`. | ||
Comment on lines
+96
to
+98
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what happens if the query string consists of multiple terms and only a subset of them are stopwords? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's consider two terms separated by either a dash or a space. WITH data AS (
SELECT *
FROM
(
VALUES
('any-one'),
('any one'),
('one-any'),
('one any')
) q(s)
)
SELECT
s,
length(
to_tsvector('english', s)
),
plainto_tsquery('english', s),
CASE WHEN (
(
length(
to_tsvector('english', s)
) != 0
)
) THEN ('plainto_tsquery')
WHEN ('t') THEN ('cast')
ELSE 'null' END
FROM data;
we can see that all searches should be performed using the old method with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, but doesn't that mean then that the PR only solves the very specific case of searching for a single stopword? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately, yes, I was unaware that
I lean towards just using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's worth noting that if we don't limit the scope to keyword ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
okay, sorry for the wasted effort :-/ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No worries 🙂 |
||
let qs = sql::<TsQuery>("plainto_tsquery('english', ") | ||
.bind::<Text, _>(q_string) | ||
.sql(")"); | ||
Comment on lines
+99
to
101
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: This could also be refactored to |
||
let qs_keyword = sql::<TsQuery>("(") | ||
.bind::<Text, _>(q_string) | ||
.sql("::text || ':B')::tsquery"); | ||
let q = case_when( | ||
length(to_tsvector_with_search_config::<Text, _, _>( | ||
TsConfigurationByName("english"), | ||
q_string, | ||
)) | ||
Comment on lines
+106
to
+109
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could also construct the sql ourselves if we have concerns about it. |
||
.ne(0), | ||
qs, | ||
) | ||
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword) | ||
.otherwise(sql::<TsQuery>("NULL::tsquery")); | ||
let rank = ts_rank_cd(crates::textsearchable_index_col, q); | ||
query = query.select(( | ||
ALL_COLUMNS, | ||
|
@@ -298,9 +315,25 @@ impl<'a> FilterParams<'a> { | |
|
||
if let Some(q_string) = self.q_string { | ||
if !q_string.is_empty() { | ||
let q = sql::<TsQuery>("plainto_tsquery('english', ") | ||
// If the query string is not a stop word, search using `plainto_tsquery(...)`. | ||
// Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword). | ||
// Otherwise, search using `null::tsquery`. | ||
let qs = sql::<TsQuery>("plainto_tsquery('english', ") | ||
.bind::<Text, _>(q_string) | ||
.sql(")"); | ||
let qs_keyword = sql::<TsQuery>("(") | ||
.bind::<Text, _>(q_string) | ||
.sql("::text || ':B')::tsquery"); | ||
let q = case_when( | ||
length(to_tsvector_with_search_config::<Text, _, _>( | ||
TsConfigurationByName("english"), | ||
q_string, | ||
)) | ||
.ne(0), | ||
qs, | ||
) | ||
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword) | ||
.otherwise(sql::<TsQuery>("NULL::tsquery")); | ||
query = query.filter( | ||
q.matches(crates::textsearchable_index_col) | ||
.or(Crate::loosly_matches_name(q_string)), | ||
|
@@ -518,10 +551,26 @@ impl<'a> FilterParams<'a> { | |
// `WHERE (exact_match = exact_match' AND rank = rank' AND name > name') | ||
// OR (exact_match = exact_match' AND rank < rank') | ||
// OR exact_match < exact_match'` | ||
// If the query string is not a stop word, search using `plainto_tsquery(...)`. | ||
// Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword). | ||
// Otherwise, search using `null::tsquery`. | ||
let q_string = self.q_string.expect("q_string should not be None"); | ||
let q = sql::<TsQuery>("plainto_tsquery('english', ") | ||
let qs = sql::<TsQuery>("plainto_tsquery('english', ") | ||
.bind::<Text, _>(q_string) | ||
.sql(")"); | ||
let qs_keyword = sql::<TsQuery>("(") | ||
.bind::<Text, _>(q_string) | ||
.sql("::text || ':B')::tsquery"); | ||
let q = case_when( | ||
length(to_tsvector_with_search_config::<Text, _, _>( | ||
TsConfigurationByName("english"), | ||
q_string, | ||
)) | ||
.ne(0), | ||
qs, | ||
) | ||
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword) | ||
.otherwise(sql::<TsQuery>("NULL::tsquery")); | ||
let rank = ts_rank_cd(crates::textsearchable_index_col, q); | ||
let name_exact_match = Crate::with_name(q_string); | ||
vec![ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what does this do? could probably use a comment :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an aggregation function that combines multiple rows of `tsvector` data into a single `tsvector` using the `tsvector` concat operator (`||`), e.g.