diff --git a/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/down.sql b/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/down.sql
new file mode 100644
index 00000000000..45f55c9b9a5
--- /dev/null
+++ b/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/down.sql
@@ -0,0 +1,19 @@
+-- Restore the pre-migration trigger function first, so that the trigger function no longer
+-- references `tsvector_agg` by the time the aggregate is dropped below.
+CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
+DECLARE kws TEXT;
+begin
+  SELECT array_to_string(array_agg(keyword), ',') INTO kws
+    FROM keywords INNER JOIN crates_keywords
+    ON keywords.id = crates_keywords.keyword_id
+    WHERE crates_keywords.crate_id = new.id;
+  new.textsearchable_index_col :=
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.name, '')), 'A') ||
+    setweight(to_tsvector('pg_catalog.english', coalesce(kws, '')), 'B') ||
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.description, '')), 'C') ||
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.readme, '')), 'D');
+  return new;
+end
+$$ LANGUAGE plpgsql;
+
+DROP AGGREGATE IF EXISTS tsvector_agg (tsvector);
diff --git a/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/up.sql b/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/up.sql
new file mode 100644
index 00000000000..cb0a4aa9366
--- /dev/null
+++ b/migrations/2024-07-14-074318_enable-stopwords-in-textsearchable_index_col/up.sql
@@ -0,0 +1,78 @@
+-- `tsvector_agg` is an aggregate function that combines multiple rows of tsvector data into a
+-- single tsvector using the tsvector concat operator (`pg_catalog.tsvector_concat`).
+CREATE OR REPLACE AGGREGATE tsvector_agg (tsvector) (
+  STYPE = pg_catalog.tsvector,
+  SFUNC = pg_catalog.tsvector_concat,
+  INITCOND = ''
+);
+-- e.g.
+-- WITH expected AS (
+-- SELECT
+-- 'macro:1'::tsvector || 'any:1'::tsvector AS concat
+-- ),
+-- data as (
+-- SELECT *
+-- FROM (
+-- VALUES
+-- ('macro:1' :: tsvector),
+-- ('any:1' :: tsvector)
+-- ) k(tv)
+-- )
+-- SELECT
+-- ( SELECT concat FROM expected ),
+-- ( SELECT tsvector_agg(tv) FROM data ) AS agg,
+-- ( SELECT concat FROM expected ) = (
+-- SELECT tsvector_agg(tv) FROM data
+-- ) AS is_eq;
+--
+-- EOF
+-- concat | agg | is_eq
+-- -------------------+-------------------+-------
+-- 'any':2 'macro':1 | 'any':2 'macro':1 | t
+-- (1 row)
+
+-- Add support for storing keywords considered stopwords in `crates.textsearchable_index_col`:
+-- when `to_tsvector` yields an empty tsvector for a keyword, the keyword is cast to tsvector
+-- directly instead, so that it still ends up in the index with weight B.
+CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
+DECLARE kws tsvector;
+begin
+  -- `to_tsvector` is evaluated once per keyword in the subquery and reused by the CASE below.
+  SELECT
+    tsvector_agg(
+      CASE WHEN length(kw.tsv) != 0 THEN kw.tsv
+           ELSE (kw.keyword || ':1')::tsvector
+      END
+      ORDER BY kw.keyword
+    ) INTO kws
+  FROM (
+    SELECT keyword, to_tsvector('pg_catalog.english', keyword) AS tsv
+    FROM keywords INNER JOIN crates_keywords
+      ON keywords.id = crates_keywords.keyword_id
+    WHERE crates_keywords.crate_id = new.id
+  ) AS kw;
+  new.textsearchable_index_col :=
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.name, '')), 'A') ||
+    -- Defensive: a NULL `kws` would turn the whole concatenation into NULL.
+    setweight(coalesce(kws, ''::tsvector), 'B') ||
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.description, '')), 'C') ||
+    setweight(to_tsvector('pg_catalog.english', coalesce(new.readme, '')), 'D')
+  ;
+  return new;
+end
+$$ LANGUAGE plpgsql;
+
+
+-- We could update those crates with the following sql
+--
+-- WITH keywords_with_stopwords as (
+-- SELECT crate_id, keyword
+-- FROM keywords INNER JOIN crates_keywords
+-- ON id = keyword_id
+-- WHERE length(to_tsvector('english', keyword)) = 0
+-- )
+-- UPDATE crates
+-- SET updated_at = updated_at
+-- FROM keywords_with_stopwords
+-- WHERE id = crate_id AND NOT (keyword || ':B')::tsquery @@ textsearchable_index_col
+-- ;
diff --git a/src/controllers/krate/search.rs b/src/controllers/krate/search.rs index 
9ae609631be..bc55ffb3951 100644 --- a/src/controllers/krate/search.rs +++ b/src/controllers/krate/search.rs @@ -3,12 +3,13 @@ use crate::auth::AuthCheck; use diesel::dsl::*; use diesel::sql_types::{Array, Bool, Text}; +use diesel_full_text_search::configuration::TsConfigurationByName; use diesel_full_text_search::*; use once_cell::sync::OnceCell; use crate::controllers::cargo_prelude::*; use crate::controllers::helpers::Paginate; -use crate::models::{Crate, CrateOwner, CrateVersions, OwnerKind, TopVersions, Version}; +use crate::models::{Crate, CrateOwner, CrateVersions, Keyword, OwnerKind, TopVersions, Version}; use crate::schema::*; use crate::util::errors::bad_request; use crate::views::EncodableCrate; @@ -92,9 +93,25 @@ pub async fn search(app: AppState, req: Parts) -> AppResult> { query = query.order(Crate::with_name(q_string).desc()); if sort == "relevance" { - let q = sql::("plainto_tsquery('english', ") + // If the query string is not a stop word, search using `plainto_tsquery(...)`. + // Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword). + // Otherwise, search using `null::tsquery`. + let qs = sql::("plainto_tsquery('english', ") .bind::(q_string) .sql(")"); + let qs_keyword = sql::("(") + .bind::(q_string) + .sql("::text || ':B')::tsquery"); + let q = case_when( + length(to_tsvector_with_search_config::( + TsConfigurationByName("english"), + q_string, + )) + .ne(0), + qs, + ) + .when(Keyword::valid_name(q_string).into_sql::(), qs_keyword) + .otherwise(sql::("NULL::tsquery")); let rank = ts_rank_cd(crates::textsearchable_index_col, q); query = query.select(( ALL_COLUMNS, @@ -298,9 +315,25 @@ impl<'a> FilterParams<'a> { if let Some(q_string) = self.q_string { if !q_string.is_empty() { - let q = sql::("plainto_tsquery('english', ") + // If the query string is not a stop word, search using `plainto_tsquery(...)`. + // Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword). 
+ // Otherwise, search using `NULL::tsquery`. + let qs = sql::("plainto_tsquery('english', ") .bind::(q_string) .sql(")"); + let qs_keyword = sql::("(") + .bind::(q_string) + .sql("::text || ':B')::tsquery"); + let q = case_when( + length(to_tsvector_with_search_config::( + TsConfigurationByName("english"), + q_string, + )) + .ne(0), + qs, + ) + .when(Keyword::valid_name(q_string).into_sql::(), qs_keyword) + .otherwise(sql::("NULL::tsquery")); query = query.filter( q.matches(crates::textsearchable_index_col) .or(Crate::loosly_matches_name(q_string)), @@ -518,10 +551,26 @@ impl<'a> FilterParams<'a> { // `WHERE (exact_match = exact_match' AND rank = rank' AND name > name') // OR (exact_match = exact_match' AND rank < rank') // OR exact_match < exact_match'` + // If the query string is not a stop word, search using `plainto_tsquery(...)`. + // Else if it is a valid keyword, search by casting it to `tsquery` with weight `B` (the weight keywords are indexed with). + // Otherwise, search using `NULL::tsquery`. let q_string = self.q_string.expect("q_string should not be None"); - let q = sql::("plainto_tsquery('english', ") + let qs = sql::("plainto_tsquery('english', ") .bind::(q_string) .sql(")"); + let qs_keyword = sql::("(") + .bind::(q_string) + .sql("::text || ':B')::tsquery"); + let q = case_when( + length(to_tsvector_with_search_config::( + TsConfigurationByName("english"), + q_string, + )) + .ne(0), + qs, + ) + .when(Keyword::valid_name(q_string).into_sql::(), qs_keyword) + .otherwise(sql::("NULL::tsquery")); let rank = ts_rank_cd(crates::textsearchable_index_col, q); let name_exact_match = Crate::with_name(q_string); vec![ diff --git a/src/tests/routes/crates/list.rs b/src/tests/routes/crates/list.rs index 1682df703a5..87584522319 100644 --- a/src/tests/routes/crates/list.rs +++ b/src/tests/routes/crates/list.rs @@ -1084,6 +1084,62 @@ async fn crates_by_user_id_not_including_deleted_owners() { } } +#[tokio::test(flavor = "multi_thread")] +async fn crates_with_stopword_keyword() { + let (app, 
anon, user) = TestApp::init().with_user(); + let user = user.as_model(); + app.db(|conn| { + CrateBuilder::new("any", user.id) + .readme("readme") + .description("description") + .keyword("kw1") + .expect_build(conn); + + CrateBuilder::new("short-stopword", user.id) + .keyword("kw1") + .keyword("an") + .expect_build(conn); + + CrateBuilder::new("ANY_INDEX_QUERIES", user.id) + .keyword("KW1") + .expect_build(conn); + + CrateBuilder::new("foo-kw-is-stopword", user.id) + .keyword("any") + .keyword("kw3") + .expect_build(conn); + + CrateBuilder::new("bar-kw-is-stopword", user.id) + .keyword("any") + .keyword("kw1") + .expect_build(conn); + }); + + for json in search_both(&anon, "q=any").await { + assert_eq!(json.crates.len(), 4); + assert_eq!(json.meta.total, 4); + assert_eq!(json.crates[0].name, "any"); + assert_eq!(json.crates[1].name, "bar-kw-is-stopword"); + assert_eq!(json.crates[2].name, "foo-kw-is-stopword"); + assert_eq!(json.crates[3].name, "ANY_INDEX_QUERIES"); + } + + for json in search_both(&anon, "q=an").await { + assert_eq!(json.crates.len(), 3); + assert_eq!(json.meta.total, 3); + assert_eq!(json.crates[0].name, "short-stopword"); + assert_eq!(json.crates[1].name, "ANY_INDEX_QUERIES"); + assert_eq!(json.crates[2].name, "any"); + } + + // Both `an` and `any` are stopwords, so the `plainto_tsquery(...)` branch is not taken for `an any`. + // The query string `an any` is not a valid keyword either, so no crate is expected to match. + for json in search_both(&anon, "q=an%20any").await { + assert_eq!(json.crates.len(), 0); + assert_eq!(json.meta.total, 0); + } +} + static PAGE_RE: Lazy = Lazy::new(|| Regex::new(r"((?:^page|&page|\?page)=\d+)").unwrap()); // search with both offset-based (prepend with `page=1` query) and seek-based pagination