Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for searching stopwords as keywords #9117

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
DROP aggregate tsvector_agg (tsvector);

CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
DECLARE kws TEXT;
begin
SELECT array_to_string(array_agg(keyword), ',') INTO kws
FROM keywords INNER JOIN crates_keywords
ON keywords.id = crates_keywords.keyword_id
WHERE crates_keywords.crate_id = new.id;
new.textsearchable_index_col :=
setweight(to_tsvector('pg_catalog.english', coalesce(new.name, '')), 'A') ||
setweight(to_tsvector('pg_catalog.english', coalesce(kws, '')), 'B') ||
setweight(to_tsvector('pg_catalog.english', coalesce(new.description, '')), 'C') ||
setweight(to_tsvector('pg_catalog.english', coalesce(new.readme, '')), 'D');
return new;
end
$$ LANGUAGE plpgsql;
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
-- This is an aggregation function that combines multiple rows of tsvector data into a single tsvector
-- using the tsvector concat operator.
CREATE OR REPLACE aggregate tsvector_agg (tsvector) (
STYPE = pg_catalog.tsvector,
SFUNC = pg_catalog.tsvector_concat,
INITCOND = ''
);
Comment on lines +3 to +7
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does this do? could probably use a comment :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an aggregation function that combines multiple rows of tsvector data into a single tsvector using the tsvector concat operator (||).

e.g.

WITH expected AS (
  SELECT
    'macro:1'::tsvector || 'any:1'::tsvector AS concat
),
data as (
  SELECT *
  FROM (
    VALUES
      ('macro:1' :: tsvector),
      ('any:1' :: tsvector)
  ) k(tv)
)
SELECT
  ( SELECT concat FROM expected ),
  ( SELECT tsvector_agg(tv) FROM data ) AS agg,
  ( SELECT concat FROM expected ) = (
    SELECT tsvector_agg(tv) FROM data
  ) AS is_eq;
      concat       |        agg        | is_eq
-------------------+-------------------+-------
 'any':2 'macro':1 | 'any':2 'macro':1 | t
(1 row)

-- e.g.
-- WITH expected AS (
-- SELECT
-- 'macro:1'::tsvector || 'any:1'::tsvector AS concat
-- ),
-- data as (
-- SELECT *
-- FROM (
-- VALUES
-- ('macro:1' :: tsvector),
-- ('any:1' :: tsvector)
-- ) k(tv)
-- )
-- SELECT
-- ( SELECT concat FROM expected ),
-- ( SELECT tsvector_agg(tv) FROM data ) AS agg,
-- ( SELECT concat FROM expected ) = (
-- SELECT tsvector_agg(tv) FROM data
-- ) AS is_eq;
--
-- EOF
-- concat | agg | is_eq
-- -------------------+-------------------+-------
-- 'any':2 'macro':1 | 'any':2 'macro':1 | t
-- (1 row)

-- Add support for storing keywords considered stopwords in `crates.textsearchable_index_col` by casting
-- to tsvector
CREATE OR REPLACE FUNCTION trigger_crates_name_search() RETURNS trigger AS $$
DECLARE kws tsvector;
begin
SELECT
tsvector_agg(
CASE WHEN length(to_tsvector('english', keyword)) != 0 THEN to_tsvector('english', keyword)
ELSE (keyword || ':1')::tsvector
END
ORDER BY keyword
) INTO kws
FROM keywords INNER JOIN crates_keywords
ON keywords.id = crates_keywords.keyword_id
WHERE crates_keywords.crate_id = new.id;
new.textsearchable_index_col :=
setweight(to_tsvector('pg_catalog.english', coalesce(new.name, '')), 'A') ||
setweight(kws, 'B') ||
setweight(to_tsvector('pg_catalog.english', coalesce(new.description, '')), 'C') ||
setweight(to_tsvector('pg_catalog.english', coalesce(new.readme, '')), 'D')
;
return new;
end
$$ LANGUAGE plpgsql;


-- We could update those crates with the following sql
--
-- WITH keywords_with_stopwords as (
-- SELECT crate_id, keyword
-- FROM keywords INNER JOIN crates_keywords
-- ON id = keyword_id
-- WHERE length(to_tsvector('english', keyword)) = 0
-- )
-- UPDATE crates
-- SET updated_at = updated_at
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we not have to set any other columns? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, because we don't want to modify the updated_at value. We only need to trigger trigger_crates_tsvector_update to update the textsearchable_index_col column, and I think this is sufficient.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just to be sure, I assume you checked that this actually does not update the updated_at column? 😅

(I'm a bit confused by how the automatic update is currently implemented... 🫣)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I'm a bit confused by how the automatic update is currently implemented... 🫣)

Yes, this is quite opaque as it relies on a trigger behind the scenes.
We could inspect the triggers with the following sql:

SELECT event_object_table, action_order, trigger_name, event_manipulation
FROM information_schema.triggers
WHERE event_object_table = 'crates'
;
 event_object_table | action_order |              trigger_name              | event_manipulation 
--------------------+--------------+----------------------------------------+--------------------
 crates             |            1 | insert_crate_downloads_row             | INSERT
 crates             |            1 | trigger_crates_tsvector_update         | INSERT
 crates             |            2 | trigger_ensure_crate_name_not_reserved | INSERT
 crates             |            1 | trigger_crates_set_updated_at          | UPDATE
 crates             |            2 | trigger_crates_tsvector_update         | UPDATE
 crates             |            3 | trigger_ensure_crate_name_not_reserved | UPDATE
(6 rows)

just to be sure, I assume you checked that this actually does not update the updated_at column? 😅

And yes, the results of the following, which updates regardless of whether the record exists or not, indicate that no updated_at value is currently current_datetime:

date && psql cargo_registry <<EOF
WITH keywords_with_stopwords as (
        SELECT crate_id, keyword
        FROM keywords JOIN crates_keywords ON id = keyword_id
        WHERE length(to_tsvector('english', keyword)) = 0
), upt as (
  UPDATE crates
  SET updated_at = updated_at
  FROM keywords_with_stopwords
  WHERE id = crate_id
  -- AND NOT (keyword || ':B')::tsquery @@ textsearchable_index_col
  returning id
)
SELECT min(updated_at), max(updated_at), count(*)
FROM crates
WHERE id in (SELECT * from upt)
;

EOF
Tue Jul 23 16:07:26 CST 2024
            min             |            max             | count 
----------------------------+----------------------------+-------
 2015-12-16 00:01:49.263868 | 2024-07-13 01:40:09.733953 |   396
(1 row)

-- FROM keywords_with_stopwords
-- WHERE id = crate_id AND NOT (keyword || ':B')::tsquery @@ textsearchable_index_col
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since our keyword doesn't contain spaces, it's safe to directly cast it to a tsquery.

-- ;
57 changes: 53 additions & 4 deletions src/controllers/krate/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
use crate::auth::AuthCheck;
use diesel::dsl::*;
use diesel::sql_types::{Array, Bool, Text};
use diesel_full_text_search::configuration::TsConfigurationByName;
use diesel_full_text_search::*;
use once_cell::sync::OnceCell;

use crate::controllers::cargo_prelude::*;
use crate::controllers::helpers::Paginate;
use crate::models::{Crate, CrateOwner, CrateVersions, OwnerKind, TopVersions, Version};
use crate::models::{Crate, CrateOwner, CrateVersions, Keyword, OwnerKind, TopVersions, Version};
use crate::schema::*;
use crate::util::errors::bad_request;
use crate::views::EncodableCrate;
Expand Down Expand Up @@ -92,9 +93,25 @@ pub async fn search(app: AppState, req: Parts) -> AppResult<Json<Value>> {
query = query.order(Crate::with_name(q_string).desc());

if sort == "relevance" {
let q = sql::<TsQuery>("plainto_tsquery('english', ")
// If the query string is not a stop word, search using `plainto_tsquery(...)`.
// Else if the it is a valid keyword, search by casting it to `tsquery` with weight B(keyword).
// Otherwise, search using `null::tsquery`.
Comment on lines +96 to +98
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happens if the query string consists of multiple terms and only a subset of them are stopwords?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's consider two terms separated by either a dash or a space.
With the following sql:

WITH data AS (    
  SELECT *
  FROM                                                
    (
      VALUES
        ('any-one'),
        ('any one'),
        ('one-any'),
        ('one any')           
    ) q(s)                 
)
SELECT
  s,                              
  length(                                      
    to_tsvector('english', s)        
  ),                                 
  plainto_tsquery('english', s),
  CASE WHEN (
    (
      length(
        to_tsvector('english', s)
      ) != 0
    )
  ) THEN ('plainto_tsquery')
  WHEN ('t') THEN ('cast')
  ELSE 'null' END
FROM  data;
    s    | length |  plainto_tsquery  |      case       
---------+--------+-------------------+-----------------
 any-one |      2 | 'any-on' & 'one'  | plainto_tsquery
 any one |      1 | 'one'             | plainto_tsquery
 one-any |      2 | 'one-ani' & 'one' | plainto_tsquery
 one any |      1 | 'one'             | plainto_tsquery
(4 rows)

we can see that all searches should be performed using the old method withplainto_tsquery because the length of to_tsvector is not 0. This indicates that the results are either a single non-stopword stem (if separated by a space) or multiple non-stopword stems (if separated by a dash).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, but doesn't that mean then that the PR only solves the very specific case of searching for a single stopword?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, yes, I was unaware that keyword: was available in the frontend, and the old issue is still open. 😓


#9117 (comment)

Oh, indeed! Perhaps we should add a help button or document somewhere to inform users about some of these search tricks?

I lean towards just using keyword:. Feel free to close it as it's not planned.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's worth noting that if we don't limit the scope to keyword (:B) some stopwords like a, an or others that might frequently appear in the readme of multiple crates could become noise.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I lean towards just using keyword:. Feel free to close it as it's not planned.

okay, sorry for the wasted effort :-/

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries 🙂‍↔️

let qs = sql::<TsQuery>("plainto_tsquery('english', ")
.bind::<Text, _>(q_string)
.sql(")");
Comment on lines +99 to 101
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: This could also be refactored to plainto_tsquery_with_search_config, which I've implemented upstream in diesel-rs/diesel_full_text_search#41. I incorrectly used to_tsquery_with_search_config in the #7941 , which is not equivalent to plainto_tsquery and caused an issue #8052 . And since carol has concerns about it so I haven't modified it in this PR.

let qs_keyword = sql::<TsQuery>("(")
.bind::<Text, _>(q_string)
.sql("::text || ':B')::tsquery");
let q = case_when(
length(to_tsvector_with_search_config::<Text, _, _>(
TsConfigurationByName("english"),
q_string,
))
Comment on lines +106 to +109
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also construct the sql ourselves if we have concerns about it.

.ne(0),
qs,
)
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword)
.otherwise(sql::<TsQuery>("NULL::tsquery"));
let rank = ts_rank_cd(crates::textsearchable_index_col, q);
query = query.select((
ALL_COLUMNS,
Expand Down Expand Up @@ -298,9 +315,25 @@ impl<'a> FilterParams<'a> {

if let Some(q_string) = self.q_string {
if !q_string.is_empty() {
let q = sql::<TsQuery>("plainto_tsquery('english', ")
// If the query string is not a stop word, search using `plainto_tsquery(...)`.
// Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword).
// Otherwise, search using `null::tsquery`.
let qs = sql::<TsQuery>("plainto_tsquery('english', ")
.bind::<Text, _>(q_string)
.sql(")");
let qs_keyword = sql::<TsQuery>("(")
.bind::<Text, _>(q_string)
.sql("::text || ':B')::tsquery");
let q = case_when(
length(to_tsvector_with_search_config::<Text, _, _>(
TsConfigurationByName("english"),
q_string,
))
.ne(0),
qs,
)
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword)
.otherwise(sql::<TsQuery>("NULL::tsquery"));
query = query.filter(
q.matches(crates::textsearchable_index_col)
.or(Crate::loosly_matches_name(q_string)),
Expand Down Expand Up @@ -518,10 +551,26 @@ impl<'a> FilterParams<'a> {
// `WHERE (exact_match = exact_match' AND rank = rank' AND name > name')
// OR (exact_match = exact_match' AND rank < rank')
// OR exact_match < exact_match'`
// If the query string is not a stop word, search using `plainto_tsquery(...)`.
// Else if it is a valid keyword, search by casting it to `tsquery` with weight B(keyword).
// Otherwise, search using `null::tsquery`.
let q_string = self.q_string.expect("q_string should not be None");
let q = sql::<TsQuery>("plainto_tsquery('english', ")
let qs = sql::<TsQuery>("plainto_tsquery('english', ")
.bind::<Text, _>(q_string)
.sql(")");
let qs_keyword = sql::<TsQuery>("(")
.bind::<Text, _>(q_string)
.sql("::text || ':B')::tsquery");
let q = case_when(
length(to_tsvector_with_search_config::<Text, _, _>(
TsConfigurationByName("english"),
q_string,
))
.ne(0),
qs,
)
.when(Keyword::valid_name(q_string).into_sql::<Bool>(), qs_keyword)
.otherwise(sql::<TsQuery>("NULL::tsquery"));
let rank = ts_rank_cd(crates::textsearchable_index_col, q);
let name_exact_match = Crate::with_name(q_string);
vec![
Expand Down
56 changes: 56 additions & 0 deletions src/tests/routes/crates/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1084,6 +1084,62 @@ async fn crates_by_user_id_not_including_deleted_owners() {
}
}

#[tokio::test(flavor = "multi_thread")]
async fn crates_with_stopword_keyword() {
let (app, anon, user) = TestApp::init().with_user();
let user = user.as_model();
app.db(|conn| {
CrateBuilder::new("any", user.id)
.readme("readme")
.description("description")
.keyword("kw1")
.expect_build(conn);

CrateBuilder::new("short-stopword", user.id)
.keyword("kw1")
.keyword("an")
.expect_build(conn);

CrateBuilder::new("ANY_INDEX_QUERIES", user.id)
.keyword("KW1")
.expect_build(conn);

CrateBuilder::new("foo-kw-is-stopword", user.id)
.keyword("any")
.keyword("kw3")
.expect_build(conn);

CrateBuilder::new("bar-kw-is-stopword", user.id)
.keyword("any")
.keyword("kw1")
.expect_build(conn);
});

for json in search_both(&anon, "q=any").await {
assert_eq!(json.crates.len(), 4);
assert_eq!(json.meta.total, 4);
assert_eq!(json.crates[0].name, "any");
assert_eq!(json.crates[1].name, "bar-kw-is-stopword");
assert_eq!(json.crates[2].name, "foo-kw-is-stopword");
assert_eq!(json.crates[3].name, "ANY_INDEX_QUERIES");
}

for json in search_both(&anon, "q=an").await {
assert_eq!(json.crates.len(), 3);
assert_eq!(json.meta.total, 3);
assert_eq!(json.crates[0].name, "short-stopword");
assert_eq!(json.crates[1].name, "ANY_INDEX_QUERIES");
assert_eq!(json.crates[2].name, "any");
}

// Both `an` and `any` are stopwords.
// The query string `an any` is not a valid keyword
for json in search_both(&anon, "q=an%20any").await {
assert_eq!(json.crates.len(), 0);
assert_eq!(json.meta.total, 0);
}
}

static PAGE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"((?:^page|&page|\?page)=\d+)").unwrap());

// search with both offset-based (prepend with `page=1` query) and seek-based pagination
Expand Down