Skip to content

Commit de1a6c1

Browse files
Pollepsjoepio
authored andcommitted
Fix search index not removing old versions of resources #1048
1 parent 521ee2b commit de1a6c1

File tree

4 files changed

+112
-14
lines changed

4 files changed

+112
-14
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# Changelog
22

3+
## UNRELEASED
4+
5+
- [#1048](https://github.com/atomicdata-dev/atomic-server/issues/1048) Fix search index not removing old versions of resources.
6+
37
List of changes for this repo, including `atomic-cli`, `atomic-server` and `atomic-lib`.
48
By far most changes relate to `atomic-server`, so if not specified, assume the changes are relevant only for the server.
59
**Changes to JS assets (including the front-end and JS libraries) are not shown here**, but in [`/browser/CHANGELOG`](/browser/CHANGELOG.md).

server/src/commit_monitor.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,18 +119,17 @@ impl CommitMonitor {
119119
tracing::debug!("No subscribers for {}", target);
120120
}
121121

122+
self.search_state.remove_resource(&target)?;
123+
122124
// Update the search index
123125
if let Some(resource) = &msg.commit_response.resource_new {
124126
// We could one day re-(allow) to keep old resources,
125127
// but then we also should index the older versions when re-indexing.
126-
self.search_state.remove_resource(&target)?;
127128
// Add new resource to search index
128129
self.search_state.add_resource(resource, &self.store)?;
129-
self.run_expensive_next_tick = true;
130-
} else {
131-
// If there is no new resource, it must have been deleted, so let's remove it from the search index.
132-
self.search_state.remove_resource(&target)?;
133130
}
131+
132+
self.run_expensive_next_tick = true;
134133
Ok(())
135134
}
136135

server/src/config.rs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ pub struct Opts {
7474
#[clap(long, env = "ATOMIC_DATA_DIR")]
7575
pub data_dir: Option<PathBuf>,
7676

77+
/// Path for the atomic data cache folder. Contains search index, temp files and more. Default value depends on your OS.
78+
#[clap(long, env = "ATOMIC_CACHE_DIR")]
79+
pub cache_dir: Option<PathBuf>,
80+
7781
/// CAUTION: Skip authentication checks, making all data publicly readable. Improves performance.
7882
#[clap(long, env = "ATOMIC_PUBLIC_MODE")]
7983
pub public_mode: bool,
@@ -203,6 +207,21 @@ pub fn read_opts() -> Opts {
203207
Opts::parse()
204208
}
205209

210+
pub fn build_temp_config(random_id: &str) -> AtomicServerResult<Config> {
211+
let opts = Opts::parse_from([
212+
"atomic-server",
213+
"--initialize",
214+
"--data-dir",
215+
&format!("./.temp/{}/db", random_id),
216+
"--config-dir",
217+
&format!("./.temp/{}/config", random_id),
218+
"--cache-dir",
219+
&format!("./.temp/{}/cache", random_id),
220+
]);
221+
222+
build_config(opts)
223+
}
224+
206225
/// Creates the server config, reads .env values and sets defaults
207226
pub fn build_config(opts: Opts) -> AtomicServerResult<Config> {
208227
// Directories & file system
@@ -242,9 +261,12 @@ pub fn build_config(opts: Opts) -> AtomicServerResult<Config> {
242261

243262
// Cache data
244263

245-
let cache_dir = project_dirs.cache_dir();
264+
let cache_dir = opts
265+
.cache_dir
266+
.clone()
267+
.unwrap_or_else(|| project_dirs.cache_dir().to_owned());
246268

247-
let mut search_index_path = cache_dir.to_owned();
269+
let mut search_index_path = cache_dir;
248270
search_index_path.push("search_index");
249271

250272
let initialize = !std::path::Path::exists(&store_path) || opts.initialize;

server/src/search.rs

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ impl SearchState {
105105
#[tracing::instrument(skip(self, store))]
106106
pub fn add_resource(&self, resource: &Resource, store: &Db) -> AtomicServerResult<()> {
107107
let fields = self.get_schema_fields()?;
108-
let subject = resource.get_subject();
108+
let subject = resource.get_subject().to_string();
109109
let writer = self.writer.read()?;
110110

111111
let mut doc = tantivy::TantivyDocument::default();
@@ -153,7 +153,19 @@ impl SearchState {
153153
pub fn build_schema() -> AtomicServerResult<tantivy::schema::Schema> {
154154
let mut schema_builder = tantivy::schema::Schema::builder();
155155
// The STORED flag makes the index store the full values. Can be useful.
156-
schema_builder.add_text_field("subject", TEXT | STORED);
156+
157+
// The raw tokenizer is used to index the subject field as is, without any tokenization.
158+
// If we don't do this the subject will be split into multiple tokens which breaks the search.
159+
schema_builder.add_text_field(
160+
"subject",
161+
tantivy::schema::TextOptions::default()
162+
.set_stored()
163+
.set_indexing_options(
164+
tantivy::schema::TextFieldIndexing::default()
165+
.set_tokenizer("raw")
166+
.set_index_option(tantivy::schema::IndexRecordOption::Basic),
167+
),
168+
);
157169
schema_builder.add_text_field("title", TEXT | STORED);
158170
schema_builder.add_text_field("description", TEXT | STORED);
159171
schema_builder.add_json_field("propvals", STORED | TEXT);
@@ -177,6 +189,12 @@ pub fn get_index(config: &Config) -> AtomicServerResult<(IndexWriter, tantivy::I
177189
e
178190
)
179191
})?;
192+
193+
// Register the raw tokenizer
194+
index
195+
.tokenizers()
196+
.register("raw", tantivy::tokenizer::RawTokenizer::default());
197+
180198
let heap_size_bytes = 50_000_000;
181199
let index_writer = index.writer(heap_size_bytes)?;
182200
Ok((index_writer, index))
@@ -245,9 +263,9 @@ fn get_resource_title(resource: &Resource) -> String {
245263

246264
#[cfg(test)]
247265
mod tests {
266+
use super::*;
248267
use atomic_lib::{urls, Resource, Storelike};
249268

250-
use super::resource_to_facet;
251269
#[test]
252270
fn facet_contains_subfacet() {
253271
let store = atomic_lib::Db::init_temp("facet_contains").unwrap();
@@ -278,11 +296,66 @@ mod tests {
278296
let query_facet_direct_parent = resource_to_facet(&resources[1], &store).unwrap();
279297
let query_facet_root = resource_to_facet(&resources[0], &store).unwrap();
280298

281-
// println!("Index: {:?}", index_facet);
282-
// println!("query direct: {:?}", query_facet_direct_parent);
283-
// println!("query root: {:?}", query_facet_root);
284-
285299
assert!(query_facet_direct_parent.is_prefix_of(&index_facet));
286300
assert!(query_facet_root.is_prefix_of(&index_facet));
287301
}
302+
303+
#[test]
304+
fn test_update_resource() {
305+
let unique_string = atomic_lib::utils::random_string(10);
306+
307+
let config = crate::config::build_temp_config(&unique_string)
308+
.map_err(|e| format!("Initialization failed: {}", e))
309+
.expect("failed init config");
310+
311+
let store = atomic_lib::Db::init_temp(&unique_string).unwrap();
312+
313+
let search_state = SearchState::new(&config).unwrap();
314+
let fields = search_state.get_schema_fields().unwrap();
315+
316+
// Create initial resource
317+
let mut resource = Resource::new_generate_subject(&store);
318+
resource
319+
.set_string(urls::NAME.into(), "Initial Title", &store)
320+
.unwrap();
321+
store.add_resource(&resource).unwrap();
322+
323+
// Add to search index
324+
search_state.add_resource(&resource, &store).unwrap();
325+
search_state.writer.write().unwrap().commit().unwrap();
326+
327+
// Update the resource
328+
resource
329+
.set_string(urls::NAME.into(), "Updated Title", &store)
330+
.unwrap();
331+
resource.save(&store).unwrap();
332+
333+
// Update in search index
334+
search_state
335+
.remove_resource(resource.get_subject())
336+
.unwrap();
337+
search_state.add_resource(&resource, &store).unwrap();
338+
search_state.writer.write().unwrap().commit().unwrap();
339+
340+
// Make sure changes are visible to searcher
341+
search_state.reader.reload().unwrap();
342+
343+
let searcher = search_state.reader.searcher();
344+
345+
// Search for the old title - should return no results
346+
let query_parser =
347+
tantivy::query::QueryParser::for_index(&search_state.index, vec![fields.title]);
348+
let query = query_parser.parse_query("Initial").unwrap();
349+
let top_docs = searcher
350+
.search(&query, &tantivy::collector::TopDocs::with_limit(1))
351+
.unwrap();
352+
assert_eq!(top_docs.len(), 0, "Old title should not be found in index");
353+
354+
// Search for the new title - should return one result
355+
let query = query_parser.parse_query("Updated").unwrap();
356+
let top_docs = searcher
357+
.search(&query, &tantivy::collector::TopDocs::with_limit(1))
358+
.unwrap();
359+
assert_eq!(top_docs.len(), 1, "New title should be found in index");
360+
}
288361
}

0 commit comments

Comments
 (0)