diff --git a/examples/snippet.rs b/examples/snippet.rs
index 31bd2c166f..cc74424cda 100644
--- a/examples/snippet.rs
+++ b/examples/snippet.rs
@@ -59,8 +59,11 @@ fn main() -> tantivy::Result<()> {
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
- println!("snippet: {}", snippet.to_html());
- println!("custom highlighting: {}", highlight(snippet));
+
+ if let Some(snippet) = snippet {
+ println!("snippet: {}", snippet.to_html());
+ println!("custom highlighting: {}", highlight(snippet));
+ }
}
Ok(())
diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs
index 020e6b588a..e0031c0766 100644
--- a/src/snippet/mod.rs
+++ b/src/snippet/mod.rs
@@ -42,7 +42,7 @@
//! # let searcher = reader.searcher();
//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
//! snippet_generator.set_max_num_chars(100);
-//! let snippet = snippet_generator.snippet_from_doc(&doc);
+//! let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
//! let snippet_html: String = snippet.to_html();
//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des");
//! # Ok(())
@@ -115,6 +115,7 @@ impl FragmentCandidate {
#[derive(Debug)]
pub struct Snippet {
fragment: String,
+ fragment_range: Range<usize>,
highlighted: Vec<Range<usize>>,
snippet_prefix: String,
snippet_postfix: String,
@@ -122,30 +123,16 @@ pub struct Snippet {
impl Snippet {
/// Create a new `Snippet`.
- fn new(fragment: &str, highlighted: Vec<Range<usize>>) -> Self {
+ fn new(source_str: &str, source_range: Range<usize>, highlighted: Vec<Range<usize>>) -> Self {
Self {
- fragment: fragment.to_string(),
+ fragment: source_str[source_range.clone()].to_string(),
+ fragment_range: source_range,
highlighted,
snippet_prefix: DEFAULT_SNIPPET_PREFIX.to_string(),
snippet_postfix: DEFAULT_SNIPPET_POSTFIX.to_string(),
}
}
- /// Create a new, empty, `Snippet`.
- pub fn empty() -> Snippet {
- Snippet {
- fragment: String::new(),
- highlighted: Vec::new(),
- snippet_prefix: String::new(),
- snippet_postfix: String::new(),
- }
- }
-
- /// Returns `true` if the snippet is empty.
- pub fn is_empty(&self) -> bool {
- self.highlighted.len() == 0
- }
-
/// Returns a highlighted html from the `Snippet`.
pub fn to_html(&self) -> String {
let mut html = String::new();
@@ -169,6 +156,12 @@ impl Snippet {
&self.fragment
}
+ /// Returns the range of the original text that the fragment was extracted
+ /// from.
+ pub fn range(&self) -> Range<usize> {
+ self.fragment_range.clone()
+ }
+
/// Returns a list of highlighted positions from the `Snippet`.
pub fn highlighted(&self) -> &[Range<usize>] {
&self.highlighted
@@ -231,7 +224,10 @@ fn search_fragments(
///
/// Takes a vector of `FragmentCandidate`s and the text.
/// Figures out the best fragment from it and creates a snippet.
-fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet {
+fn select_best_fragment_combination(
+ fragments: &[FragmentCandidate],
+ text: &str,
+) -> Option<Snippet> {
let best_fragment_opt = fragments.iter().max_by(|left, right| {
let cmp_score = left
.score
@@ -243,18 +239,21 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
cmp_score
}
});
- if let Some(fragment) = best_fragment_opt {
- let fragment_text = &text[fragment.start_offset..fragment.stop_offset];
- let highlighted = fragment
- .highlighted
- .iter()
- .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
- .collect();
- Snippet::new(fragment_text, highlighted)
- } else {
- // When there are no fragments to chose from,
- // for now create an empty snippet.
- Snippet::empty()
+ match best_fragment_opt {
+ Some(fragment) => {
+ let highlighted = fragment
+ .highlighted
+ .iter()
+ .map(|item| item.start - fragment.start_offset..item.end - fragment.start_offset)
+ .collect();
+
+ Some(Snippet::new(
+ text,
+ fragment.start_offset..fragment.stop_offset,
+ highlighted,
+ ))
+ }
+ None => None,
}
}
@@ -368,7 +367,7 @@ fn is_sorted(mut it: impl Iterator- ) -> bool {
/// # let searcher = reader.searcher();
/// let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?;
/// snippet_generator.set_max_num_chars(100);
-/// let snippet = snippet_generator.snippet_from_doc(&doc);
+/// let snippet = snippet_generator.snippet_from_doc(&doc).unwrap();
/// let snippet_html: String = snippet.to_html();
/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des");
/// # Ok(())
@@ -441,11 +440,7 @@ impl SnippetGenerator {
&self.terms_text
}
- /// Generates a snippet for the given `Document`.
- ///
- /// This method extract the text associated with the `SnippetGenerator`'s field
- /// and computes a snippet.
- pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Snippet {
+ fn text_from_doc<D: Document>(&self, doc: &D) -> String {
let mut text = String::new();
for (field, value) in doc.iter_fields_and_values() {
let value = value as D::Value<'_>;
@@ -459,19 +454,68 @@ impl SnippetGenerator {
}
}
- self.snippet(text.trim())
+ text
+ }
+
+ /// Generates a snippet for the given `Document`.
+ ///
+ /// This method extracts the text associated with the `SnippetGenerator`'s field
+ /// and computes a snippet.
+ pub fn snippet_from_doc<D: Document>(&self, doc: &D) -> Option<Snippet> {
+ self.snippet(self.text_from_doc(doc).trim())
+ }
+
+ /// Generates snippets for the given `Document`.
+ ///
+ /// This method extracts the text associated with the `SnippetGenerator`'s field
+ /// and computes snippets.
+ pub fn snippets_from_doc<D: Document>(&self, doc: &D) -> Vec<Snippet> {
+ self.snippets(self.text_from_doc(doc).trim())
}
/// Generates a snippet for the given text.
- pub fn snippet(&self, text: &str) -> Snippet {
+ pub fn snippet(&self, text: &str) -> Option<Snippet> {
let fragment_candidates = search_fragments(
&mut self.tokenizer.clone(),
text,
&self.terms_text,
self.max_num_chars,
);
+
select_best_fragment_combination(&fragment_candidates[..], text)
}
+
+ /// Generates one snippet per fragment of the given text that has a positive score.
+ pub fn snippets(&self, text: &str) -> Vec<Snippet> {
+ let fragment_candidates = search_fragments(
+ &mut self.tokenizer.clone(),
+ text,
+ &self.terms_text,
+ self.max_num_chars,
+ );
+
+ let snippets = fragment_candidates
+ .iter()
+ .filter(|f| f.score > 0.0)
+ .map(|fragment| {
+ let highlighted = fragment
+ .highlighted
+ .iter()
+ .map(|item| {
+ item.start - fragment.start_offset..item.end - fragment.start_offset
+ })
+ .collect();
+
+ Snippet::new(
+ text,
+ fragment.start_offset..fragment.stop_offset,
+ highlighted,
+ )
+ })
+ .collect();
+
+ snippets
+ }
}
#[cfg(test)]
@@ -520,7 +564,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.9);
assert_eq!(first.stop_offset, 89);
}
- let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
+ let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(
snippet.fragment,
"Rust is a systems programming language sponsored by\nMozilla which describes it as a \
@@ -551,7 +595,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 1.0);
assert_eq!(first.stop_offset, 17);
}
- let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
+ let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(snippet.to_html(), "Rust is a systems")
}
{
@@ -571,7 +615,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.score, 0.9);
assert_eq!(first.stop_offset, 17);
}
- let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
+ let snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(snippet.to_html(), "programming language")
}
}
@@ -594,7 +638,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.stop_offset, 7);
}
- let snippet = select_best_fragment_combination(&fragments[..], text);
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "c d");
assert_eq!(snippet.to_html(), "c d");
}
@@ -617,7 +661,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.start_offset, 8);
}
- let snippet = select_best_fragment_combination(&fragments[..], text);
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "e f");
assert_eq!(snippet.to_html(), "e f");
}
@@ -641,7 +685,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.start_offset, 0);
}
- let snippet = select_best_fragment_combination(&fragments[..], text);
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "e f g");
assert_eq!(snippet.to_html(), "e f g");
}
@@ -659,9 +703,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], text);
- assert_eq!(snippet.fragment, "");
- assert_eq!(snippet.to_html(), "");
- assert!(snippet.is_empty());
+ assert!(snippet.is_none());
}
#[test]
@@ -674,9 +716,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(fragments.len(), 0);
let snippet = select_best_fragment_combination(&fragments[..], text);
- assert_eq!(snippet.fragment, "");
- assert_eq!(snippet.to_html(), "");
- assert!(snippet.is_empty());
+ assert!(snippet.is_none());
}
#[test]
@@ -751,7 +791,7 @@ Survey in 2016, 2017, and 2018."#;
let mut snippet_generator =
SnippetGenerator::create(&searcher, &*query, text_field).unwrap();
{
- let snippet = snippet_generator.snippet(TEST_TEXT);
+ let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"imperative-procedural paradigms. Rust is syntactically similar to \
@@ -761,7 +801,7 @@ Survey in 2016, 2017, and 2018."#;
}
{
snippet_generator.set_max_num_chars(90);
- let snippet = snippet_generator.snippet(TEST_TEXT);
+ let snippet = snippet_generator.snippet(TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"Rust is syntactically similar to C++[according to whom?],\nbut its \
@@ -794,7 +834,7 @@ Survey in 2016, 2017, and 2018."#;
assert_eq!(first.stop_offset, 3);
}
- let snippet = select_best_fragment_combination(&fragments[..], text);
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
assert_eq!(snippet.fragment, "abc");
assert_eq!(snippet.to_html(), "abc");
}
@@ -808,7 +848,7 @@ Survey in 2016, 2017, and 2018."#;
&terms,
100,
);
- let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT);
+ let mut snippet = select_best_fragment_combination(&fragments[..], TEST_TEXT).unwrap();
assert_eq!(
snippet.to_html(),
"Rust is a systems programming language sponsored by\nMozilla which \
@@ -822,6 +862,80 @@ Survey in 2016, 2017, and 2018."#;
);
}
+ #[test]
+ fn test_snippet_absolute_offsets() {
+ let text = "First sentence. The quick brown fox jumps over the lazy dog. Last sentence.";
+ let terms = btreemap! {
+ String::from("fox") => 1.0,
+ String::from("dog") => 0.9
+ };
+
+ let fragments = search_fragments(
+ &mut From::from(SimpleTokenizer::default()),
+ text,
+ &terms,
+ 100,
+ );
+
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
+
+ // verify fragment range points to correct substring
+ // max_num_chars is 100, so the fragment should cover the whole text except its final '.'
+ assert_eq!(snippet.fragment_range, 0..text.len() - 1);
+ assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);
+
+ // verify highlighted ranges are correct relative to original text
+ let absolute_highlights: Vec<&str> = snippet
+ .highlighted
+ .iter()
+ .map(|highlight| {
+ (highlight.start + snippet.fragment_range.start)
+ ..(highlight.end + snippet.fragment_range.start)
+ })
+ .map(|range| &text[range])
+ .collect();
+
+ // "fox" and "dog" positions in original text
+ assert!(absolute_highlights.contains(&"fox")); // "fox"
+ assert!(absolute_highlights.contains(&"dog")); // "dog"
+ }
+
+ #[test]
+ fn test_snippet_absolute_offsets_with_truncation() {
+ let text = "Intro text. The quick brown fox jumps over the lazy dog. The quick brown fox \
+ jumps again. End text.";
+ let terms = btreemap! {
+ String::from("fox") => 1.0,
+ String::from("quick") => 0.9
+ };
+
+ let fragments = search_fragments(
+ &mut From::from(SimpleTokenizer::default()),
+ text,
+ &terms,
+ 30, // short max chars to force truncation
+ );
+
+ let snippet = select_best_fragment_combination(&fragments[..], text).unwrap();
+
+ // verify fragment range points to correct substring
+ assert_eq!(&text[snippet.fragment_range.clone()], snippet.fragment);
+
+ // verify highlighted ranges are correct relative to original text
+ let absolute_highlights: Vec<&str> = snippet
+ .highlighted
+ .iter()
+ .map(|range| {
+ (range.start + snippet.fragment_range.start)
+ ..(range.end + snippet.fragment_range.start)
+ })
+ .map(|range| &text[range])
+ .collect();
+
+ assert!(absolute_highlights.contains(&"quick")); // "quick"
+ assert!(absolute_highlights.contains(&"fox")); // "fox"
+ }
+
#[test]
fn test_collapse_overlapped_ranges() {
#![allow(clippy::single_range_in_vec_init)]