2 changes: 2 additions & 0 deletions Cargo.toml
@@ -56,6 +56,8 @@ itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"
robust = "1.2"
i_triangle = "0.38.0"

columnar = { version = "0.6", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.6", path = "./sstable", package = "tantivy-sstable", optional = true }
55 changes: 55 additions & 0 deletions examples/geo_json.rs
@@ -0,0 +1,55 @@
use tantivy::collector::TopDocs;
use tantivy::query::SpatialQuery;
use tantivy::schema::{Schema, Value, SPATIAL, STORED, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("properties", STORED | TEXT);
schema_builder.add_spatial_field("geometry", STORED | SPATIAL);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
r#"{
"type":"Feature",
"geometry":{
"type":"Polygon",
"coordinates":[[[-99.483911,45.577697],[-99.483869,45.571457],[-99.481739,45.571461],[-99.474881,45.571584],[-99.473167,45.571615],[-99.463394,45.57168],[-99.463391,45.57883],[-99.463368,45.586076],[-99.48177,45.585926],[-99.48384,45.585953],[-99.483885,45.57873],[-99.483911,45.577697]]]
},
"properties":{
"admin_level":"8",
"border_type":"city",
"boundary":"administrative",
"gnis:feature_id":"1267426",
"name":"Hosmer",
"place":"city",
"source":"TIGER/Line® 2008 Place Shapefiles (http://www.census.gov/geo/www/tiger/)",
"wikidata":"Q2442118",
"wikipedia":"en:Hosmer, South Dakota"
}
}"#,
)?;
index_writer.add_document(doc)?;
index_writer.commit()?;

let reader = index.reader()?;
let searcher = reader.searcher();
let field = schema.get_field("geometry").unwrap();
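// Search with a lon/lat bounding box that fully covers the Hosmer polygon;
// corners here are (min_lon, min_lat) and (max_lon, max_lat) (assumed ordering).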
let query = SpatialQuery::new(
field,
[(-99.49, 45.56), (-99.45, 45.59)],
tantivy::query::SpatialQueryType::Intersects,
);
let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_score, doc_address) in &hits {
let retrieved_doc: TantivyDocument = searcher.doc(*doc_address)?;
if let Some(field_value) = retrieved_doc.get_first(field) {
if let Some(geometry_box) = field_value.as_value().into_geometry() {
println!("Retrieved geometry: {:?}", geometry_box);
}
}
}
assert_eq!(hits.len(), 1);
Ok(())
}
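As a quick sanity check on the `Intersects` semantics, here is a hedged sketch (not part of the example file) of a query whose box lies entirely east of the polygon and should therefore match nothing; it assumes only the `SpatialQuery` API shown above:

// Hypothetical follow-up inside main(): a box east of Hosmer's polygon
// does not intersect it, so the search comes back empty.
let miss = SpatialQuery::new(
    field,
    [(-99.40, 45.56), (-99.35, 45.59)],
    tantivy::query::SpatialQueryType::Intersects,
);
assert!(searcher.search(&miss, &TopDocs::with_limit(10))?.is_empty());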
3 changes: 3 additions & 0 deletions src/core/json_utils.rs
@@ -226,6 +226,9 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
ReferenceValueLeaf::IpAddr(_) => {
unimplemented!("IP address support in dynamic fields is not yet implemented")
}
ReferenceValueLeaf::Geometry(_) => {
unimplemented!("Geometry support in dynamic fields is not implemented")
}
},
ReferenceValue::Array(elements) => {
for val in elements {
6 changes: 6 additions & 0 deletions src/fastfield/writer.rs
@@ -189,6 +189,9 @@ impl FastFieldsWriter {
.record_str(doc_id, field_name, &token.text);
}
}
ReferenceValueLeaf::Geometry(_) => {
panic!("Geometry fields should not be routed to fast field writer")
}
},
ReferenceValue::Array(val) => {
// TODO: Check this is the correct behaviour we want.
@@ -320,6 +323,9 @@ fn record_json_value_to_columnar_writer<'a, V: Value<'a>>(
"Pre-tokenized string support in dynamic fields is not yet implemented"
)
}
ReferenceValueLeaf::Geometry(_) => {
unimplemented!("Geometry support in dynamic fields is not yet implemented")
}
},
ReferenceValue::Array(elements) => {
for el in elements {
1 change: 1 addition & 0 deletions src/index/index_meta.rs
@@ -142,6 +142,7 @@ impl SegmentMeta {
SegmentComponent::FastFields => ".fast".to_string(),
SegmentComponent::FieldNorms => ".fieldnorm".to_string(),
SegmentComponent::Delete => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
SegmentComponent::Spatial => ".spatial".to_string(),
});
PathBuf::from(path)
}
5 changes: 4 additions & 1 deletion src/index/segment_component.rs
@@ -28,12 +28,14 @@ pub enum SegmentComponent {
/// Bitset describing which document of the segment is alive.
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
Delete,
/// Block kd-tree spatial index for geometry fields.
Spatial,
}

impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [
static SEGMENT_COMPONENTS: [SegmentComponent; 9] = [
SegmentComponent::Postings,
SegmentComponent::Positions,
SegmentComponent::FastFields,
@@ -42,6 +44,7 @@ impl SegmentComponent {
SegmentComponent::Store,
SegmentComponent::TempStore,
SegmentComponent::Delete,
SegmentComponent::Spatial,
];
SEGMENT_COMPONENTS.iter()
}
11 changes: 11 additions & 0 deletions src/index/segment_reader.rs
@@ -14,6 +14,7 @@ use crate::index::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::json_utils::json_path_sep_to_dot;
use crate::schema::{Field, IndexRecordOption, Schema, Type};
use crate::space_usage::SegmentSpaceUsage;
use crate::spatial::reader::SpatialReaders;
use crate::store::StoreReader;
use crate::termdict::TermDictionary;
use crate::{DocId, Opstamp};
@@ -43,6 +44,7 @@ pub struct SegmentReader {
positions_composite: CompositeFile,
fast_fields_readers: FastFieldReaders,
fieldnorm_readers: FieldNormReaders,
spatial_readers: SpatialReaders,

store_file: FileSlice,
alive_bitset_opt: Option<AliveBitSet>,
@@ -92,6 +94,11 @@ impl SegmentReader {
&self.fast_fields_readers
}

/// Accessor to the segment's spatial field readers.
pub fn spatial_fields(&self) -> &SpatialReaders {
&self.spatial_readers
}

/// Accessor to the `FacetReader` associated with a given `Field`.
pub fn facet_reader(&self, field_name: &str) -> crate::Result<FacetReader> {
let schema = self.schema();
@@ -173,6 +180,8 @@
let fast_fields_readers = FastFieldReaders::open(fast_fields_data, schema.clone())?;
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let spatial_data = segment.open_read(SegmentComponent::Spatial)?;
let spatial_readers = SpatialReaders::open(spatial_data)?;

let original_bitset = if segment.meta().has_deletes() {
let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?;
@@ -198,6 +207,7 @@
postings_composite,
fast_fields_readers,
fieldnorm_readers,
spatial_readers,
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),
store_file,
@@ -460,6 +470,7 @@
self.positions_composite.space_usage(),
self.fast_fields_readers.space_usage(self.schema())?,
self.fieldnorm_readers.space_usage(),
self.spatial_readers.space_usage(),
self.get_store_reader(0)?.space_usage(),
self.alive_bitset_opt
.as_ref()
73 changes: 72 additions & 1 deletion src/indexer/merger.rs
@@ -1,3 +1,5 @@
use std::collections::HashMap;
use std::io::Write;
use std::sync::Arc;

use columnar::{
@@ -6,6 +8,7 @@
use common::ReadOnlyBitSet;
use itertools::Itertools;
use measure_time::debug_time;
use tempfile::NamedTempFile;

use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
@@ -17,6 +20,8 @@ use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
use crate::indexer::SegmentSerializer;
use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::spatial::bkd::LeafPageIterator;
use crate::spatial::triangle::Triangle;
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{DocAddress, DocId, InvertedIndexReader};
@@ -520,6 +525,71 @@ impl IndexMerger {
Ok(())
}

fn write_spatial_fields(
&self,
serializer: &mut SegmentSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
use crate::spatial::bkd::Segment;
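// Build an old -> new doc id lookup table per source segment. Slots left as
// `None` (docs dropped by the mapping) have their triangles skipped below.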
let mut segment_mappings: Vec<Vec<Option<DocId>>> = Vec::new();
for reader in &self.readers {
let max_doc = reader.max_doc();
segment_mappings.push(vec![None; max_doc as usize]);
}
for (new_doc_id, old_doc_addr) in doc_id_mapping.iter_old_doc_addrs().enumerate() {
segment_mappings[old_doc_addr.segment_ord as usize][old_doc_addr.doc_id as usize] =
Some(new_doc_id as DocId);
}
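// One spill file per spatial field; remapped triangles from every source
// segment are appended here before the merged tree is rebuilt.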
let mut temp_files: HashMap<Field, NamedTempFile> = HashMap::new();

for (field, field_entry) in self.schema.fields() {
if matches!(field_entry.field_type(), FieldType::Spatial(_)) {
temp_files.insert(field, NamedTempFile::new()?);
}
}
for (segment_ord, reader) in self.readers.iter().enumerate() {
for (field, temp_file) in &mut temp_files {
let spatial_readers = reader.spatial_fields();
let spatial_reader = match spatial_readers.get_field(*field)? {
Some(reader) => reader,
None => continue,
};
let segment = Segment::new(spatial_reader.get_bytes());
for triangle_result in LeafPageIterator::new(&segment) {
let triangles = triangle_result?;
for triangle in triangles {
if let Some(new_doc_id) =
segment_mappings[segment_ord][triangle.doc_id as usize]
{
for &word in &triangle.words {
temp_file.write_all(&word.to_le_bytes())?;
}
temp_file.write_all(&new_doc_id.to_le_bytes())?;
}
}
}
}
}
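// Second pass: hand each field's remapped triangles to the spatial
// serializer, which rebuilds the block kd-tree for the merged segment.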
if let Some(mut spatial_serializer) = serializer.extract_spatial_serializer() {
for (field, mut temp_file) in temp_files {
// Flush and sync triangles.
temp_file.flush()?;
temp_file.as_file_mut().sync_all()?;
// Memory map the triangle file.
use memmap2::MmapOptions;
let mut mmap = unsafe { MmapOptions::new().map_mut(temp_file.as_file())? };
// Cast to &[Triangle] slice
let triangle_count = mmap.len() / std::mem::size_of::<Triangle>();
let triangles = unsafe {
std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut Triangle, triangle_count)
};
// Get spatial writer and rebuild block kd-tree.
spatial_serializer.serialize_field(field, triangles)?;
}
}
Ok(())
}

/// Writes the merged segment by pushing information
/// to the `SegmentSerializer`.
///
@@ -544,9 +614,10 @@

debug!("write-storagefields");
self.write_storable_fields(serializer.get_store_writer())?;
debug!("write-spatialfields");
self.write_spatial_fields(&mut serializer, &doc_id_mapping)?;
debug!("write-fastfields");
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;

debug!("close-serializer");
serializer.close()?;
Ok(self.max_doc)
14 changes: 14 additions & 0 deletions src/indexer/segment_serializer.rs
@@ -4,6 +4,7 @@ use crate::directory::WritePtr;
use crate::fieldnorm::FieldNormsSerializer;
use crate::index::{Segment, SegmentComponent};
use crate::postings::InvertedIndexSerializer;
use crate::spatial::serializer::SpatialSerializer;
use crate::store::StoreWriter;

/// Segment serializer is in charge of laying out on disk
@@ -12,6 +13,7 @@ pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_write: WritePtr,
spatial_serializer: Option<SpatialSerializer>,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -35,11 +37,15 @@
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;

let spatial_write = segment.open_write(SegmentComponent::Spatial)?;
let spatial_serializer = SpatialSerializer::from_write(spatial_write)?;

let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
Ok(SegmentSerializer {
segment,
store_writer,
fast_field_write,
spatial_serializer: Some(spatial_serializer),
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
@@ -64,6 +70,11 @@
&mut self.fast_field_write
}

/// Extract the spatial serializer.
///
/// Note the spatial serializer can only be extracted once.
pub fn extract_spatial_serializer(&mut self) -> Option<SpatialSerializer> {
self.spatial_serializer.take()
}

/// Extract the field norm serializer.
///
/// Note the fieldnorms serializer can only be extracted once.
@@ -81,6 +92,9 @@
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
if let Some(spatial_serializer) = self.extract_spatial_serializer() {
spatial_serializer.close()?;
}
self.fast_field_write.terminate()?;
self.postings_serializer.close()?;
self.store_writer.close()?;