diff --git a/lib/muninn/index_writer.ex b/lib/muninn/index_writer.ex
index 2e8dd6e..b6ea019 100644
--- a/lib/muninn/index_writer.ex
+++ b/lib/muninn/index_writer.ex
@@ -158,4 +158,46 @@ defmodule Muninn.IndexWriter do
       error -> error
     end
   end
+
+  @doc """
+  Deletes all documents matching a (field, value) term.
+
+  Tantivy marks the matching documents as deleted; the delete becomes
+  visible to searches after the next `commit/1`. Useful for incremental
+  refresh: delete docs whose key matches `value`, then re-add the new
+  versions in the same transaction.
+
+  Supports text, u64, i64, and bool fields. f64 is not supported (Tantivy
+  has no stable term encoding for floats).
+
+  ## Parameters
+
+    * `index` - The index to delete from
+    * `field_name` - Name of the indexed field to match against
+    * `value` - The value to match (string / integer / boolean)
+
+  ## Returns
+
+    * `:ok` - Delete queued (visible after `commit/1`)
+    * `{:error, reason}` - Failed (e.g. field not found, unsupported type)
+
+  ## Examples
+
+      :ok = Muninn.IndexWriter.add_document(index, %{"id" => "doc-1", "body" => "hello"})
+      :ok = Muninn.IndexWriter.commit(index)
+
+      :ok = Muninn.IndexWriter.delete_term(index, "id", "doc-1")
+      :ok = Muninn.IndexWriter.commit(index)
+      # "doc-1" no longer searchable
+
+  """
+  @spec delete_term(reference(), String.t(), String.t() | integer() | boolean()) ::
+          :ok | {:error, String.t()}
+  def delete_term(index, field_name, value) when is_binary(field_name) do
+    case Native.writer_delete_term(index, field_name, value) do
+      {:ok, _} -> :ok
+      :ok -> :ok
+      error -> error
+    end
+  end
 end
diff --git a/lib/muninn/native.ex b/lib/muninn/native.ex
index 56f61a6..e1ea682 100644
--- a/lib/muninn/native.ex
+++ b/lib/muninn/native.ex
@@ -81,6 +81,10 @@ defmodule Muninn.Native do
   @doc false
   def writer_rollback(_index), do: :erlang.nif_error(:nif_not_loaded)
 
+  @doc false
+  def writer_delete_term(_index, _field_name, _value),
+    do: :erlang.nif_error(:nif_not_loaded)
+
   ## Reader functions
 
   @doc false
diff --git a/native/muninn/src/lib.rs b/native/muninn/src/lib.rs
index f8f9f71..dd54f4f 100644
--- a/native/muninn/src/lib.rs
+++ b/native/muninn/src/lib.rs
@@ -64,6 +64,15 @@
 fn writer_rollback(index: rustler::ResourceArc<IndexResource>) -> Result<(), String> {
     writer::writer_rollback(index)
 }
+#[rustler::nif]
+fn writer_delete_term(
+    index: rustler::ResourceArc<IndexResource>,
+    field_name: String,
+    value: rustler::Term,
+) -> Result<(), String> {
+    writer::writer_delete_term(index, field_name, value)
+}
+
 #[rustler::nif]
 fn reader_new(
     index: rustler::ResourceArc<IndexResource>,
diff --git a/native/muninn/src/writer.rs b/native/muninn/src/writer.rs
index 8901438..0cdd42a 100644
--- a/native/muninn/src/writer.rs
+++ b/native/muninn/src/writer.rs
@@ -1,7 +1,7 @@
 use rustler::{Env, ResourceArc, Term};
 use std::collections::HashMap;
 use tantivy::schema::FieldType;
-use tantivy::TantivyDocument;
+use tantivy::{TantivyDocument, Term as TantivyTerm};
 
 use crate::index::IndexResource;
 
@@ -101,6 +101,91 @@ pub fn writer_add_document(
     Ok(())
 }
 
+/// Deletes all documents containing the given term (field+value pair).
+///
+/// Tantivy marks matching documents as deleted; the delete becomes visible
+/// to readers after the next `commit`. Supports text, u64, i64, and bool
+/// fields. f64 fields are not supported because Tantivy does not provide a
+/// stable term encoding for floats.
+pub fn writer_delete_term(
+    index_res: ResourceArc<IndexResource>,
+    field_name: String,
+    value: Term,
+) -> Result<(), String> {
+    let index = index_res
+        .index
+        .lock()
+        .map_err(|_| "Failed to acquire index lock".to_string())?;
+
+    let schema = index.schema();
+    let field = schema
+        .get_field(&field_name)
+        .map_err(|_| format!("Field '{}' not found in schema", field_name))?;
+    let field_entry = schema.get_field_entry(field);
+
+    let term = match field_entry.field_type() {
+        FieldType::Str(_) => {
+            let s = value
+                .decode::<String>()
+                .map_err(|_| "Expected string value for text field".to_string())?;
+            TantivyTerm::from_field_text(field, &s)
+        }
+        FieldType::U64(_) => {
+            // Accept any Elixir integer but reject negatives: a plain `as`
+            // cast would wrap them into huge u64s and delete the wrong term.
+            let n = value
+                .decode::<u64>()
+                .or_else(|_| value.decode::<i64>().and_then(|i| u64::try_from(i).map_err(|_| rustler::Error::BadArg)))
+                .map_err(|_| "Expected integer value for u64 field".to_string())?;
+            TantivyTerm::from_field_u64(field, n)
+        }
+        FieldType::I64(_) => {
+            // Values above i64::MAX would wrap under a plain `as` cast; use a
+            // checked conversion so out-of-range input errors instead.
+            let n = value
+                .decode::<i64>()
+                .or_else(|_| value.decode::<u64>().and_then(|u| i64::try_from(u).map_err(|_| rustler::Error::BadArg)))
+                .map_err(|_| "Expected integer value for i64 field".to_string())?;
+            TantivyTerm::from_field_i64(field, n)
+        }
+        FieldType::Bool(_) => {
+            let b = value
+                .decode::<bool>()
+                .map_err(|_| "Expected boolean value for bool field".to_string())?;
+            TantivyTerm::from_field_bool(field, b)
+        }
+        _ => {
+            return Err(format!(
+                "Field '{}' has unsupported type for delete_term (text, u64, i64, bool only)",
+                field_name
+            ));
+        }
+    };
+
+    drop(index);
+
+    let mut writer_lock = index_res
+        .writer
+        .lock()
+        .map_err(|_| "Failed to acquire writer lock".to_string())?;
+
+    if writer_lock.is_none() {
+        let index = index_res
+            .index
+            .lock()
+            .map_err(|_| "Failed to acquire index lock".to_string())?;
+        let new_writer = index
+            .writer(50_000_000)
+            .map_err(|e| format!("Failed to create writer: {}", e))?;
+        *writer_lock = Some(new_writer);
+    }
+
+    let writer = writer_lock.as_mut().unwrap();
+    writer.delete_term(term);
+
+    Ok(())
+}
+
 /// Commits all pending changes to the index
 pub fn writer_commit(index_res: ResourceArc<IndexResource>) -> Result<(), String> {
     let mut writer_lock = index_res
diff --git a/test/muninn/index_writer_test.exs b/test/muninn/index_writer_test.exs
index 581c591..bc2c596 100644
--- a/test/muninn/index_writer_test.exs
+++ b/test/muninn/index_writer_test.exs
@@ -265,6 +265,151 @@ defmodule Muninn.IndexWriterTest do
     end
   end
 
+  describe "delete_term/3" do
+    alias Muninn.{IndexReader, Searcher}
+    alias Muninn.Query.Term, as: TermQuery
+
+    test "deletes documents matching a text term", %{test_path: test_path} do
+      schema =
+        Schema.new()
+        |> Schema.add_text_field("id", stored: true, indexed: true)
+        |> Schema.add_text_field("body", stored: true, indexed: true)
+
+      {:ok, index} = Index.create(test_path, schema)
+
+      :ok =
+        IndexWriter.add_documents(index, [
+          %{"id" => "doc1", "body" => "hello"},
+          %{"id" => "doc2", "body" => "world"}
+        ])
+
+      :ok = IndexWriter.commit(index)
+
+      {:ok, reader} = IndexReader.new(index)
+      {:ok, searcher} = Searcher.new(reader)
+
+      assert {:ok, %{"total_hits" => 1}} =
+               Searcher.search(searcher, %TermQuery{field: "id", value: "doc1"}, limit: 10)
+
+      :ok = IndexWriter.delete_term(index, "id", "doc1")
+      :ok = IndexWriter.commit(index)
+
+      {:ok, reader} = IndexReader.new(index)
+      {:ok, searcher} = Searcher.new(reader)
+
+      assert {:ok, %{"total_hits" => 0}} =
+               Searcher.search(searcher, %TermQuery{field: "id", value: "doc1"}, limit: 10)
+
+      assert {:ok, %{"total_hits" => 1}} =
+               Searcher.search(searcher, %TermQuery{field: "id", value: "doc2"}, limit: 10)
+    end
+
+    test "deletes all documents matching the term", %{test_path: test_path} do
+      schema =
+        Schema.new()
+        |> Schema.add_text_field("category", stored: true, indexed: true)
+        |> Schema.add_text_field("title", stored: true, indexed: true)
+
+      {:ok, index} = Index.create(test_path, schema)
+
+      :ok =
+        IndexWriter.add_documents(index, [
+          %{"category" => "draft", "title" => "A"},
+          %{"category" => "draft", "title" => "B"},
+          %{"category" => "published", "title" => "C"}
+        ])
+
+      :ok = IndexWriter.commit(index)
+      :ok = IndexWriter.delete_term(index, "category", "draft")
+      :ok = IndexWriter.commit(index)
+
+      {:ok, reader} = IndexReader.new(index)
+      {:ok, searcher} = Searcher.new(reader)
+
+      assert {:ok, %{"total_hits" => 0}} =
+               Searcher.search(
+                 searcher,
+                 %TermQuery{field: "category", value: "draft"},
+                 limit: 10
+               )
+
+      assert {:ok, %{"total_hits" => 1}} =
+               Searcher.search(
+                 searcher,
+                 %TermQuery{field: "category", value: "published"},
+                 limit: 10
+               )
+    end
+
+    test "delete then re-add same term (incremental refresh pattern)", %{test_path: test_path} do
+      schema =
+        Schema.new()
+        |> Schema.add_text_field("id", stored: true, indexed: true)
+        |> Schema.add_text_field("body", stored: true, indexed: true)
+
+      {:ok, index} = Index.create(test_path, schema)
+
+      :ok = IndexWriter.add_document(index, %{"id" => "doc1", "body" => "old content"})
+      :ok = IndexWriter.commit(index)
+
+      :ok = IndexWriter.delete_term(index, "id", "doc1")
+      :ok = IndexWriter.add_document(index, %{"id" => "doc1", "body" => "new content"})
+      :ok = IndexWriter.commit(index)
+
+      {:ok, reader} = IndexReader.new(index)
+      {:ok, searcher} = Searcher.new(reader)
+
+      assert {:ok, %{"total_hits" => 1, "hits" => [hit]}} =
+               Searcher.search(searcher, %TermQuery{field: "id", value: "doc1"}, limit: 10)
+
+      body = hit["doc"]["body"]
+      assert body == ["new content"] or body == "new content"
+    end
+
+    test "delete on uncommitted writer is queued and applied at commit",
+         %{test_path: test_path} do
+      schema =
+        Schema.new()
+        |> Schema.add_text_field("id", stored: true, indexed: true)
+
+      {:ok, index} = Index.create(test_path, schema)
+
+      :ok = IndexWriter.add_document(index, %{"id" => "ghost"})
+      :ok = IndexWriter.delete_term(index, "id", "ghost")
+      :ok = IndexWriter.commit(index)
+
+      {:ok, reader} = IndexReader.new(index)
+      {:ok, searcher} = Searcher.new(reader)
+
+      assert {:ok, %{"total_hits" => 0}} =
+               Searcher.search(searcher, %TermQuery{field: "id", value: "ghost"}, limit: 10)
+    end
+
+    test "delete by u64 term", %{test_path: test_path} do
+      schema =
+        Schema.new()
+        |> Schema.add_u64_field("id", stored: true, indexed: true)
+        |> Schema.add_text_field("body", stored: true)
+
+      {:ok, index} = Index.create(test_path, schema)
+
+      :ok = IndexWriter.add_document(index, %{"id" => 42, "body" => "answer"})
+      :ok = IndexWriter.add_document(index, %{"id" => 43, "body" => "next"})
+      :ok = IndexWriter.commit(index)
+
+      assert :ok = IndexWriter.delete_term(index, "id", 42)
+      :ok = IndexWriter.commit(index)
+    end
+
+    test "returns error for unknown field", %{test_path: test_path} do
+      schema = Schema.new() |> Schema.add_text_field("id", stored: true, indexed: true)
+      {:ok, index} = Index.create(test_path, schema)
+
+      assert {:error, reason} = IndexWriter.delete_term(index, "nope", "x")
+      assert reason =~ "not found"
+    end
+  end
+
   describe "real-world scenarios" do
     test "e-commerce product indexing", %{test_path: test_path} do
       schema =