hltcoe · andrewyates · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Workflow](https://github.com/andrewyates/bsparse/workflows/pytest/badge.svg)](https://github.com/andrewyates/bsparse/actions)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 # bsparse
-bsparse is a toolkit for creating and searching learned sparse representations
+bsparse is a toolkit for creating, indexing, and searching learned sparse representations
 
 ## Usage examples
 ```
@@ -59,4 +59,41 @@ java -cp anserini-1.0.0-fatjar-AY.jar  io.anserini.index.IndexCollection \
 # 3) search index
 # Create sparse query representations in `$QUERY_VECTORS` and create an index in `$INDEX`, then:
 python -m bsparse.cli search --index $INDEX --queries $QUERY_VECTORS --out test.run --topk 1000
+
+```
+
+### Seismic backend
+
+[Seismic](https://github.com/TusKANNy/seismic) is an alternative backend that indexes learned
+sparse representations directly from Python (no Java/JAR required). The encoded JSONL files produced
+by `encode` are already in the format Seismic expects, so the same doc/query files work for both
+backends.
+
+```
+# install the Seismic Python bindings (optional dependency; only needed for this backend)
+uv pip install pyseismic-lsr
+# for best performance, build against your CPU instead:
+# RUSTFLAGS="-C target-cpu=native" uv pip install --no-binary :all: pyseismic-lsr
+
+# 1) build a Seismic index from encoded docs
+python -m bsparse.cli index --backend seismic --input nfcorpus-docs.jsonl --index $INDEX
+# --input accepts multiple files, gzipped (.gz) input, and directories of .jsonl/.jsonl.gz files;
+# if the in-memory API gives you trouble, --build-method file falls back to concatenating
+# the inputs into a temporary uncompressed JSONL file and using Seismic's file-based build
+#
+# note: Seismic appends ".index.seismic" to the path, so the on-disk file is $INDEX.index.seismic;
+# search --index accepts either the build-time path or the full on-disk filename
+#
+# indexing hyperparameters are flags with defaults, e.g.:
+#   --n-postings 3000 --centroid-fraction 0.2 --summary-energy 0.5 --max-fraction 6 --min-cluster-size 2 --nknn 0
+#
+# use --variant large_vocab for collections with more than 65k unique tokens
+
+# 2) search the index and evaluate
+python -m bsparse.cli search --backend seismic --index $INDEX \
+  --queries nfcorpus-queries.jsonl --out test.run --topk 1000 \
+  --query-cut 10 --heap-factor 0.8 --qrels beir/nfcorpus/test
+
+# query-time thread count is index-independent and set via the environment:
+#   SEISMIC_THREADS=16 python -m bsparse.cli search --backend seismic ...
 ```
diff --git a/bsparse/__init__.py b/bsparse/__init__.py
@@ -10,4 +10,4 @@
 from .utils import batch_encode, get_torch_device, token_ids_to_binary_vec
 
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
diff --git a/bsparse/cli.py b/bsparse/cli.py
@@ -8,6 +8,7 @@
 COMMANDS = {
     "encode": bsparse.commands.Encode,
     "check": bsparse.commands.Check,
+    "index": bsparse.commands.Index,
     "search": bsparse.commands.Search,
     "memsearch": bsparse.commands.MemSearch,
 }

diff --git a/bsparse/commands.py b/bsparse/commands.py
@@ -10,6 +10,7 @@
 
 from bsparse import load_dict, save_dict
 from bsparse.anserini import Anserini
+from bsparse.seismic import Seismic
 from bsparse.utils import psgid_to_docid
 
 
@@ -152,27 +153,155 @@ def run(self):
         return run
 
 
+class Index(Command):
+    @classmethod
+    def add_arguments(cls, parser):
+        parser.add_argument(
+            "--input",
+            type=Path,
+            nargs="+",
+            required=True,
+            help="One or more encoded JSONL doc files (plain or .gz), or directories of such files",
+        )
+        parser.add_argument("--index", type=Path, required=True, help="Output index path")
+        parser.add_argument(
+            "--backend", type=str, default="seismic", choices=["seismic"], help="Indexing backend (default: %(default)s)"
+        )
+        # Seismic index-building hyperparameters (these affect the index, so they are kwargs, not env vars).
+        # Defaults match the recommended config in the Seismic guidelines.
+        parser.add_argument(
+            "--n-postings", type=int, default=3000, help="[Seismic only] avg postings per list (default: %(default)s)"
+        )
+        parser.add_argument(
+            "--centroid-fraction",
+            type=float,
+            default=0.2,
+            help="[Seismic only] centroids per list as a fraction (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--summary-energy",
+            type=float,
+            default=0.5,
+            help="[Seismic only] fraction of summary L1 norm to keep (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--min-cluster-size", type=int, default=2, help="[Seismic only] minimum cluster size (default: %(default)s)"
+        )
+        parser.add_argument(
+            "--max-fraction",
+            type=float,
+            default=6,
+            help="[Seismic only] max summary block size as a fraction (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--nknn", type=int, default=0, help="[Seismic only] kNN graph size; 0 disables it (default: %(default)s)"
+        )
+        parser.add_argument(
+            "--batched-indexing",
+            type=int,
+            default=100000,
+            help="[Seismic only] docs per indexing batch (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--variant",
+            type=str,
+            default="standard",
+            choices=["standard", "large_vocab"],
+            help="[Seismic only] index variant; use large_vocab for >65k tokens (default: %(default)s)",
+        )
+        parser.add_argument(
+            "--build-method",
+            type=str,
+            default="dataset",
+            choices=["dataset", "file"],
+            help="[Seismic only] feed docs via the in-memory dataset API, or via a temporary "
+            "uncompressed JSONL file as a fallback (default: %(default)s)",
+        )
+
+    def __init__(self, config):
+        self.cfg = config
+
+    def run(self):
+        if self.cfg.backend == "seismic":
+            Seismic.build(
+                self.cfg.input,
+                self.cfg.index,
+                n_postings=self.cfg.n_postings,
+                centroid_fraction=self.cfg.centroid_fraction,
+                summary_energy=self.cfg.summary_energy,
+                min_cluster_size=self.cfg.min_cluster_size,
+                max_fraction=self.cfg.max_fraction,
+                nknn=self.cfg.nknn,
+                batched_indexing=self.cfg.batched_indexing,
+                variant=self.cfg.variant,
+                method=self.cfg.build_method,
+            )
+        else:
+            raise ValueError(f"unknown indexing backend: {self.cfg.backend}")
+
+
 class Search(Command):
     @classmethod
     def add_arguments(cls, parser):
-        parser.add_argument("--index", type=Path, required=True, help="Anserini index path")
+        parser.add_argument("--index", type=Path, required=True, help="Index path")
         parser.add_argument("--queries", type=Path, required=True, help="Query file path")
         parser.add_argument("--out", type=Path, required=True, help="Output file path")
+        parser.add_argument(
+            "--backend",
+            type=str,
+            default="anserini",
+            choices=["anserini", "seismic"],
+            help="Search backend (default: %(default)s)",
+        )
         parser.add_argument("--topk", type=int, default=1000, help="Top K results to return (default: %(default)s)")
         parser.add_argument("--qrels", type=str, default=None, help="Relevance judgments dataset (default: %(default)s)")
+        # Backend-specific args default to None so we can tell whether the user set them: unset args fall
+        # back to the backend's own default, and passing an arg for a different backend is rejected (see run()).
+        # anserini-specific
+        parser.add_argument("--scale", type=int, default=None, help="[Anserini only] impact scaling factor (default: 50)")
+        # seismic-specific
+        parser.add_argument("--query-cut", type=int, default=None, help="[Seismic only] query_cut (default: 10)")
+        parser.add_argument("--heap-factor", type=float, default=None, help="[Seismic only] heap_factor (default: 0.8)")
+
+    # maps each backend to the args that only apply to it
+    BACKEND_ARGS = {"anserini": ["scale"], "seismic": ["query_cut", "heap_factor"]}
 
     def __init__(self, config):
         self.cfg = config
 
+    def _check_backend_args(self):
+        """Reject args that only apply to a backend other than the one selected."""
+        for backend, names in self.BACKEND_ARGS.items():
+            if backend == self.cfg.backend:
+                continue
+            misused = [f"--{name.replace('_', '-')}" for name in names if getattr(self.cfg, name) is not None]
+            if misused:
+                raise ValueError(f"{', '.join(misused)} only valid with the {backend} backend, not '{self.cfg.backend}'")
+
     def run(self):
+        self._check_backend_args()
+
         if self.cfg.out.is_dir():
             raise ValueError(f"--out is a directory: {self.cfg.out}")
 
         queries = load_dict(self.cfg.queries)
         queries.ids = [psgid_to_docid(qid) for qid in queries.ids]
-
-        anserini = Anserini(self.cfg.index.as_posix())
-        results = anserini.query_from_vectors([{"vector": rep} for rep in queries.weights], k=self.cfg.topk)
+        vectors = [{"vector": rep} for rep in queries.weights]
+
+        if self.cfg.backend == "anserini":
+            retriever = Anserini(self.cfg.index.as_posix())
+            kwargs = {} if self.cfg.scale is None else {"scale": self.cfg.scale}
+            results = retriever.query_from_vectors(vectors, k=self.cfg.topk, **kwargs)
+        elif self.cfg.backend == "seismic":
+            retriever = Seismic(self.cfg.index.as_posix())
+            kwargs = {}
+            if self.cfg.query_cut is not None:
+                kwargs["query_cut"] = self.cfg.query_cut
+            if self.cfg.heap_factor is not None:
+                kwargs["heap_factor"] = self.cfg.heap_factor
+            results = retriever.query_from_vectors(vectors, k=self.cfg.topk, **kwargs)
+        else:
+            raise ValueError(f"unknown search backend: {self.cfg.backend}")
 
         run = TRECRun(dict(zip(queries.ids, results))).aggregate_docids(psgid_to_docid).topk(self.cfg.topk)
         print(f"saving run to: {self.cfg.out}")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -10,4 +10,4 @@
		from .utils import batch_encode, get_torch_device, token_ids_to_binary_vec


		__version__ = "0.1.0"
		__version__ = "0.2.0"