Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
[![Workflow](https://github.com/andrewyates/bsparse/workflows/pytest/badge.svg)](https://github.com/andrewyates/bsparse/actions)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
# bsparse
bsparse is a toolkit for creating and searching learned sparse representations
bsparse is a toolkit for creating, indexing, and searching learned sparse representations

## Usage examples
```
Expand Down Expand Up @@ -59,4 +59,41 @@ java -cp anserini-1.0.0-fatjar-AY.jar io.anserini.index.IndexCollection \
# 3) search index
# Create sparse query representations in `$QUERY_VECTORS` and create an index in `$INDEX`, then:
python -m bsparse.cli search --index $INDEX --queries $QUERY_VECTORS --out test.run --topk 1000

```

### Seismic backend

[Seismic](https://github.com/TusKANNy/seismic) is an alternative backend that indexes learned
sparse representations natively in Python (no Java/JAR required). The encoded JSONL files produced
by `encode` are already in the format Seismic expects, so the same doc/query files work for both
backends.

```
# install the Seismic Python bindings (optional dependency; only needed for this backend)
uv pip install pyseismic-lsr
# for best performance, build against your CPU instead:
# RUSTFLAGS="-C target-cpu=native" uv pip install --no-binary :all: pyseismic-lsr

# 1) build a Seismic index from encoded docs
python -m bsparse.cli index --backend seismic --input nfcorpus-docs.jsonl --index $INDEX
# --input accepts multiple files, gzipped (.gz) input, and directories of .jsonl/.jsonl.gz files;
# if the in-memory API gives you trouble, --build-method file falls back to concatenating
# the inputs into a temporary uncompressed JSONL file and using Seismic's file-based build
#
# note: seismic appends ".index.seismic" to the path, so the on-disk file is $INDEX.index.seismic;
# search --index accepts either the build-time path or the full on-disk filename
#
# indexing hyperparameters are flags with defaults, e.g.:
# --n-postings 3000 --centroid-fraction 0.2 --summary-energy 0.5 --max-fraction 6 --min-cluster-size 2 --nknn 0
#
# use --variant large_vocab for collections with more than 65k unique tokens

# 2) search the index and evaluate
python -m bsparse.cli search --backend seismic --index $INDEX \
--queries nfcorpus-queries.jsonl --out test.run --topk 1000 \
--query-cut 10 --heap-factor 0.8 --qrels beir/nfcorpus/test

# query-time thread count is index-independent and set via the environment:
# SEISMIC_THREADS=16 python -m bsparse.cli search --backend seismic ...
```
2 changes: 1 addition & 1 deletion bsparse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,4 @@
from .utils import batch_encode, get_torch_device, token_ids_to_binary_vec


__version__ = "0.1.0"
__version__ = "0.2.0"
1 change: 1 addition & 0 deletions bsparse/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Registry of command names -> bsparse.commands classes; the keys are the names
# given on the command line (e.g. `python -m bsparse.cli search ...`).
COMMANDS = {
"encode": bsparse.commands.Encode,
"check": bsparse.commands.Check,
"index": bsparse.commands.Index,
"search": bsparse.commands.Search,
"memsearch": bsparse.commands.MemSearch,
}
Expand Down
137 changes: 133 additions & 4 deletions bsparse/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from bsparse import load_dict, save_dict
from bsparse.anserini import Anserini
from bsparse.seismic import Seismic
from bsparse.utils import psgid_to_docid


Expand Down Expand Up @@ -152,27 +153,155 @@ def run(self):
return run


class Index(Command):
    """Command that builds an index from encoded JSONL document files.

    "seismic" is the only backend choice registered here; every other option
    is either an input/output path or a Seismic build hyperparameter that is
    forwarded to ``Seismic.build``.
    """

    @classmethod
    def add_arguments(cls, parser):
        """Register the indexing options on ``parser``.

        Args:
            parser: An argparse-style parser exposing ``add_argument``.
        """
        # Options are declared as (flag, keyword-arguments) pairs and registered
        # in this exact order, so the generated --help listing keeps its layout.
        option_table = (
            (
                "--input",
                {
                    "type": Path,
                    "nargs": "+",
                    "required": True,
                    "help": "One or more encoded JSONL doc files (plain or .gz), or directories of such files",
                },
            ),
            ("--index", {"type": Path, "required": True, "help": "Output index path"}),
            (
                "--backend",
                {
                    "type": str,
                    "default": "seismic",
                    "choices": ["seismic"],
                    "help": "Indexing backend (default: %(default)s)",
                },
            ),
            # Seismic index-building hyperparameters (these affect the index, so they are
            # kwargs, not env vars). Defaults match the recommended config in the Seismic
            # guidelines.
            (
                "--n-postings",
                {
                    "type": int,
                    "default": 3000,
                    "help": "[Seismic only] avg postings per list (default: %(default)s)",
                },
            ),
            (
                "--centroid-fraction",
                {
                    "type": float,
                    "default": 0.2,
                    "help": "[Seismic only] centroids per list as a fraction (default: %(default)s)",
                },
            ),
            (
                "--summary-energy",
                {
                    "type": float,
                    "default": 0.5,
                    "help": "[Seismic only] fraction of summary L1 norm to keep (default: %(default)s)",
                },
            ),
            (
                "--min-cluster-size",
                {
                    "type": int,
                    "default": 2,
                    "help": "[Seismic only] minimum cluster size (default: %(default)s)",
                },
            ),
            (
                "--max-fraction",
                {
                    "type": float,
                    "default": 6,
                    "help": "[Seismic only] max summary block size as a fraction (default: %(default)s)",
                },
            ),
            (
                "--nknn",
                {
                    "type": int,
                    "default": 0,
                    "help": "[Seismic only] kNN graph size; 0 disables it (default: %(default)s)",
                },
            ),
            (
                "--batched-indexing",
                {
                    "type": int,
                    "default": 100000,
                    "help": "[Seismic only] docs per indexing batch (default: %(default)s)",
                },
            ),
            (
                "--variant",
                {
                    "type": str,
                    "default": "standard",
                    "choices": ["standard", "large_vocab"],
                    "help": "[Seismic only] index variant; use large_vocab for >65k tokens (default: %(default)s)",
                },
            ),
            (
                "--build-method",
                {
                    "type": str,
                    "default": "dataset",
                    "choices": ["dataset", "file"],
                    "help": "[Seismic only] feed docs via the in-memory dataset API, or via a temporary "
                    "uncompressed JSONL file as a fallback (default: %(default)s)",
                },
            ),
        )
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    def __init__(self, config):
        """Keep the parsed command-line namespace for use in ``run``."""
        self.cfg = config

    def run(self):
        """Build the index with the configured backend.

        Raises:
            ValueError: If the configured backend is not "seismic".
        """
        cfg = self.cfg
        if cfg.backend != "seismic":
            raise ValueError(f"unknown indexing backend: {cfg.backend}")

        # These hyperparameters are forwarded under the same name they have on
        # the parsed namespace; --build-method is the one exception and is
        # passed to Seismic.build as ``method``.
        passthrough = (
            "n_postings",
            "centroid_fraction",
            "summary_energy",
            "min_cluster_size",
            "max_fraction",
            "nknn",
            "batched_indexing",
            "variant",
        )
        hyperparameters = {name: getattr(cfg, name) for name in passthrough}
        Seismic.build(cfg.input, cfg.index, **hyperparameters, method=cfg.build_method)


class Search(Command):
@classmethod
def add_arguments(cls, parser):
parser.add_argument("--index", type=Path, required=True, help="Anserini index path")
parser.add_argument("--index", type=Path, required=True, help="Index path")
parser.add_argument("--queries", type=Path, required=True, help="Query file path")
parser.add_argument("--out", type=Path, required=True, help="Output file path")
parser.add_argument(
"--backend",
type=str,
default="anserini",
choices=["anserini", "seismic"],
help="Search backend (default: %(default)s)",
)
parser.add_argument("--topk", type=int, default=1000, help="Top K results to return (default: %(default)s)")
parser.add_argument("--qrels", type=str, default=None, help="Relevance judgments dataset (default: %(default)s)")
# Backend-specific args default to None so we can tell whether the user set them: unset args fall
# back to the backend's own default, and passing an arg for a different backend is rejected (see run()).
# anserini-specific
parser.add_argument("--scale", type=int, default=None, help="[Anserini only] impact scaling factor (default: 50)")
# seismic-specific
parser.add_argument("--query-cut", type=int, default=None, help="[Seismic only] query_cut (default: 10)")
parser.add_argument("--heap-factor", type=float, default=None, help="[Seismic only] heap_factor (default: 0.8)")

# maps each backend to the args that only apply to it
BACKEND_ARGS = {"anserini": ["scale"], "seismic": ["query_cut", "heap_factor"]}

def __init__(self, config):
self.cfg = config

def _check_backend_args(self):
"""Reject args that only apply to a backend other than the one selected."""
for backend, names in self.BACKEND_ARGS.items():
if backend == self.cfg.backend:
continue
misused = [f"--{name.replace('_', '-')}" for name in names if getattr(self.cfg, name) is not None]
if misused:
raise ValueError(f"{', '.join(misused)} only valid with the {backend} backend, not '{self.cfg.backend}'")

def run(self):
self._check_backend_args()

if self.cfg.out.is_dir():
raise ValueError(f"--out is a directory: {self.cfg.out}")

queries = load_dict(self.cfg.queries)
queries.ids = [psgid_to_docid(qid) for qid in queries.ids]

anserini = Anserini(self.cfg.index.as_posix())
results = anserini.query_from_vectors([{"vector": rep} for rep in queries.weights], k=self.cfg.topk)
vectors = [{"vector": rep} for rep in queries.weights]

if self.cfg.backend == "anserini":
retriever = Anserini(self.cfg.index.as_posix())
kwargs = {} if self.cfg.scale is None else {"scale": self.cfg.scale}
results = retriever.query_from_vectors(vectors, k=self.cfg.topk, **kwargs)
elif self.cfg.backend == "seismic":
retriever = Seismic(self.cfg.index.as_posix())
kwargs = {}
if self.cfg.query_cut is not None:
kwargs["query_cut"] = self.cfg.query_cut
if self.cfg.heap_factor is not None:
kwargs["heap_factor"] = self.cfg.heap_factor
results = retriever.query_from_vectors(vectors, k=self.cfg.topk, **kwargs)
else:
raise ValueError(f"unknown search backend: {self.cfg.backend}")

run = TRECRun(dict(zip(queries.ids, results))).aggregate_docids(psgid_to_docid).topk(self.cfg.topk)
print(f"saving run to: {self.cfg.out}")
Expand Down
Loading
Loading