You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
I tried comparing the performance of pgvectorscale (HEAD) with pgvector's HNSW (HEAD) using ann-benchmarks, and I see the following recall/throughput comparison. The attached plot is for the glove-100 dataset, and I see a similar plot for deep-96. I've included the ann-benchmarks code and config, based on the pgvectorscale recommended values, below. Is this expected?
module.py
import sys
import pgvector.psycopg
import psycopg
from ..base.module import BaseANN
class TimescaleDiskANN(BaseANN):
    """ann-benchmarks adapter that indexes vectors with a ``diskann`` index.

    Vectors are loaded into a PostgreSQL table named ``items`` and indexed
    with ``CREATE INDEX ... USING diskann``. Queries are plain
    ``ORDER BY embedding <op> %s LIMIT %s`` statements, where ``<op>`` is
    ``<=>`` for the "angular" metric and ``<->`` for "euclidean".
    """

    def __init__(self, metric, method_param):
        """Store the build parameters and pick the query for *metric*.

        Args:
            metric: Either "angular" or "euclidean".
            method_param: Mapping with the keys ``'M'`` (used as the index's
                ``num_neighbors``) and ``'efConstruction'`` (used as the
                index's ``search_list_size``).

        Raises:
            RuntimeError: If *metric* is not one of the supported values.
        """
        self._metric = metric
        self._m = method_param['M']
        self._ef_construction = method_param['efConstruction']
        self._cur = None
        # Defined up front so __str__ does not raise AttributeError when it
        # is called before set_query_arguments().
        self.query_search_list_size = None
        self.query_rescore = None

        if metric == "angular":
            self._query = "SELECT id FROM items ORDER BY embedding <=> %s LIMIT %s"
        elif metric == "euclidean":
            self._query = "SELECT id FROM items ORDER BY embedding <-> %s LIMIT %s"
        else:
            raise RuntimeError(f"unknown metric {metric}")

    def fit(self, X):
        """Recreate the ``items`` table, bulk-load *X* and build the index.

        Args:
            X: 2-D array-like of vectors; ``X.shape[1]`` is used as the
                vector dimension and the row position becomes the ``id``.

        Raises:
            RuntimeError: If the stored metric is not supported. Note that
                this is only checked after the data has been copied.
        """
        conn = psycopg.connect(host="localhost", user="postgres", password="postgres", dbname="postgres", autocommit=True)
        pgvector.psycopg.register_vector(conn)
        cur = conn.cursor()

        cur.execute("DROP TABLE IF EXISTS items")
        cur.execute("CREATE TABLE items (id int, embedding vector(%d))" % X.shape[1])
        cur.execute("ALTER TABLE items ALTER COLUMN embedding SET STORAGE PLAIN")

        print("copying data...")
        with cur.copy("COPY items (id, embedding) FROM STDIN") as copy:
            for i, embedding in enumerate(X):
                copy.write_row((i, embedding))

        print("creating index...")
        if self._metric not in ("angular", "euclidean"):
            raise RuntimeError(f"unknown metric {self._metric}")

        # Both metrics build the index with the same statement, so it is
        # composed once and logged before it runs.
        # NOTE(review): no operator class is specified here. Confirm that the
        # default operator class of the installed pgvectorscale version
        # supports `<->`; otherwise euclidean queries will not use this index.
        create_index = (
            "CREATE INDEX ON items USING diskann (embedding) WITH (num_neighbors = %d, search_list_size = %d)"
            % (self._m, self._ef_construction)
        )
        print(create_index)
        cur.execute(create_index)
        print("done!")

        # The cursor (and its connection) stay open for the query phase.
        self._cur = cur

    def set_query_arguments(self, query_search_list_size):
        """Apply the query-time settings to the current session.

        Args:
            query_search_list_size: A two-item sequence
                ``(query_search_list_size, query_rescore)``; despite the
                parameter name, both values arrive packed in this one
                argument.
        """
        self.query_search_list_size, self.query_rescore = query_search_list_size
        self._cur.execute("SET diskann.query_search_list_size = %d" % self.query_search_list_size)
        self._cur.execute("SET diskann.query_rescore = %d" % self.query_rescore)

    def query(self, v, n):
        """Return the ids of the *n* rows nearest to the vector *v*."""
        self._cur.execute(self._query, (v, n), binary=True, prepare=True)
        # Each row is a one-column tuple holding the id.
        return [item_id for (item_id,) in self._cur.fetchall()]

    def get_memory_usage(self):
        """Return the index relation size divided by 1024, or 0 before fit().

        The index is looked up as ``items_embedding_idx``, which is expected
        to be the name PostgreSQL assigns to the unnamed index created in
        ``fit`` (verify if the table or column name changes).
        """
        if self._cur is None:
            return 0
        self._cur.execute("SELECT pg_relation_size('items_embedding_idx')")
        return self._cur.fetchone()[0] / 1024

    def __str__(self):
        """Describe the build and query parameters of this instance."""
        return f"TimescaleDiskANN(num_neighbors={self._m}, search_list_size={self._ef_construction}, query_search_list_size={self.query_search_list_size}, query_rescore={self.query_rescore})"
I tried comparing the performance of pgvectorscale (HEAD) with pgvector's HNSW (HEAD) using ann-benchmarks, and I see the following recall/throughput comparison. The attached plot is for the glove-100 dataset, and I see a similar plot for deep-96. I've included the ann-benchmarks code and config, based on the pgvectorscale recommended values, below. Is this expected?
module.py
config.yml
The text was updated successfully, but these errors were encountered: