-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[WIP] test: evaluating audio vec ES index and search (#77)
* test: evaluating audio vec ES index and search * docs: delete stored documents
- Loading branch information
1 parent
812cd1d
commit ad94ad7
Showing
6 changed files
with
246 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
numpy==1.26.4 | ||
librosa==0.10.1 | ||
panns-inference==0.1.1 |
96 changes: 96 additions & 0 deletions
96
src/api/core/operators/audio_vec_embedding_requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# | ||
# This file is autogenerated by pip-compile with Python 3.12 | ||
# by the following command: | ||
# | ||
# pip-compile audio_vec_embedding_requirements.in | ||
# | ||
audioread==3.0.1 | ||
# via librosa | ||
certifi==2024.2.2 | ||
# via requests | ||
cffi==1.16.0 | ||
# via soundfile | ||
charset-normalizer==3.3.2 | ||
# via requests | ||
contourpy==1.2.0 | ||
# via matplotlib | ||
cycler==0.12.1 | ||
# via matplotlib | ||
decorator==5.1.1 | ||
# via librosa | ||
fonttools==4.48.1 | ||
# via matplotlib | ||
idna==3.6 | ||
# via requests | ||
joblib==1.3.2 | ||
# via | ||
# librosa | ||
# scikit-learn | ||
kiwisolver==1.4.5 | ||
# via matplotlib | ||
lazy-loader==0.3 | ||
# via librosa | ||
librosa==0.10.1 | ||
# via | ||
# -r audio_vec_embedding_requirements.in | ||
# panns-inference | ||
# torchlibrosa | ||
llvmlite==0.42.0 | ||
# via numba | ||
matplotlib==3.8.2 | ||
# via panns-inference | ||
msgpack==1.0.7 | ||
# via librosa | ||
numba==0.59.0 | ||
# via librosa | ||
numpy==1.26.4 | ||
# via | ||
# -r audio_vec_embedding_requirements.in | ||
# contourpy | ||
# librosa | ||
# matplotlib | ||
# numba | ||
# scikit-learn | ||
# scipy | ||
# soxr | ||
# torchlibrosa | ||
packaging==23.2 | ||
# via | ||
# matplotlib | ||
# pooch | ||
panns-inference==0.1.1 | ||
# via -r audio_vec_embedding_requirements.in | ||
pillow==10.2.0 | ||
# via matplotlib | ||
platformdirs==4.2.0 | ||
# via pooch | ||
pooch==1.8.0 | ||
# via librosa | ||
pycparser==2.21 | ||
# via cffi | ||
pyparsing==3.1.1 | ||
# via matplotlib | ||
python-dateutil==2.8.2 | ||
# via matplotlib | ||
requests==2.31.0 | ||
# via pooch | ||
scikit-learn==1.4.0 | ||
# via librosa | ||
scipy==1.12.0 | ||
# via | ||
# librosa | ||
# scikit-learn | ||
six==1.16.0 | ||
# via python-dateutil | ||
soundfile==0.12.1 | ||
# via librosa | ||
soxr==0.3.7 | ||
# via librosa | ||
threadpoolctl==3.2.0 | ||
# via scikit-learn | ||
torchlibrosa==0.1.0 | ||
# via panns-inference | ||
typing-extensions==4.9.0 | ||
# via librosa | ||
urllib3==2.2.0 | ||
# via requests |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
import unittest | ||
from unittest.case import skip | ||
import requests | ||
import pprint | ||
import os | ||
from elasticsearch import Elasticsearch | ||
from core.operators import audio_vec_embedding | ||
from time import sleep | ||
|
||
# Module-level pretty-printer (4-space indent); none of the tests visible in this file call it.
pp = pprint.PrettyPrinter(indent=4) | ||
# The bare string below is a maintainer note, not executed logic: curl commands
# for inspecting index document counts and for emptying the test index by hand.
''' | ||
Check how many documents have been indexed | ||
curl -X GET "http://es:9200/_cat/indices?v" | ||
Delete all the documents in an index | ||
curl -X POST "http://es:9200/test_audio/_delete_by_query" -H 'Content-Type: application/json' -d'{"query":{"match_all":{}}}' | ||
''' | ||
|
||
class TestAudioES(unittest.TestCase):
    """Integration tests: store audio embeddings in Elasticsearch and search them.

    The tests need a reachable Elasticsearch server and the project's
    ``audio_vec_embedding`` operator. The server host is read from the
    ``ES_HOST`` environment variable and falls back to ``"es"`` (the host the
    original health check pinged) when the variable is unset or empty.
    """

    # Index shared by every test in this class.
    INDEX_NAME = "test_audio"
    ES_PORT = 9200
    # Seconds to wait for the health-check ping; requests has no default
    # timeout, so without this an unreachable host can block the run forever.
    ES_PING_TIMEOUT = 5
    SAMPLE_AUDIO = r'core/operators/sample_data/audio.wav'
    SAMPLE_AUDIO_DIR = r'core/operators/sample_data/50_audio_files'

    @classmethod
    def setUpClass(cls) -> None:
        """Ping the Elasticsearch server and build the client used by the tests.

        Raises:
            Exception: whatever the ``Elasticsearch`` constructor raises; it is
                re-raised after logging so the class fails with the real cause
                instead of a later ``AttributeError`` on ``cls.client``.
        """
        # Ping the same host the client will talk to, not a hard-coded one.
        cls.es_host = os.environ.get("ES_HOST") or "es"
        ping_url = f"http://{cls.es_host}:{cls.ES_PORT}"

        try:
            response = requests.get(ping_url, timeout=cls.ES_PING_TIMEOUT)
        except requests.RequestException as exc:
            # The ping is advisory only: report and carry on, as the message says.
            print("No elasticsearch service found. Tests are bound to fail.")
            print(f"Ping to {ping_url} failed: {exc}")
        else:
            if response.status_code == 200:
                print("Elastic search server is running")
            else:
                print("No elasticsearch service found. Tests are bound to fail.")

        cls.config = {"host": cls.es_host, "port": cls.ES_PORT, "scheme": "http"}
        try:
            cls.client = Elasticsearch([cls.config])
        except Exception:
            print("Error Connecting to Elasticsearch")
            raise
        print("Success Connecting to Elasticsearch")

    @classmethod
    def tearDownClass(cls) -> None:
        """Log class teardown; indexed documents are intentionally left in place."""
        print("TEARING DOWN CLASS")

    def create_test_audio_index(self):
        """Create the test index with a dense-vector mapping if it does not exist.

        The mapping declares ``audio-embedding`` as an indexed 2048-dimension
        ``dense_vector`` using cosine similarity, and excludes that field from
        ``_source`` so search hits do not echo the whole vector back.

        Raises:
            Exception: any client error is logged and re-raised. Continuing
                after a failed creation would let Elasticsearch auto-create the
                index without the vector mapping and hide the real problem.
        """
        index_config = {
            "mappings": {
                "_source": {
                    "excludes": ["audio-embedding"]
                },
                "properties": {
                    "audio-embedding": {
                        "type": "dense_vector",
                        "dims": 2048,
                        "index": True,
                        "similarity": "cosine"
                    },
                }
            }
        }
        index_name = self.INDEX_NAME
        try:
            if self.client.indices.exists(index=index_name):
                print(f"Index '{index_name}' already exists.")
                return
            response = self.client.indices.create(index=index_name, body=index_config)
        except Exception as e:
            print(f"Error creating index '{index_name}': {e}")
            raise
        if response["acknowledged"]:
            print(f"Index '{index_name}' created successfully.")
        else:
            print(f"Failed to create index '{index_name}'.")

    @staticmethod
    def _embed_audio(audio_file_path):
        """Run the audio operator on one file and return its embedding as a list.

        Assumes ``audio_vec_embedding.initialize`` has already been called and
        that ``run`` returns an object with ``tolist()`` (e.g. a NumPy array).
        """
        return audio_vec_embedding.run(audio_file_path).tolist()

    @skip
    def test_store_audio_vector(self):
        """Index one embedding and check Elasticsearch reports it as created."""
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        body = {
            'audio-embedding': self._embed_audio(self.SAMPLE_AUDIO),
        }
        result = self.client.index(index=self.INDEX_NAME, document=body)
        self.assertEqual(result["result"], "created")

    def test_search_audio_vector(self):
        """Index one embedding, then find it again via cosine-similarity scoring."""
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        audio_emb_vec = self._embed_audio(self.SAMPLE_AUDIO)
        body = {
            'audio-embedding': audio_emb_vec,
        }
        # Wait for a refresh so the new document is visible to the search
        # below; without it the search can race the indexing operation.
        self.client.index(index=self.INDEX_NAME, document=body, refresh="wait_for")
        query = {
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'audio-embedding') + 1.0",
                        "params": {"query_vector": audio_emb_vec}
                    }
                }
            }
        }
        search_result = self.client.search(index=self.INDEX_NAME, body=query)
        print(search_result)

        hits = search_result["hits"]["hits"]
        self.assertTrue(hits, "search returned no documents")
        # Searching with the exact vector just indexed: cosine similarity 1.0,
        # plus the 1.0 offset in the script, gives a best score of about 2.0.
        self.assertAlmostEqual(hits[0]["_score"], 2.0, delta=1e-2)

    @skip
    def test_store_and_search_50files(self):
        """Index an embedding for every file in the 50-file sample folder."""
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        for file_name in os.listdir(self.SAMPLE_AUDIO_DIR):
            audio_file_path = os.path.join(self.SAMPLE_AUDIO_DIR, file_name)
            body = {
                'audio-embedding': self._embed_audio(audio_file_path),
            }
            self.client.index(index=self.INDEX_NAME, document=body)
            # Pause between documents, as the original did, to pace indexing.
            sleep(0.5)
|
||
|
||
|
||
|
||
|
||
|