Skip to content

Commit

Permalink
[WIP] test: evaluating audio vec ES index and search (#77)
Browse files Browse the repository at this point in the history
* test: evaluating audio vec ES index and search

* docs: delete stored documents
  • Loading branch information
aatmanvaidya authored Feb 13, 2024
1 parent 812cd1d commit ad94ad7
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 9 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,8 @@ src/api/core/operators/yolov8n-seg.pt

# SonarQube
.scannerwork

# Audio Files
**100_audio_files/
**50_audio_files/

3 changes: 2 additions & 1 deletion src/api/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ RUN apt-get update \
&& rm -rf /var/lib/apt/lists/*
ENV PATH=/root/.local/bin:$PATH
RUN pip install --no-cache-dir --upgrade pip
# RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
# RUN apt-get install -y ffmpeg
# RUN apt-get update && \
# apt-get -y upgrade && \
# apt-get install -y tesseract-ocr tesseract-ocr-hin
# NOTE(review): wget is presumably needed at runtime to download model
# checkpoints for the audio operator — confirm before removing.
# -y keeps the install non-interactive so the image build cannot abort at the
# confirmation prompt when wget pulls in additional packages.
RUN apt-get update && apt-get -y upgrade && apt-get install -y wget
WORKDIR /app
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir --user -r requirements.txt
Expand Down
11 changes: 3 additions & 8 deletions src/api/core/operators/audio_vec_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,9 @@ def run(audio_file):
return normalized_v

# if __name__ == "__main__":
# import json
# import os
# audio_file_path = r'sample_data/google-dataset/a-cappella-chorus.wav'
# import time
# audio_file_path = r'sample_data/audio.wav'
# initialize(param={})
# audio_filename = os.path.splitext(os.path.basename(audio_file_path))[0]
# audio_emb = run(audio_file_path)
# audio_emb_list = audio_emb.tolist()
# print(audio_emb_list)
# json_filename = fr"sample_data/jsons/{audio_filename}_emb.json"
# with open(json_filename, 'w') as f:
# json.dump(audio_emb_list, f)
# print(audio_emb_list)
3 changes: 3 additions & 0 deletions src/api/core/operators/audio_vec_embedding_requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
numpy==1.26.4
librosa==0.10.1
panns-inference==0.1.1
96 changes: 96 additions & 0 deletions src/api/core/operators/audio_vec_embedding_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile audio_vec_embedding_requirements.in
#
audioread==3.0.1
# via librosa
certifi==2024.2.2
# via requests
cffi==1.16.0
# via soundfile
charset-normalizer==3.3.2
# via requests
contourpy==1.2.0
# via matplotlib
cycler==0.12.1
# via matplotlib
decorator==5.1.1
# via librosa
fonttools==4.48.1
# via matplotlib
idna==3.6
# via requests
joblib==1.3.2
# via
# librosa
# scikit-learn
kiwisolver==1.4.5
# via matplotlib
lazy-loader==0.3
# via librosa
librosa==0.10.1
# via
# -r audio_vec_embedding_requirements.in
# panns-inference
# torchlibrosa
llvmlite==0.42.0
# via numba
matplotlib==3.8.2
# via panns-inference
msgpack==1.0.7
# via librosa
numba==0.59.0
# via librosa
numpy==1.26.4
# via
# -r audio_vec_embedding_requirements.in
# contourpy
# librosa
# matplotlib
# numba
# scikit-learn
# scipy
# soxr
# torchlibrosa
packaging==23.2
# via
# matplotlib
# pooch
panns-inference==0.1.1
# via -r audio_vec_embedding_requirements.in
pillow==10.2.0
# via matplotlib
platformdirs==4.2.0
# via pooch
pooch==1.8.0
# via librosa
pycparser==2.21
# via cffi
pyparsing==3.1.1
# via matplotlib
python-dateutil==2.8.2
# via matplotlib
requests==2.31.0
# via pooch
scikit-learn==1.4.0
# via librosa
scipy==1.12.0
# via
# librosa
# scikit-learn
six==1.16.0
# via python-dateutil
soundfile==0.12.1
# via librosa
soxr==0.3.7
# via librosa
threadpoolctl==3.2.0
# via scikit-learn
torchlibrosa==0.1.0
# via panns-inference
typing-extensions==4.9.0
# via librosa
urllib3==2.2.0
# via requests
137 changes: 137 additions & 0 deletions src/api/test_audio_es_vec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import unittest
from unittest.case import skip
import requests
import pprint
import os
from elasticsearch import Elasticsearch
from core.operators import audio_vec_embedding
from time import sleep

# Module-level pretty-printer with a 4-space indent. It is not referenced by
# the tests in this file; presumably kept for ad-hoc debugging output.
pp = pprint.PrettyPrinter(indent=4)
# The bare string below is operator notes, not a docstring (it follows a
# statement): curl commands for inspecting and emptying the test index by hand.
'''
Check how many documents have been indexed
curl -X GET "http://es:9200/_cat/indices?v"
Delete all the documents in an index
curl -X POST "http://es:9200/test_audio/_delete_by_query" -H 'Content-Type: application/json' -d'{"query":{"match_all":{}}}'
'''

class TestAudioES(unittest.TestCase):
    """Integration tests for storing and searching audio vectors in Elasticsearch.

    The tests talk to a live Elasticsearch server on port 9200. The host is
    read from the ``ES_HOST`` environment variable and falls back to ``es``.
    Audio embeddings come from ``core.operators.audio_vec_embedding`` and are
    written to the index named by ``index_name``.
    """

    # Index shared by every test in this class. Kept as a class attribute so
    # tests do not depend on a module global that only exists after
    # create_test_audio_index() has run.
    index_name = "test_audio"
    es_port = 9200
    sample_audio_path = r'core/operators/sample_data/audio.wav'
    sample_folder_path = r'core/operators/sample_data/50_audio_files'

    @classmethod
    def setUpClass(cls) -> None:
        """Ping the Elasticsearch server and build the shared client.

        Raises:
            Exception: whatever the ``Elasticsearch`` constructor raises; it is
                re-raised so the failure is reported here instead of as a
                missing ``client`` attribute inside each test.
        """
        # Use the same host for the ping and for the client.
        cls.es_host = os.environ.get("ES_HOST", "es")

        # Ping the server to see if it is working. A timeout and an explicit
        # handler make an unreachable server produce the warning below rather
        # than hanging or aborting before the message is printed.
        try:
            response = requests.get(
                f"http://{cls.es_host}:{cls.es_port}", timeout=10
            )
        except requests.exceptions.RequestException as e:
            print(
                "No elasticsearch service found. Tests are bound to fail."
                f" ({e})"
            )
        else:
            if response.status_code == 200:
                print("Elastic search server is running")
            else:
                print("No elasticsearch service found. Tests are bound to fail.")

        cls.config = {"host": cls.es_host, "port": cls.es_port, "scheme": "http"}
        try:
            cls.client = Elasticsearch([cls.config])
        except Exception:
            print("Error Connecting to Elasticsearch")
            raise
        print("Success Connecting to Elasticsearch")

    @classmethod
    def tearDownClass(cls) -> None:
        """Log class teardown; indexed documents are left in place."""
        print("TEARING DOWN CLASS")

    def create_test_audio_index(self):
        """Create the test index with a dense-vector mapping if it is missing.

        The mapping declares ``audio-embedding`` as an indexed 2048-dimension
        ``dense_vector`` with cosine similarity and excludes that field from
        ``_source``. Errors are printed rather than raised (best effort).
        """
        index_config = {
            "mappings": {
                "_source": {
                    "excludes": ["audio-embedding"]
                },
                "properties": {
                    "audio-embedding": {
                        "type": "dense_vector",
                        "dims": 2048,
                        "index": True,
                        "similarity": "cosine"
                    },
                }
            }
        }
        try:
            if self.client.indices.exists(index=self.index_name):
                print(f"Index '{self.index_name}' already exists.")
                return
            response = self.client.indices.create(
                index=self.index_name, body=index_config
            )
            if response["acknowledged"]:
                print(f"Index '{self.index_name}' created successfully.")
            else:
                print(f"Failed to create index '{self.index_name}'.")
        except Exception as e:
            print(f"Error creating index '{self.index_name}': {e}")

    def _embed_audio(self, audio_file_path):
        """Return the embedding of ``audio_file_path`` as a plain list."""
        return audio_vec_embedding.run(audio_file_path).tolist()

    def _index_embedding(self, embedding, **index_kwargs):
        """Index one embedding document and return the client's response.

        Extra keyword arguments (e.g. ``refresh``) are forwarded to
        ``client.index``.
        """
        body = {
            'audio-embedding': embedding,
        }
        return self.client.index(
            index=self.index_name, document=body, **index_kwargs
        )

    @skip
    def test_store_audio_vector(self):
        """Index one audio embedding and check that a document was created."""
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        audio_emb_vec = self._embed_audio(self.sample_audio_path)
        result = self._index_embedding(audio_emb_vec)
        self.assertEqual(result["result"], "created")

    def test_search_audio_vector(self):
        """Index one audio embedding and find it again by cosine similarity."""
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        audio_emb_vec = self._embed_audio(self.sample_audio_path)
        # Refresh on write so the document is visible to the search that
        # follows immediately; without it the new document may be missed.
        self._index_embedding(audio_emb_vec, refresh=True)
        query = {
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'audio-embedding') + 1.0",
                        "params": {"query_vector": audio_emb_vec}
                    }
                }
            }
        }
        search_result = self.client.search(index=self.index_name, body=query)
        print(search_result)
        hits = search_result["hits"]["hits"]
        self.assertGreater(len(hits), 0)
        # The query vector is the vector just indexed, so the best hit has
        # cosine similarity 1.0, i.e. a script score of 2.0.
        self.assertAlmostEqual(hits[0]["_score"], 2.0, places=3)

    @skip
    def test_store_and_search_50files(self):
        """Index an embedding for every file in the 50-file sample folder.

        Despite the name, this test only stores vectors; it runs no search.
        """
        self.create_test_audio_index()
        audio_vec_embedding.initialize(param=None)
        for file_name in os.listdir(self.sample_folder_path):
            audio_file_path = os.path.join(self.sample_folder_path, file_name)
            self._index_embedding(self._embed_audio(audio_file_path))
            # Pause between writes, as the original test did.
            sleep(0.5)






0 comments on commit ad94ad7

Please sign in to comment.