[WIP] test: evaluating audio vec ES index and search (#77)

* test: evaluating audio vec ES index and search
* docs: delete stored documents
tattle-made · Feb 13, 2024 · ad94ad7 · ad94ad7
1 parent 812cd1d
commit ad94ad7
Show file tree

Hide file tree

Showing 6 changed files with 246 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,8 @@ src/api/core/operators/yolov8n-seg.pt
 
 # SonarQube
 .scannerwork
+
+# Audio Files
+**100_audio_files/
+**50_audio_files/
+
diff --git a/src/api/Dockerfile b/src/api/Dockerfile
@@ -13,11 +13,12 @@ RUN apt-get update \
     && rm -rf /var/lib/apt/lists/* 
 ENV PATH=/root/.local/bin:$PATH
 RUN pip install --no-cache-dir --upgrade pip
-# RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
+RUN apt-get update && apt-get -y upgrade && apt-get install -y vim curl
 # RUN apt-get install -y ffmpeg
 # RUN apt-get update && \
 #     apt-get -y upgrade && \
 #     apt-get install -y tesseract-ocr tesseract-ocr-hin
+# -y keeps the install non-interactive: `docker build` cannot answer apt's confirmation prompt.
+RUN apt-get update && apt-get -y upgrade && apt-get install -y wget
 WORKDIR /app
 COPY requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir --user -r requirements.txt

diff --git a/src/api/core/operators/audio_vec_embedding.py b/src/api/core/operators/audio_vec_embedding.py
@@ -33,14 +33,9 @@ def run(audio_file):
     return normalized_v
 
 # if __name__ == "__main__":
-#     import json
-#     import os
-#     audio_file_path = r'sample_data/google-dataset/a-cappella-chorus.wav'
+#     import time
+#     audio_file_path = r'sample_data/audio.wav'
 #     initialize(param={})
-#     audio_filename = os.path.splitext(os.path.basename(audio_file_path))[0]
 #     audio_emb = run(audio_file_path)
 #     audio_emb_list = audio_emb.tolist()
-#     print(audio_emb_list)
-    # json_filename = fr"sample_data/jsons/{audio_filename}_emb.json"
-    # with open(json_filename, 'w') as f:
-    #     json.dump(audio_emb_list, f)
+#     print(audio_emb_list)
diff --git a/src/api/core/operators/audio_vec_embedding_requirements.in b/src/api/core/operators/audio_vec_embedding_requirements.in
@@ -0,0 +1,3 @@
+numpy==1.26.4
+librosa==0.10.1
+panns-inference==0.1.1
diff --git a/src/api/core/operators/audio_vec_embedding_requirements.txt b/src/api/core/operators/audio_vec_embedding_requirements.txt
@@ -0,0 +1,96 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile audio_vec_embedding_requirements.in
+#
+audioread==3.0.1
+    # via librosa
+certifi==2024.2.2
+    # via requests
+cffi==1.16.0
+    # via soundfile
+charset-normalizer==3.3.2
+    # via requests
+contourpy==1.2.0
+    # via matplotlib
+cycler==0.12.1
+    # via matplotlib
+decorator==5.1.1
+    # via librosa
+fonttools==4.48.1
+    # via matplotlib
+idna==3.6
+    # via requests
+joblib==1.3.2
+    # via
+    #   librosa
+    #   scikit-learn
+kiwisolver==1.4.5
+    # via matplotlib
+lazy-loader==0.3
+    # via librosa
+librosa==0.10.1
+    # via
+    #   -r audio_vec_embedding_requirements.in
+    #   panns-inference
+    #   torchlibrosa
+llvmlite==0.42.0
+    # via numba
+matplotlib==3.8.2
+    # via panns-inference
+msgpack==1.0.7
+    # via librosa
+numba==0.59.0
+    # via librosa
+numpy==1.26.4
+    # via
+    #   -r audio_vec_embedding_requirements.in
+    #   contourpy
+    #   librosa
+    #   matplotlib
+    #   numba
+    #   scikit-learn
+    #   scipy
+    #   soxr
+    #   torchlibrosa
+packaging==23.2
+    # via
+    #   matplotlib
+    #   pooch
+panns-inference==0.1.1
+    # via -r audio_vec_embedding_requirements.in
+pillow==10.2.0
+    # via matplotlib
+platformdirs==4.2.0
+    # via pooch
+pooch==1.8.0
+    # via librosa
+pycparser==2.21
+    # via cffi
+pyparsing==3.1.1
+    # via matplotlib
+python-dateutil==2.8.2
+    # via matplotlib
+requests==2.31.0
+    # via pooch
+scikit-learn==1.4.0
+    # via librosa
+scipy==1.12.0
+    # via
+    #   librosa
+    #   scikit-learn
+six==1.16.0
+    # via python-dateutil
+soundfile==0.12.1
+    # via librosa
+soxr==0.3.7
+    # via librosa
+threadpoolctl==3.2.0
+    # via scikit-learn
+torchlibrosa==0.1.0
+    # via panns-inference
+typing-extensions==4.9.0
+    # via librosa
+urllib3==2.2.0
+    # via requests
diff --git a/src/api/test_audio_es_vec.py b/src/api/test_audio_es_vec.py
@@ -0,0 +1,137 @@
+import unittest
+from unittest.case import skip
+import requests
+import pprint
+import os
+from elasticsearch import Elasticsearch
+from core.operators import audio_vec_embedding
+from time import sleep
+
+pp = pprint.PrettyPrinter(indent=4)
+'''
+Check how many documents have been indexed
+curl -X GET "http://es:9200/_cat/indices?v"
+Delete all the documents in an index
+curl -X POST "http://es:9200/test_audio/_delete_by_query" -H 'Content-Type: application/json' -d'{"query":{"match_all":{}}}'
+'''
+
+class TestAudioES(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        # ping es server to see if its working
+        response = requests.get("http://es:9200")
+
+        if response.status_code == 200:
+            print("Elastic search server is running")
+        else:
+            print("No elasticsearch service found. Tests are bound to fail.")
+
+        cls.es_host = os.environ.get("ES_HOST")
+        try:
+            cls.config = {"host": cls.es_host, "port": 9200, "scheme": "http"}
+            cls.client = Elasticsearch([cls.config,])
+            print("Success Connecting to Elasticsearch")
+        except Exception:
+            print("Error Connecting to Elasticsearch")
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        print("TEARING DOWN CLASS")
+        pass
+
+    def create_test_audio_index(self):
+        global index_name
+        index_name = "test_audio"
+        index_config = {
+            "mappings": {
+                "_source": {
+                    "excludes": ["audio-embedding"]
+                },
+                "properties": {
+                "audio-embedding": {
+                    "type": "dense_vector",
+                    "dims": 2048,
+                    "index": True,
+                    "similarity": "cosine"
+                },
+                }
+            }
+        }
+        try:
+            if self.client.indices.exists(index=index_name):
+                print(f"Index '{index_name}' already exists.")
+                return
+            response = self.client.indices.create(index=index_name, body=index_config)
+            if response["acknowledged"]:
+                print(f"Index '{index_name}' created successfully.")
+            else:
+                print(f"Failed to create index '{index_name}'.")
+        except Exception as e:
+            print(f"Error creating index '{index_name}': {e}")
+
+    @skip
+    def test_store_audio_vector(self):
+        # create the audio indice
+        self.create_test_audio_index()
+        # generate an audio vector
+        audio_vec_embedding.initialize(param=None)
+        audio_file_path = r'core/operators/sample_data/audio.wav'
+        audio_emb = audio_vec_embedding.run(audio_file_path)
+        audio_emb_vec = audio_emb.tolist()
+        # index the vector
+        body = {
+            'audio-embedding' : audio_emb_vec,
+        }
+        result = self.client.index(index=index_name, document=body)
+        # print(result)
+        self.assertEqual(result["result"], "created")
+
+    # @skip
+    def test_search_audio_vector(self):
+        # create the audio indice
+        self.create_test_audio_index()
+        # generate an audio vector
+        audio_vec_embedding.initialize(param=None)
+        audio_file_path = r'core/operators/sample_data/audio.wav'
+        audio_emb = audio_vec_embedding.run(audio_file_path)
+        audio_emb_vec = audio_emb.tolist()
+        # index the vector
+        body = {
+            'audio-embedding' : audio_emb_vec,
+        }
+        self.client.index(index=index_name, document=body)
+        # search for it
+        query = {
+            "query": {
+                "script_score": {
+                    "query": {"match_all": {}},
+                    "script": {
+                        "source": "cosineSimilarity(params.query_vector, 'audio-embedding') + 1.0",
+                        "params": {"query_vector": audio_emb_vec}
+                    }
+                }
+            }
+        }
+        search_result = self.client.search(index="test_audio", body=query)
+        print(search_result)
+
+    @skip
+    def test_store_and_search_50files(self):
+        self.create_test_audio_index()
+        audio_vec_embedding.initialize(param=None)
+        audio_folder_path = r'core/operators/sample_data/50_audio_files'
+        for file_name in os.listdir(audio_folder_path):
+            audio_file_path = os.path.join(audio_folder_path, file_name)
+            audio_emb = audio_vec_embedding.run(audio_file_path)
+            audio_emb_vec = audio_emb.tolist()
+            body = {
+                'audio-embedding' : audio_emb_vec,
+            }
+            self.client.index(index=index_name, document=body)
+            sleep(0.5)
+
+
+
+
+
+