initial commit

Brannigan123 · Aug 30, 2020 · 4e359e1 · 4e359e1
commit 4e359e1
Show file tree

Hide file tree

Showing 16 changed files with 265 additions and 0 deletions.
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/reverse image search.iml b/.idea/reverse image search.iml
diff --git a/README.md b/README.md
@@ -0,0 +1,35 @@
+# Hy Image Search
+An image search implementation in Python using TensorFlow Keras, scikit-learn, SciPy and Matplotlib.
+
+
+The image embeddings are generated from Xception imagenet (*can be changed/tuned from [features.py](features.py)*).
+Currently the embeddings are stored using pickle, but a database may be used instead.
+Image embeddings are compared using a combination of cosine distance and Hamming distance.
+
+## Running search on image(s) from console args
+1. Run *[search.py](search.py)* with image path(s) as arguments and the script will display the top matches from the underlying image library
+
+**Sample**
+<img src="single_img_search.png" width="600em" hspace=10 vspace=10/>
+
+## Running search on a custom set of images
+1. Transfer the images to the queries folder *[data/queries/](data/queries)* or, alternatively, change `query_images_folder_path` in *[paths.py](paths.py)* to the location of your image set.
+2. Run *[search.py](search.py)* and the script will display the top matches from the underlying image library
+
+**Sample**
+<img src="img_search.png" width="600em" hspace=10 vspace=10/>
+
+
+## Generating embeddings for a new image set
+1. Transfer the images to the images folder *[data/images/](data/images)* or, alternatively, change `images_folder_path` in *[paths.py](paths.py)* to the location of your image set.
+2. Run *[features.py](features.py)* to generate the embeddings
+
+## Viewing the similarity map of the underlying image library (PCA and t-SNE)
+1. Run *[dataset.py](dataset.py)* and you should get a visualization similar to this one
+
+<img src="tSNE similarity.png" width="1000em" hspace=10 vspace=10/>
+
+
+----------
+
+*The sample images used here are from the IdenProf dataset*
diff --git a/data/.gitignore b/data/.gitignore
@@ -0,0 +1,2 @@
+/features.pickle
+/filenames.pickle
diff --git a/data/images/.gitignore b/data/images/.gitignore
@@ -0,0 +1,6 @@
+/*.png
+/*.PNG
+/*.jpg
+/*.JPG
+/*.jpeg
+/*.JPEG
diff --git a/data/queries/.gitignore b/data/queries/.gitignore
@@ -0,0 +1,6 @@
+/*.png
+/*.PNG
+/*.jpg
+/*.JPG
+/*.jpeg
+/*.JPEG
diff --git a/dataset.py b/dataset.py
@@ -0,0 +1,68 @@
+import os
+import pickle
+import matplotlib.pyplot as plt
+from matplotlib.offsetbox import OffsetImage, AnnotationBbox
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE
+from sklearn.preprocessing import StandardScaler
+from tensorflow.keras.preprocessing.image import load_img
+from tensorflow.keras.preprocessing.image import img_to_array
+from tqdm import tqdm
+
+import paths
+
+# Image file extensions (lower- and upper-case spellings) that get_file_list
+# looks for in file names.
+extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
+
+
+def get_file_list(root_dir):
+    file_list = []
+    counter = 1
+
+    for root, dirs, filenames in os.walk(root_dir):
+        for filename in tqdm(filenames):
+            if any(ext in filename for ext in extensions):
+                file_list.append(os.path.join(root, filename))
+                counter += 1
+    return sorted(file_list)
+
+
+def get_stored_features():
+    stored_filenames = pickle.load(open(paths.filenames_path, 'rb'))
+    stored_feature_list = pickle.load(open(paths.features_path, 'rb'))
+    return stored_filenames, stored_feature_list
+
+
+def visualize_features():
+    filenames, features = get_stored_features()
+
+    num_feature_dimensions = 100  # Set the number of features
+    pca = PCA(n_components=num_feature_dimensions)
+    pca.fit(features)
+    feature_list_compressed = pca.transform(features)
+
+    tsne = TSNE(n_components=2, verbose=1, n_iter=4000, metric='cosine', init='pca')
+    tsne_results = tsne.fit_transform(feature_list_compressed)
+    tsne_results = StandardScaler().fit_transform(tsne_results)
+
+    size = (45, 45)
+    imgs = [img_to_array(load_img(path, target_size=size)) / 255 for path in filenames]
+    visualize_scatter_with_images(tsne_results, imgs=imgs, size=size, zoom=0.7)
+
+
+def visualize_scatter_with_images(data, imgs, size=(28, 28), zoom=1):
+    fig, ax = plt.subplots(figsize=size)
+    artist = []
+    for xy, i in tqdm(zip(data, imgs)):
+        x, y = xy
+        img = OffsetImage(i, zoom=zoom)
+        ab = AnnotationBbox(img, (x, y), xycoords='data', frameon=False)
+        artist.append(ax.add_artist(ab))
+    ax.update_datalim(data)
+    ax.autoscale()
+    ax.axis('off')
+    plt.tight_layout(pad=1.2)
+    plt.show()
+
+
+# Running this module as a script displays the similarity map of the stored
+# image library.
+if __name__ == "__main__":
+    visualize_features()
diff --git a/features.py b/features.py
@@ -0,0 +1,44 @@
+import pickle
+
+import numpy as np
+from tensorflow.keras.applications.xception import Xception
+from tensorflow.keras.applications.xception import preprocess_input
+from tensorflow.keras.preprocessing import image
+from tqdm import tqdm
+
+import dataset
+import paths
+
+# Xception with ImageNet weights and without its classification top
+# (include_top=False, pooling='max'); extract_features uses it to compute
+# image features. It is built once, when this module is imported.
+model = Xception(weights='imagenet', include_top=False, input_shape=(224, 224, 3), pooling='max')
+
+
+def extract_features(img_path):
+    img = image.load_img(img_path, target_size=(224, 224))
+    x = image.img_to_array(img)
+    x = np.expand_dims(x, axis=0)
+    x = preprocess_input(x)
+    f = model.predict(x)
+    return f.flatten()
+
+
+def extract_dataset_features():
+    file_list = dataset.get_file_list(paths.images_folder_path)
+    feature_list = []
+
+    for filename in tqdm(file_list):
+        feature_list.append(extract_features(filename))
+
+    return file_list, feature_list
+
+
+def update_features():
+    file_list, features = extract_dataset_features()
+
+    pickle.dump(file_list, open(paths.filenames_path, 'wb'))
+    pickle.dump(features, open(paths.features_path, 'wb'))
+
+    return file_list, features
+
+
+# Running this module as a script regenerates the stored features of the
+# image library.
+if __name__ == "__main__":
+    update_features()
diff --git a/img_search.png b/img_search.png
diff --git a/paths.py b/paths.py
@@ -0,0 +1,5 @@
+# Locations used by the other modules. All of them are relative paths.
+# Pickle file holding the list of library image paths.
+filenames_path = 'data/filenames.pickle'
+# Pickle file holding the features extracted for those images.
+features_path = 'data/features.pickle'
+# Folder scanned for the images of the library.
+images_folder_path = 'data/images'
+# Folder scanned for query images when search.py gets no arguments.
+query_images_folder_path = 'data/queries'
+
diff --git a/search.py b/search.py
@@ -0,0 +1,67 @@
+import sys
+from random import shuffle
+import matplotlib.pyplot as plt
+from scipy import spatial
+from tensorflow.keras.preprocessing.image import load_img
+from tensorflow.keras.preprocessing.image import img_to_array
+from tqdm import tqdm
+
+import dataset
+import features
+import paths
+
+
+def visualize_similar_images(img_paths, max_query_imgs=7, max_matches=5):
+    img_paths = img_paths[:min(max_query_imgs, len(img_paths))]
+    fig, axs = plt.subplots(len(img_paths), max_matches + 1, figsize=(10, 10))
+
+    for i in tqdm(range(len(img_paths))):
+        img_path = img_paths[i]
+        similar = similar_images_paths(img_path, max_imgs=max_matches)
+
+        __plot_similarities__(axs[i], img_path, similar)
+
+    plt.tight_layout(h_pad=2)
+    plt.show()
+
+
+def __plot_similarities__(ax, img_path, similar):
+    ax[0].set_title('Query image', size=7)
+    ax[0].imshow(img_to_array(load_img(img_path)) / 255)
+    ax[0].axis('off')
+    ax[0].autoscale()
+
+    cnt = 1
+    for path, similarity in similar:
+        ax[cnt].imshow(img_to_array(load_img(path)) / 255)
+        ax[cnt].set_title('Related image\n similarity %f' % (similarity,), size=7)
+        ax[cnt].axis('off')
+        ax[cnt].autoscale()
+        cnt += 1
+
+
+def similar_images_paths(img_path, max_imgs=4):
+    query_features = features.extract_features(img_path)
+    stored_features = dataset.get_stored_features()
+
+    max_imgs = min(max_imgs, len(stored_features[0]))
+    similarities = []
+
+    for filename, encoding in list(zip(*stored_features)):
+        h_distance = spatial.distance.hamming(query_features, encoding)
+        c_distance = spatial.distance.cosine(query_features, encoding)
+        similarity = 1 - (h_distance + c_distance) / 2
+        similarities.append((filename, similarity))
+
+    similarities.sort(key=lambda tup: -tup[1])
+    return similarities[:max_imgs]
+
+
+if __name__ == "__main__":
+    args = sys.argv
+    if len(args) > 1:
+        visualize_similar_images(args[1:])
+    else:
+        paths = dataset.get_file_list(paths.query_images_folder_path)
+        shuffle(paths)
+        visualize_similar_images(paths)
diff --git a/single_img_search.png b/single_img_search.png
diff --git a/tSNE similarity.png b/tSNE similarity.png
-Original file line number
+Diff line change
@@ -0,0 +1,6 @@
+    /*.png
+    /*.PNG
+    /*.jpg
+    /*.JPG
+    /*.jpeg
+    /*.JPEG