
Commit 9166f7c

Updated Files
1 parent 956b560 commit 9166f7c

File tree

41 files changed: +1341, -0 lines


Assessing the Performance of various Models.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Choosing features and metrics for nearest neighbor search.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Classification using Sentiment Analysis.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Data Structure.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Data Visualization.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Deep Learning Assignment.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Documet retrieval using clustering.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Exploring Ensemble methods.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Exploring precision and recall .ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Final_dataset.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Gradient Boosted tree.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Handling Overfitting in decision tree.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

House_prediction.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Identifying safe loans with decision trees.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Image Classification using Deep Learning.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Image Retrieval using Deep Features.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Implementing LASSO using coordinate descent.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Implementing Locality Sensitive Hashing from scratch.ipynb

Lines changed: 1 addition & 0 deletions

The entire notebook is added as a single JSON line; its cells are rendered below.
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Implementing Locality Sensitive Hashing from scratch.ipynb","provenance":[],"mount_file_id":"1mWqE5OC7qPU-hZvJVb2vIK49j1IMaDR8","authorship_tag":"ABX9TyP02GKi9OiFfKU1O61t5yEp"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"jYg_PVic64sp"},"source":["Locality Sensitive Hashing (LSH) provides for a fast, efficient approximate nearest neighbor search. The algorithm scales well with respect to the number of data points as well as dimensions.\n","\n","In this assignment, you will\n","\n"," Implement the LSH algorithm for approximate nearest neighbor search\n"," Examine the accuracy for different documents by comparing against brute force search, and also contrast runtimes\n"," Explore the role of the algorithm’s tuning parameters in the accuracy of the method"]},{"cell_type":"code","metadata":{"id":"vymX197B6WOp"},"source":["pip install turicreate"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"R0jeFTYV6v9m"},"source":["from __future__ import print_function # to conform python 2.x print to python 3.x\n","import numpy as np\n","import turicreate\n","from scipy.sparse import csr_matrix\n","from sklearn.metrics.pairwise import pairwise_distances\n","import time\n","from copy import copy\n","import matplotlib.pyplot as plt\n","%matplotlib inline\n","\n","'''compute norm of a sparse vector\n"," Thanks to: Jaiyam Sharma'''\n","def norm(x):\n"," sum_sq=x.dot(x.T)\n"," norm=np.sqrt(sum_sq)\n"," return(norm)"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"vDSPBIbF73tq"},"source":["\n","\n","Load in the Wikipedia dataset"]},{"cell_type":"code","metadata":{"id":"wKVOH1VG74Ty"},"source":["wiki = turicreate.SFrame('/content/drive/My Drive/people_wiki.sframe/')"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"WdXbLae28DUg"},"source":["wiki = wiki.add_row_number()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"8KWt0Bll_JfI"},"source":["Extract TF-IDF matrix"]},{"cell_type":"code","metadata":{"id":"XCEXclSU_KFw"},"source":["wiki['tf_idf'] = turicreate.text_analytics.tf_idf(wiki['text'])\n","wiki.head()"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"QJMt5QaL_MEM"},"source":["For the remainder of the assignment, we will use sparse matrices. Sparse matrices are matrices) that have a small number of nonzero entries. A good data structure for sparse matrices would only store the nonzero entries to save space and speed up computation. SciPy provides a highly-optimized library for sparse matrices. 
```python
def sframe_to_scipy(x, column_name):
    '''
    Convert a dictionary column of an SFrame into a sparse matrix format where
    each (row_id, column_id, value) triple corresponds to the value of
    x[row_id][column_id], where column_id is a key in the dictionary.

    Example
    >>> sparse_matrix, map_key_to_index = sframe_to_scipy(sframe, column_name)
    '''
    # The chosen column must be dict type, representing sparse data.

    # Stack transforms x to have a row for each unique (row, key) pair.
    x = x.stack(column_name, ['feature', 'value'])

    # Map feature words to integers.
    unique_words = sorted(x['feature'].unique())
    mapping = {word: i for i, word in enumerate(unique_words)}
    x['feature_id'] = x['feature'].apply(lambda word: mapping[word])

    # Create numpy arrays that contain the data for the sparse matrix.
    row_id = np.array(x['id'])
    col_id = np.array(x['feature_id'])
    data = np.array(x['value'])

    num_rows = x['id'].max() + 1
    num_cols = x['feature_id'].max() + 1

    # Create a sparse matrix.
    mat = csr_matrix((data, (row_id, col_id)), shape=(num_rows, num_cols))
    return mat, mapping
```

```python
%%time
corpus, mapping = sframe_to_scipy(wiki, 'tf_idf')
```

```python
assert corpus.shape == (59071, 547979)
print('Check passed correctly!')
```

Train an LSH model

The idea behind LSH is to translate the sign of our TF-IDF scores into a binary index (1 or 0) by seeing whether each score falls above or below a randomly defined line. This link is helpful for understanding LSH and our code in more detail.

LSH performs an efficient neighbor search by randomly partitioning all reference data points into different bins. Today we will build a popular variant of LSH known as random binary projection, which approximates cosine distance. There are other variants we could use for other choices of distance metrics.

The first step is to generate a collection of random vectors from the standard Gaussian distribution.

```python
def generate_random_vectors(dim, n_vectors):
    return np.random.randn(dim, n_vectors)
```

```python
generate_random_vectors(n_vectors=3, dim=5)
```
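To make the "above or below a random line" intuition concrete, here is a minimal 2-D sketch (the points and seed are made up for illustration): points whose dot product with a random direction is nonnegative get bit 1, the rest get bit 0.

```python
import numpy as np

np.random.seed(1)
points = np.array([[ 1.0,  2.0],
                   [-3.0,  0.5],
                   [ 0.2, -0.7]])
direction = np.random.randn(2)  # normal vector of a random line through the origin

# The sign of the dot product says which side of the line each point falls on.
bits = (points.dot(direction) >= 0).astype(int)
print(bits)  # [1 0 1] for this seed: points 0 and 2 hash to the same bit
```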
We now generate random vectors of the same dimensionality as our vocabulary size (547979). Each vector can be used to compute one bit in the bin encoding. We generate 16 vectors, leading to a 16-bit encoding of the bin index for each document.

```python
# Generate 16 random vectors of dimension 547979
np.random.seed(0)
n_vectors = 16
random_vectors = generate_random_vectors(corpus.shape[1], n_vectors)
random_vectors.shape
```

Next, we partition data points into bins. Instead of using explicit loops, we'd like to utilize matrix operations for greater efficiency. Let's walk through the construction step by step.

We'd like to decide which bin document 0 should go in. Since 16 random vectors were generated in the previous cell, we have 16 bits to represent the bin index. The first bit is given by the sign of the dot product between the first random vector and the document's TF-IDF vector.

```python
sample = corpus[0]  # vector of tf-idf values for document 0
bin_indices_bits = sample.dot(random_vectors[:, 0]) >= 0
bin_indices_bits
```

Similarly, the second bit is computed as the sign of the dot product between the second random vector and the document vector.

We can compute all of the bin index bits at once as follows. Note the absence of an explicit for loop over the 16 vectors: matrix operations let us batch the dot-product computation in a highly efficient manner. Given the relative inefficiency of loops in Python, the advantage of matrix operations is even greater.

```python
sample.dot(random_vectors) >= 0  # should return an array of 16 True/False bits
```

All documents that obtain exactly this bit vector will be assigned to the same bin. We'd like to repeat the identical operation on all documents in the Wikipedia dataset and compute the corresponding bin indices. Again, we use matrix operations so that no explicit loop is needed.

```python
np.array(sample.dot(random_vectors) >= 0, dtype=int)  # display index bits as 0/1's
```

```python
corpus.dot(random_vectors) >= 0  # compute bit indices of ALL documents
```

We're almost done!
To make it convenient to refer to individual bins, we convert each binary bin index into a single integer:

```
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] => 0
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1] => 1
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0] => 2
[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1] => 3
...
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0] => 65532
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1] => 65533
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0] => 65534
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] => 65535 (= 2^16 - 1)
```

By the rules of binary number representation, we just need to compute the dot product between the document's index bits and the vector consisting of powers of 2:

```python
index_bits = (sample.dot(random_vectors) >= 0)
powers_of_two = (1 << np.arange(15, -1, -1))
print(index_bits)
print(powers_of_two)
print(index_bits.dot(powers_of_two))
```

```python
index_bits = corpus.dot(random_vectors) >= 0
index_bits.dot(powers_of_two)
```

This array gives us the integer index of the bins for all documents.

Now we are ready to complete the following function. Given the integer bin indices for the documents, you should compile a list of document IDs that belong to each bin. Since a list is to be maintained for each unique bin index, a dictionary of lists is used:

1. Compute the integer bin indices. This step is already completed.
2. For each document in the dataset:
   - Get the integer bin index for the document.
   - Fetch the list of document ids associated with the bin; if no list yet exists for this bin, assign the bin an empty list.
   - Add the document id to the end of the list.

```python
from collections import defaultdict

def train_lsh(data, n_vectors, seed=None):
    if seed is not None:
        np.random.seed(seed)

    dim = data.shape[1]
    random_vectors = generate_random_vectors(dim, n_vectors)

    # Partition data points into bins,
    # and encode the bin index bits as integers.
    bin_indices_bits = data.dot(random_vectors) >= 0
    powers_of_two = 1 << np.arange(n_vectors - 1, -1, step=-1)
    bin_indices = bin_indices_bits.dot(powers_of_two)

    # Update `table` so that `table[i]` is the list of document ids with bin index equal to i.
    table = defaultdict(list)
    for idx, bin_index in enumerate(bin_indices):
        # defaultdict(list) creates an empty list on first access,
        # so the document id can be appended directly.
        table[bin_index].append(idx)

    # We store bin_indices and bin_indices_bits here so we can do some
    # ad-hoc checking with them; this isn't actually required.
    model = {'data': data,
             'table': table,
             'random_vectors': random_vectors,
             'bin_indices': bin_indices,
             'bin_indices_bits': bin_indices_bits}
    return model
```
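As a quick sanity check, here is a sketch that runs train_lsh on a tiny, made-up dense matrix (not part of the assignment data) and confirms that every row id lands in exactly one bin:

```python
np.random.seed(42)
toy_data = np.random.randn(10, 5)   # 10 toy "documents", 5 features
toy_model = train_lsh(toy_data, n_vectors=3, seed=42)

# With 3 bits there are at most 2**3 = 8 bins, and each row id appears exactly once.
all_ids = sorted(i for ids in toy_model['table'].values() for i in ids)
assert all_ids == list(range(10))
print(dict(toy_model['table']))
```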
bits2)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"TUTqlIkNZdFt"},"source":["model = train_lsh(corpus, 16, seed=475)\n","obama_id = wiki[wiki['name'] == 'Barack Obama']['id'][0]\n","biden_id = wiki[wiki['name'] == 'Joe Biden']['id'][0]\n","similariy = compare_bits(model, obama_id, biden_id)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"zJ8TG2oMZ7BC"},"source":[""],"execution_count":null,"outputs":[]}]}

Implementing binary decision trees.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Implementing logistic Regression from Scratch.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

K-Nearest Neighbour for predicting house prices.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Logistic Regression with L2 regularization.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Logistic Regression_Linear Classifier.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Mercari Price Suggestion Lightgbm.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Multiple Regression Assignment 1.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

Multiple Regression Assignment 2.ipynb

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments
