2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,3 +1,3 @@
[submodule "blocking/sentence-transformers"]
path = blocking/sentence-transformers
url = https://github.com/UKPLab/sentence-transformers
url = https://github.com/UKPLab/sentence-transformers.git
29 changes: 29 additions & 0 deletions DeepBlocker_v1/LICENSE
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2021, Saravanan
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
75 changes: 75 additions & 0 deletions DeepBlocker_v1/README.md
@@ -0,0 +1,75 @@
# DeepBlocker

DeepBlocker is a Python package for blocking in entity matching using deep learning. It provides functionality for transforming tuples into embeddings customized for blocking; given these tuple embeddings, it also provides utilities to retrieve similar tuples and construct the candidate set efficiently. DeepBlocker is self-supervised and does not require any labeled data. It offers multiple instantiations of the tuple embedding and vector pairing components, and it is modular and easily customizable: each subcomponent follows a pre-defined, intuitive API, so components can be altered or swapped out to build bespoke implementations.
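
As a rough sketch of that API (a hypothetical outline only; the method names below are taken from how `deep_blocker.py` in this PR invokes the two components, while the concrete classes live in `tuple_embedding_models.py` and `vector_pairing_models.py`):

```python
class TupleEmbedding:
    #Prepare/fit the embedding model on the merged text of both tables
    def preprocess(self, all_merged_text): ...

    #Map a series of merged-text tuples to a 2D array of tuple embeddings
    def get_tuple_embedding(self, merged_text): ...


class VectorPairing:
    #Index the tuple embeddings of the right table
    def index(self, right_tuple_embeddings): ...

    #For each left tuple, return the ids of its candidate right tuples
    def query(self, left_tuple_embeddings): ...
```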

# Paper and Data

For details on the architecture of the models used, take a look at our paper
[Deep Learning for Blocking in Entity Matching: A Design Space Exploration (VLDB '21)](http://vldb.org/pvldb/vol14/p2459-thirumuruganathan.pdf).

All public datasets used in the paper can be downloaded from the [datasets page](https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md).

We used fastText word embeddings that are [pre-trained on Wikipedia](https://fasttext.cc/docs/en/pretrained-vectors.html).


# Quick Start: DeepBlocker in 30 seconds

There are four main steps in using DeepBlocker:

1. Load the relevant libraries

```python
import pandas as pd
from deep_blocker import DeepBlocker
from tuple_embedding_models import AutoEncoderTupleEmbedding, CTTTupleEmbedding, HybridTupleEmbedding
from vector_pairing_models import ExactTopKVectorPairing
import blocking_utils
```

2. Data processing: Load the relevant datasets for blocking.

```python
left_df = pd.read_csv("left_table_csv_file_name")
right_df = pd.read_csv("right_table_csv_file_name")
```
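
Both tables are expected to contain an `id` column; as `validate_columns` in `deep_blocker.py` below shows, it is appended to `cols_to_block` automatically if you omit it.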

3. Instantiate DeepBlocker with the appropriate tuple embedding and vector pairing models.

```python
tuple_embedding_model = AutoEncoderTupleEmbedding()
topK_vector_pairing_model = ExactTopKVectorPairing(K=50)
db = DeepBlocker(tuple_embedding_model, topK_vector_pairing_model)
```
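
Here `K=50` means each left tuple is paired with its 50 most similar right tuples, so the resulting candidate set contains |left table| × 50 pairs.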

4. Train the models, perform blocking on the tables, and report the blocking statistics. You must first specify the columns to block on.

```python
cols_to_block = ["title"]  # hypothetical example; use column names from your own tables
candidate_set_df = db.block_datasets(left_df, right_df, cols_to_block)
golden_df = pd.read_csv("matches_csv_file_name")
print(blocking_utils.compute_blocking_statistics(candidate_set_df, golden_df, left_df, right_df))
```
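
As defined in `blocking_utils.py` below, the reported statistics are the recall of the candidate set against the golden matches and `cssr`, the candidate set size as a fraction of the full cross product.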


# Installation

DeepBlocker relies on a set of external libraries listed in [requirements.txt](requirements.txt).
You can install them with:
```
pip install -r requirements.txt
```

# Tutorials

We provide a [sample script](main.py) illustrating how DeepBlocker works for three major tuple embedding models -- AutoEncoder, CTT and Hybrid.




# Support

Please contact Saravanan Thirumuruganathan for any questions about the code.

# The Team

DeepBlocker was developed by QCRI and University of Wisconsin-Madison.
For the entire list of contributors, please refer to the [DeepBlocker paper](http://vldb.org/pvldb/vol14/p2459-thirumuruganathan.pdf).
53 changes: 53 additions & 0 deletions DeepBlocker_v1/blocking_utils.py
@@ -0,0 +1,53 @@
import pandas as pd

def topK_neighbors_to_candidate_set(topK_neighbors):
    #We create a data frame corresponding to topK neighbors.
    # We are given a 2D matrix of the form 1: [a1, a2, a3], 2: [b1, b2, b3]
    # where a1, a2, a3 are the top-3 neighbors for tuple 1 and so on.
    # We will now create a two-column DF of the form (1, a1), (1, a2), (1, a3), (2, b1), (2, b2), (2, b3)
    topK_df = pd.DataFrame(topK_neighbors)
    topK_df["ltable_id"] = topK_df.index
    melted_df = pd.melt(topK_df, id_vars=["ltable_id"])
    melted_df["rtable_id"] = melted_df["value"]
    candidate_set_df = melted_df[["ltable_id", "rtable_id"]]
    return candidate_set_df
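
# Illustrative example (hypothetical values): topK_neighbors = [[7, 3], [2, 9]]
# yields the candidate pairs (0, 7), (1, 2), (0, 3), (1, 9) -- one row per
# (left tuple, top-K neighbor) combination, i.e. n_left * K rows in total.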


#This accepts four inputs:
# data frames for candidate set and ground truth matches
# left and right data frames
def compute_blocking_statistics(candidate_set_df, golden_df, left_df, right_df):
    #Now we have two data frames, each with the two columns ltable_id and rtable_id.
    # If we do an equi-join of these two data frames, we will get the matches that were in the top-K
    merged_df = pd.merge(candidate_set_df, golden_df, on=['ltable_id', 'rtable_id'])

    left_num_tuples = len(left_df)
    right_num_tuples = len(right_df)
    statistics_dict = {
        "left_num_tuples": left_num_tuples,
        "right_num_tuples": right_num_tuples,
        "recall": len(merged_df) / len(golden_df),
        "cssr": len(candidate_set_df) / (left_num_tuples * right_num_tuples)
    }

    return statistics_dict
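
# Recall here is the fraction of golden matches retained in the candidate set;
# cssr (candidate set size ratio) is |candidate set| / (|left| * |right|).
# Hypothetical example: retaining 95 of 100 golden matches over 1,000 x 1,000
# tables with K = 50 gives recall = 0.95 and cssr = 50,000 / 1,000,000 = 0.05.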


#This function is useful when you download the preprocessed data from the DeepMatcher datasets
# and want to convert it to the matches format.
#It loads the train/valid/test files, keeps only the matching pairs (label == 1),
# and saves them to a new file called matches.csv
def process_files(folder_root):
    df1 = pd.read_csv(folder_root + "/train.csv")
    df2 = pd.read_csv(folder_root + "/valid.csv")
    df3 = pd.read_csv(folder_root + "/test.csv")

    df1 = df1[df1["label"] == 1]
    df2 = df2[df2["label"] == 1]
    df3 = df3[df3["label"] == 1]

    df = pd.concat([df1, df2, df3], ignore_index=True)

    df[["ltable_id", "rtable_id"]].to_csv(folder_root + "/matches.csv", header=True, index=False)
9 changes: 9 additions & 0 deletions DeepBlocker_v1/configurations.py
@@ -0,0 +1,9 @@
#Path to the pre-trained fastText model
FASTTEXT_EMBEDDIG_PATH = "embedding/wiki.en.bin"
#Dimension of the word embeddings
EMB_DIMENSION_SIZE = 300
#Embedding size of the AutoEncoder embedding
AE_EMB_DIMENSION_SIZE = 150
#Training hyperparameters
NUM_EPOCHS = 50
BATCH_SIZE = 32
RANDOM_SEED = 1234
LEARNING_RATE = 1e-3
27 changes: 27 additions & 0 deletions DeepBlocker_v1/create_matches_csv.py
@@ -0,0 +1,27 @@
import os

import pandas as pd

data_root = os.getcwd() + '/data/er_magellan/Dirty/'

for folder in os.listdir(data_root):
    subdata_path = data_root + folder

    df1 = pd.read_csv(os.path.join(subdata_path, 'train.csv'))
    df2 = pd.read_csv(os.path.join(subdata_path, 'valid.csv'))
    df3 = pd.read_csv(os.path.join(subdata_path, 'test.csv'))

    # Keep only the matching pairs (label == 1)
    df1 = df1[df1['label'] == 1]
    df2 = df2[df2['label'] == 1]
    df3 = df3[df3['label'] == 1]

    # Concatenate all dataframes
    combined_df = pd.concat([df1, df2, df3], ignore_index=True)

    # Keep only 'ltable_id' and 'rtable_id' columns
    combined_df = combined_df[['ltable_id', 'rtable_id']]

    # Write the combined dataframe to a new CSV file
    combined_df.to_csv(subdata_path + '/matches.csv', index=False)
87 changes: 87 additions & 0 deletions DeepBlocker_v1/deep_blocker.py
@@ -0,0 +1,87 @@
#GiG
import time

import pandas as pd

import blocking_utils

class DeepBlocker:
    def __init__(self, tuple_embedding_model, vector_pairing_model):
        self.tuple_embedding_model = tuple_embedding_model
        self.vector_pairing_model = vector_pairing_model

    def validate_columns(self):
        #Assumption: the id column is named "id"
        if "id" not in self.cols_to_block:
            self.cols_to_block.append("id")
        self.cols_to_block_without_id = [col for col in self.cols_to_block if col != "id"]

        #Check if all required columns are in left_df
        check = all([col in self.left_df.columns for col in self.cols_to_block])
        if not check:
            raise Exception("Not all columns in cols_to_block are present in the left dataset")

        #Check if all required columns are in right_df
        check = all([col in self.right_df.columns for col in self.cols_to_block])
        if not check:
            raise Exception("Not all columns in cols_to_block are present in the right dataset")

    def preprocess_datasets(self):
        #Work on copies so the caller's data frames are not modified in place
        self.left_df = self.left_df[self.cols_to_block].copy()
        self.right_df = self.right_df[self.cols_to_block].copy()

        self.left_df.fillna(' ', inplace=True)
        self.right_df.fillna(' ', inplace=True)

        self.left_df = self.left_df.astype(str)
        self.right_df = self.right_df.astype(str)

        #Concatenate all non-id columns into a single text attribute per tuple
        self.left_df["_merged_text"] = self.left_df[self.cols_to_block_without_id].agg(' '.join, axis=1)
        self.right_df["_merged_text"] = self.right_df[self.cols_to_block_without_id].agg(' '.join, axis=1)

        #Drop the other columns
        self.left_df = self.left_df.drop(columns=self.cols_to_block_without_id)
        self.right_df = self.right_df.drop(columns=self.cols_to_block_without_id)

    def block_datasets(self, left_df, right_df, cols_to_block):
        self.left_df = left_df
        self.right_df = right_df
        self.cols_to_block = cols_to_block

        self.validate_columns()
        self.preprocess_datasets()

        print("Performing pre-processing for tuple embeddings")
        all_merged_text = pd.concat([self.left_df["_merged_text"], self.right_df["_merged_text"]], ignore_index=True)
        self.tuple_embedding_model.preprocess(all_merged_text)

        start_time = time.time()

        print("Obtaining tuple embeddings for left table")
        self.left_tuple_embeddings = self.tuple_embedding_model.get_tuple_embedding(self.left_df["_merged_text"])
        print("Obtaining tuple embeddings for right table")
        self.right_tuple_embeddings = self.tuple_embedding_model.get_tuple_embedding(self.right_df["_merged_text"])

        print("Indexing the embeddings from the right dataset")
        self.vector_pairing_model.index(self.right_tuple_embeddings)

        print("Querying the embeddings from the left dataset")
        topK_neighbors = self.vector_pairing_model.query(self.left_tuple_embeddings)

        self.candidate_set_df = blocking_utils.topK_neighbors_to_candidate_set(topK_neighbors)

        end_time = time.time()
        print("Time elapsed in seconds:", end_time - start_time)

        return self.candidate_set_df
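
For reference, `block_datasets` above only assumes that the vector pairing model exposes `index()` and `query()`. A minimal, hypothetical stand-in for `ExactTopKVectorPairing` (whose real implementation lives in `vector_pairing_models.py`, not shown in this diff) could look like:

```python
import numpy as np

class SimpleExactTopKPairing:
    #Hypothetical brute-force stand-in: exact top-K search by cosine similarity
    def __init__(self, K=50):
        self.K = K

    def index(self, right_tuple_embeddings):
        emb = np.asarray(right_tuple_embeddings, dtype=np.float32)
        #Normalize rows so that dot products equal cosine similarities
        self.right = emb / np.linalg.norm(emb, axis=1, keepdims=True)

    def query(self, left_tuple_embeddings):
        emb = np.asarray(left_tuple_embeddings, dtype=np.float32)
        left = emb / np.linalg.norm(emb, axis=1, keepdims=True)
        sims = left @ self.right.T  # (n_left, n_right) similarity matrix
        #Ids of the K most similar right tuples for each left tuple
        return np.argsort(-sims, axis=1)[:, :self.K]
```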