3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
*__pycache__
.idea
checkpoints/
34 changes: 34 additions & 0 deletions README.md
@@ -1,3 +1,37 @@
# Ditto Upgrades

## Background

When I tried experimenting with the original version of the Ditto repo, I hit a few blockers:
- Building Apex requires the Microsoft Visual Studio 2019 build tools, which are no longer available as a free download, so I could not enable automatic mixed precision for training
- The selection of pre-trained language models that can be fine-tuned is limited; for example, newer models such as `microsoft/deberta-v3-small` could not be loaded
- The dependencies are dated: newer PyTorch versions natively provide the `AdamW` optimizer used here as well as automatic mixed precision, so neither `apex` nor the `transformers` optimizer is needed anymore

## Main Changes

### Cross-Platform Compatibility Improvements
- Read and write data files with an explicit UTF-8 encoding
- Normalize path separators so the same paths work on Windows, Linux, and macOS (see the sketch below)
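
The pattern applied across the data-loading code looks roughly like this (the helper name is illustrative, not something in the repo):

```python
import os


def read_tsv(path):
    """Yield tab-separated rows from a data file (illustrative helper)."""
    # Normalize separators so the same configured paths work on Windows, Linux, and macOS
    path = os.path.normpath(path)
    # Open with an explicit UTF-8 encoding instead of the platform default
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            yield line.rstrip('\n').split('\t')
```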

### Modernized Dependencies to Use Native PyTorch Features
- Import `AdamW` from `torch.optim` instead of `transformers`
- Use automatic mixed precision from `torch.amp` instead of `apex`, with gradient scaling enabled by default during training whenever AMP is active (sketched below)
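
A condensed sketch of the modernized training step, using only native PyTorch; the function name, learning rate, and defaults are illustrative rather than the repo's exact code:

```python
import torch
from torch.optim import AdamW               # previously imported from transformers
from torch.amp import autocast, GradScaler  # previously provided by apex


def fit(model, train_iter, device='cuda', lr=3e-5, use_amp=True):
    """Minimal training loop with native PyTorch AMP (illustrative)."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=lr)
    # Gradient scaling is used by default whenever AMP is active on a GPU
    scaler = GradScaler() if use_amp and device == 'cuda' else None

    for x, y in train_iter:
        optimizer.zero_grad()
        # The forward pass runs in float16 when AMP is enabled
        with autocast(device_type=device, dtype=torch.float16, enabled=scaler is not None):
            loss = criterion(model(x.to(device)), y.to(device))
        if scaler is not None:
            scaler.scale(loss).backward()  # scale the loss to avoid float16 gradient underflow
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
```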

### Enhanced Mixed Precision Support
- Add an option to use AMP during model evaluation for faster inference on modern GPUs
- Update the training loop so that AMP is used in the evaluation step whenever it is enabled for training (see the sketch below)
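
At evaluation time only the forward pass needs to run under `autocast`; a simplified sketch of the idea (the real `evaluate` also computes F1 scores and a decision threshold):

```python
import torch
from torch.amp import autocast


def predict_probs(model, iterator, device='cuda', use_amp=True):
    """Run inference with optional mixed precision (illustrative helper)."""
    model.eval()
    all_probs = []
    with torch.no_grad():
        for x, _ in iterator:
            # Only the forward pass runs in float16; no gradient scaling is needed for inference
            with autocast(device_type=device, dtype=torch.float16, enabled=use_amp):
                logits = model(x.to(device))
            all_probs += logits.softmax(dim=1)[:, 1].cpu().tolist()
    return all_probs
```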

### Command-Line Interface Improvements
- Rename the command-line argument `fp16` to `amp` for clarity
- Add an explicit command-line argument that controls whether training runs on a GPU (see the sketch below)
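
Declared with `argparse`, the new flags would look roughly like this; the exact defaults and help strings here are assumptions:

```python
import argparse

parser = argparse.ArgumentParser()
# --amp replaces the old --fp16 flag; both are simple on/off switches
parser.add_argument("--amp", action="store_true",
                    help="enable automatic mixed precision during training and evaluation")
# explicit GPU switch instead of silently relying on torch.cuda.is_available()
parser.add_argument("--use_gpu", action="store_true",
                    help="train on a CUDA GPU when one is available")
hp = parser.parse_args()
```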

### Environment and Dependencies
- Upgrade to Python 3.12.11
- Add `updated_requirements.txt` with newer library versions that support the native PyTorch features above

### Project Maintenance
- Add a `.gitignore` file

# Ditto: Deep Entity Matching with Pre-Trained Language Models

*Update: a new light-weight version based on new versions of Transformers*
2 changes: 1 addition & 1 deletion ditto_light/dataset.py
@@ -34,7 +34,7 @@ def __init__(self,
if isinstance(path, list):
lines = path
else:
lines = open(path)
lines = open(path, encoding='utf-8')

for line in lines:
s1, s2, label = line.strip().split('\t')
79 changes: 54 additions & 25 deletions ditto_light/ditto.py
@@ -8,12 +8,15 @@
import numpy as np
import sklearn.metrics as metrics
import argparse

import logging
from .dataset import DittoDataset
from torch.utils import data
from transformers import AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.optim import AdamW
from transformers import AutoModel, get_linear_schedule_with_warmup
from tensorboardX import SummaryWriter
from apex import amp
from torch.amp import autocast, GradScaler

logging.basicConfig(level=logging.INFO)

lm_mp = {'roberta': 'roberta-base',
'distilbert': 'distilbert-base-uncased'}
@@ -63,13 +66,14 @@ def forward(self, x1, x2=None):
return self.fc(enc) # .squeeze() # .sigmoid()


def evaluate(model, iterator, threshold=None):
def evaluate(model, iterator, threshold=None, use_amp=False):
"""Evaluate a model on a validation/test dataset

Args:
model (DMModel): the EM model
iterator (Iterator): the valid/test dataset iterator
threshold (float, optional): the threshold on the 0-class
use_amp (bool, optional): whether to use automatic mixed precision

Returns:
float: the F1 score
@@ -79,10 +83,13 @@ def evaluate(model, iterator, threshold=None):
all_p = []
all_y = []
all_probs = []
device_type = 'cuda' if model.device == 'cuda' else 'cpu'
with torch.no_grad():
for batch in iterator:
x, y = batch
logits = model(x)
ctx = autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp)
with ctx:
logits = model(x)
probs = logits.softmax(dim=1)[:, 1]
all_probs += probs.cpu().numpy().tolist()
all_y += y.cpu().numpy().tolist()
@@ -105,40 +112,51 @@
return f1, best_th


def train_step(train_iter, model, optimizer, scheduler, hp):
def train_step(train_iter, model, optimizer, scheduler, scaler=None):
"""Perform a single training step

Args:
train_iter (Iterator): the train data loader
model (DMModel): the model
optimizer (Optimizer): the optimizer (Adam or AdamW)
scheduler (LRScheduler): learning rate scheduler
hp (Namespace): other hyper-parameters (e.g., fp16)
scaler (GradScaler): the gradient scaler

Returns:
None
"""
criterion = nn.CrossEntropyLoss()
use_autocast = scaler is not None

# criterion = nn.MSELoss()
for i, batch in enumerate(train_iter):
optimizer.zero_grad()

if len(batch) == 2:
x, y = batch
prediction = model(x)
ctx = autocast(device_type=model.device, dtype=torch.float16, enabled=use_autocast)
with ctx:
prediction = model(x)
loss = criterion(prediction, y.to(model.device))

else:
x1, x2, y = batch
prediction = model(x1, x2)

loss = criterion(prediction, y.to(model.device))
ctx = autocast(device_type=model.device, dtype=torch.float16, enabled=use_autocast)
with ctx:
prediction = model(x1, x2)
loss = criterion(prediction, y.to(model.device))

if use_autocast:
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()

if hp.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
optimizer.step()
optimizer.step()

scheduler.step()

if i % 10 == 0: # monitoring
print(f"step: {i}, loss: {loss.item()}")
del loss
Expand All @@ -153,7 +171,7 @@ def train(trainset, validset, testset, run_tag, hp):
testset (DittoDataset): the test set
run_tag (str): the tag of the run
hp (Namespace): Hyper-parameters (e.g., batch_size,
learning rate, fp16)
learning rate, amp)

Returns:
None
@@ -177,15 +195,25 @@
collate_fn=padder)

# initialize model, optimizer, and LR scheduler
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# if getattr(hp, 'use_gpu', False):
if hp.use_gpu:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
device = 'cpu'

model = DittoModel(device=device,
lm=hp.lm,
alpha_aug=hp.alpha_aug)
model = model.cuda()
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=hp.lr)


scaler = None

if hp.amp and device == 'cuda':
scaler = GradScaler()


if hp.fp16:
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
num_steps = (len(trainset) // hp.batch_size) * hp.n_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=0,
@@ -198,24 +226,25 @@
for epoch in range(1, hp.n_epochs+1):
# train
model.train()
train_step(train_iter, model, optimizer, scheduler, hp)
train_step(train_iter, model, optimizer, scheduler, scaler=scaler)

# eval
model.eval()
dev_f1, th = evaluate(model, valid_iter)
test_f1 = evaluate(model, test_iter, threshold=th)
use_amp = hp.amp and device == 'cuda'
dev_f1, th = evaluate(model, valid_iter, use_amp=use_amp)
test_f1 = evaluate(model, test_iter, threshold=th, use_amp=use_amp)

if dev_f1 > best_dev_f1:
best_dev_f1 = dev_f1
best_test_f1 = test_f1
if hp.save_model:
# create the directory if not exist
directory = os.path.join(hp.logdir, hp.task)
directory = os.path.normpath(os.path.join(hp.logdir, hp.task))
if not os.path.exists(directory):
os.makedirs(directory)

# save the checkpoints for each component
ckpt_path = os.path.join(hp.logdir, hp.task, 'model.pt')
ckpt_path = os.path.normpath(os.path.join(hp.logdir, hp.task, f'model.pt'))
ckpt = {'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'scheduler': scheduler.state_dict(),
7 changes: 5 additions & 2 deletions ditto_light/knowledge.py
@@ -37,11 +37,14 @@ def transform_file(self, input_fn, overwrite=False):
str: the output file name
"""
out_fn = input_fn + '.dk'
# Normalize path separators for cross-platform compatibility
out_fn = os.path.normpath(out_fn)
input_fn = os.path.normpath(input_fn)
if not os.path.exists(out_fn) or \
os.stat(out_fn).st_size == 0 or overwrite:

with open(out_fn, 'w') as fout:
for line in open(input_fn):
with open(out_fn, 'w', encoding='utf-8') as fout:
for line in open(input_fn, encoding='utf-8'):
LL = line.split('\t')
if len(LL) == 3:
entry0 = self.transform(LL[0])
10 changes: 7 additions & 3 deletions ditto_light/summarize.py
@@ -40,7 +40,8 @@ def build_index(self):
self.config['testset']]
content = []
for fn in fns:
with open(fn) as fin:
fn = os.path.normpath(fn) # Normalize path separators
with open(fn, encoding='utf-8') as fin:
for line in fin:
LL = line.split('\t')
if len(LL) > 2:
@@ -127,9 +128,12 @@ def transform_file(self, input_fn, max_len=256, overwrite=False):
str: the output file name
"""
out_fn = input_fn + '.su'
# Normalize path separators for cross-platform compatibility
out_fn = os.path.normpath(out_fn)
input_fn = os.path.normpath(input_fn)
if not os.path.exists(out_fn) or \
os.stat(out_fn).st_size == 0 or overwrite:
with open(out_fn, 'w') as fout:
for line in open(input_fn):
with open(out_fn, 'w', encoding='utf-8') as fout:
for line in open(input_fn, encoding='utf-8'):
fout.write(self.transform(line, max_len=max_len))
return out_fn