3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
*__pycache__
.idea
checkpoints/
34 changes: 34 additions & 0 deletions README.md
@@ -1,3 +1,37 @@
# Ditto Upgrades

## Background

When I tried experimenting with the original version of the Ditto repo, I hit a few blockers:
- Building Apex requires the Microsoft Visual Studio 2019 build tools, which are no longer available as a free download, so I could not enable automatic mixed precision for training
- The selection of pre-trained language models that can be fine-tuned is limited; for example, newer models such as `microsoft/deberta-v3-small` could not be loaded
- The dependencies are dated: newer PyTorch versions natively provide the `AdamW` optimizer used here as well as automatic mixed precision, so neither `apex` nor the `transformers` optimizer is needed anymore

## Main Changes

### Cross-Platform Compatibility Improvements
- Read and write data files with an explicit UTF-8 encoding
- Normalize path separators so the same paths work on Windows, Linux, and macOS (see the sketch below)
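
The pattern applied across the data-loading code looks roughly like this (the helper name is illustrative, not something in the repo):

```python
import os


def read_tsv(path):
    """Yield tab-separated rows from a data file (illustrative helper)."""
    # Normalize separators so the same configured paths work on Windows, Linux, and macOS
    path = os.path.normpath(path)
    # Open with an explicit UTF-8 encoding instead of the platform default
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            yield line.rstrip('\n').split('\t')
```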

### Modernized Dependencies to Use Native PyTorch Features
- Import `AdamW` from `torch.optim` instead of `transformers`
- Use automatic mixed precision from `torch.amp` instead of `apex`, with gradient scaling enabled by default during training whenever AMP is active (sketched below)
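
A condensed sketch of the modernized training step, using only native PyTorch; the function name, learning rate, and defaults are illustrative rather than the repo's exact code:

```python
import torch
from torch.optim import AdamW               # previously imported from transformers
from torch.amp import autocast, GradScaler  # previously provided by apex


def fit(model, train_iter, device='cuda', lr=3e-5, use_amp=True):
    """Minimal training loop with native PyTorch AMP (illustrative)."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=lr)
    # Gradient scaling is used by default whenever AMP is active on a GPU
    scaler = GradScaler() if use_amp and device == 'cuda' else None

    for x, y in train_iter:
        optimizer.zero_grad()
        # The forward pass runs in float16 when AMP is enabled
        with autocast(device_type=device, dtype=torch.float16, enabled=scaler is not None):
            loss = criterion(model(x.to(device)), y.to(device))
        if scaler is not None:
            scaler.scale(loss).backward()  # scale the loss to avoid float16 gradient underflow
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
```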

### Enhanced Mixed Precision Support
- Add an option to use AMP during model evaluation for faster inference on modern GPUs
- Update the training loop so that AMP is used in the evaluation step whenever it is enabled for training (see the sketch below)
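
At evaluation time only the forward pass needs to run under `autocast`; a simplified sketch of the idea (the real `evaluate` also computes F1 scores and a decision threshold):

```python
import torch
from torch.amp import autocast


def predict_probs(model, iterator, device='cuda', use_amp=True):
    """Run inference with optional mixed precision (illustrative helper)."""
    model.eval()
    all_probs = []
    with torch.no_grad():
        for x, _ in iterator:
            # Only the forward pass runs in float16; no gradient scaling is needed for inference
            with autocast(device_type=device, dtype=torch.float16, enabled=use_amp):
                logits = model(x.to(device))
            all_probs += logits.softmax(dim=1)[:, 1].cpu().tolist()
    return all_probs
```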

### Command-Line Interface Improvements
- Rename the command-line argument `fp16` to `amp` for clarity
- Add an explicit command-line argument that controls whether training runs on a GPU (see the sketch below)
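
Declared with `argparse`, the new flags would look roughly like this; the exact defaults and help strings here are assumptions:

```python
import argparse

parser = argparse.ArgumentParser()
# --amp replaces the old --fp16 flag; both are simple on/off switches
parser.add_argument("--amp", action="store_true",
                    help="enable automatic mixed precision during training and evaluation")
# explicit GPU switch instead of silently relying on torch.cuda.is_available()
parser.add_argument("--use_gpu", action="store_true",
                    help="train on a CUDA GPU when one is available")
hp = parser.parse_args()
```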

### Environment and Dependencies
- Upgrade to Python 3.12.11
- Add `updated_requirements.txt` with newer library versions that support the native PyTorch features above

### Project Maintenance
- Add a `.gitignore` file

# Ditto: Deep Entity Matching with Pre-Trained Language Models

*Update: a new light-weight version based on new versions of Transformers*
2 changes: 1 addition & 1 deletion ditto_light/dataset.py
@@ -34,7 +34,7 @@ def __init__(self,
if isinstance(path, list):
lines = path
else:
lines = open(path)
lines = open(path, encoding='utf-8')

for line in lines:
s1, s2, label = line.strip().split('\t')
79 changes: 54 additions & 25 deletions ditto_light/ditto.py
@@ -8,12 +8,15 @@
import numpy as np
import sklearn.metrics as metrics
import argparse

import logging
from .dataset import DittoDataset
from torch.utils import data
from transformers import AutoModel, AdamW, get_linear_schedule_with_warmup
from torch.optim import AdamW
from transformers import AutoModel, get_linear_schedule_with_warmup
from tensorboardX import SummaryWriter
from apex import amp
from torch.amp import autocast, GradScaler

logging.basicConfig(level=logging.INFO)

lm_mp = {'roberta': 'roberta-base',
'distilbert': 'distilbert-base-uncased'}
@@ -63,13 +66,14 @@ def forward(self, x1, x2=None):
return self.fc(enc) # .squeeze() # .sigmoid()


def evaluate(model, iterator, threshold=None):
def evaluate(model, iterator, threshold=None, use_amp=False):
"""Evaluate a model on a validation/test dataset

Args:
model (DMModel): the EM model
iterator (Iterator): the valid/test dataset iterator
threshold (float, optional): the threshold on the 0-class
use_amp (bool, optional): whether to use automatic mixed precision

Returns:
float: the F1 score
@@ -79,10 +83,13 @@ def evaluate(model, iterator, threshold=None):
all_p = []
all_y = []
all_probs = []
device_type = 'cuda' if model.device == 'cuda' else 'cpu'
with torch.no_grad():
for batch in iterator:
x, y = batch
logits = model(x)
ctx = autocast(device_type=device_type, dtype=torch.float16, enabled=use_amp)
with ctx:
logits = model(x)
probs = logits.softmax(dim=1)[:, 1]
all_probs += probs.cpu().numpy().tolist()
all_y += y.cpu().numpy().tolist()
@@ -105,40 +112,51 @@
return f1, best_th


def train_step(train_iter, model, optimizer, scheduler, hp):
def train_step(train_iter, model, optimizer, scheduler, scaler=None):
"""Perform a single training step

Args:
train_iter (Iterator): the train data loader
model (DMModel): the model
optimizer (Optimizer): the optimizer (Adam or AdamW)
scheduler (LRScheduler): learning rate scheduler
hp (Namespace): other hyper-parameters (e.g., fp16)
scaler (GradScaler): the gradient scaler

Returns:
None
"""
criterion = nn.CrossEntropyLoss()
use_autocast = scaler is not None

# criterion = nn.MSELoss()
for i, batch in enumerate(train_iter):
optimizer.zero_grad()

if len(batch) == 2:
x, y = batch
prediction = model(x)
ctx = autocast(device_type=model.device, dtype=torch.float16, enabled=use_autocast)
with ctx:
prediction = model(x)
loss = criterion(prediction, y.to(model.device))

else:
x1, x2, y = batch
prediction = model(x1, x2)

loss = criterion(prediction, y.to(model.device))
ctx = autocast(device_type=model.device, dtype=torch.float16, enabled=use_autocast)
with ctx:
prediction = model(x1, x2)
loss = criterion(prediction, y.to(model.device))

if use_autocast:
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()

if hp.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
optimizer.step()
optimizer.step()

scheduler.step()

if i % 10 == 0: # monitoring
print(f"step: {i}, loss: {loss.item()}")
del loss
Expand All @@ -153,7 +171,7 @@ def train(trainset, validset, testset, run_tag, hp):
testset (DittoDataset): the test set
run_tag (str): the tag of the run
hp (Namespace): Hyper-parameters (e.g., batch_size,
learning rate, fp16)
learning rate, amp)

Returns:
None
@@ -177,15 +195,25 @@
collate_fn=padder)

# initialize model, optimizer, and LR scheduler
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# if getattr(hp, 'use_gpu', False):
if hp.use_gpu:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
else:
device = 'cpu'

model = DittoModel(device=device,
lm=hp.lm,
alpha_aug=hp.alpha_aug)
model = model.cuda()
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=hp.lr)


scaler = None

if hp.amp and device == 'cuda':
scaler = GradScaler()


if hp.fp16:
model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
num_steps = (len(trainset) // hp.batch_size) * hp.n_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps=0,
@@ -198,24 +226,25 @@
for epoch in range(1, hp.n_epochs+1):
# train
model.train()
train_step(train_iter, model, optimizer, scheduler, hp)
train_step(train_iter, model, optimizer, scheduler, scaler=scaler)

# eval
model.eval()
dev_f1, th = evaluate(model, valid_iter)
test_f1 = evaluate(model, test_iter, threshold=th)
use_amp = hp.amp and device == 'cuda'
dev_f1, th = evaluate(model, valid_iter, use_amp=use_amp)
test_f1 = evaluate(model, test_iter, threshold=th, use_amp=use_amp)

if dev_f1 > best_dev_f1:
best_dev_f1 = dev_f1
best_test_f1 = test_f1
if hp.save_model:
# create the directory if not exist
directory = os.path.join(hp.logdir, hp.task)
directory = os.path.normpath(os.path.join(hp.logdir, hp.task))
if not os.path.exists(directory):
os.makedirs(directory)

# save the checkpoints for each component
ckpt_path = os.path.join(hp.logdir, hp.task, 'model.pt')
ckpt_path = os.path.normpath(os.path.join(hp.logdir, hp.task, f'model.pt'))
ckpt = {'model': model.state_dict(),
'optimizer': optimizer.state_dict(),
'scheduler': scheduler.state_dict(),
7 changes: 5 additions & 2 deletions ditto_light/knowledge.py
@@ -37,11 +37,14 @@ def transform_file(self, input_fn, overwrite=False):
str: the output file name
"""
out_fn = input_fn + '.dk'
# Normalize path separators for cross-platform compatibility
out_fn = os.path.normpath(out_fn)
input_fn = os.path.normpath(input_fn)
if not os.path.exists(out_fn) or \
os.stat(out_fn).st_size == 0 or overwrite:

with open(out_fn, 'w') as fout:
for line in open(input_fn):
with open(out_fn, 'w', encoding='utf-8') as fout:
for line in open(input_fn, encoding='utf-8'):
LL = line.split('\t')
if len(LL) == 3:
entry0 = self.transform(LL[0])
10 changes: 7 additions & 3 deletions ditto_light/summarize.py
@@ -40,7 +40,8 @@ def build_index(self):
self.config['testset']]
content = []
for fn in fns:
with open(fn) as fin:
fn = os.path.normpath(fn) # Normalize path separators
with open(fn, encoding='utf-8') as fin:
for line in fin:
LL = line.split('\t')
if len(LL) > 2:
@@ -127,9 +128,12 @@ def transform_file(self, input_fn, max_len=256, overwrite=False):
str: the output file name
"""
out_fn = input_fn + '.su'
# Normalize path separators for cross-platform compatibility
out_fn = os.path.normpath(out_fn)
input_fn = os.path.normpath(input_fn)
if not os.path.exists(out_fn) or \
os.stat(out_fn).st_size == 0 or overwrite:
with open(out_fn, 'w') as fout:
for line in open(input_fn):
with open(out_fn, 'w', encoding='utf-8') as fout:
for line in open(input_fn, encoding='utf-8'):
fout.write(self.transform(line, max_len=max_len))
return out_fn