Commit d161cc3

first commit
0 parents  commit d161cc3

File tree

7 files changed (+329, -0 lines)


.DS_Store

6 KB
Binary file not shown.

README.md

Whitespace-only changes.

data.py

+44
from itertools import chain

import numpy as np


def tokenize(example, tokenizer, label2id, max_length):
    # rebuild text from tokens and build a character-level label array
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    # map each sub-word token back to a label via its character offsets
    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}
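
A minimal sketch of how this alignment could be exercised in isolation, assuming the albert-base-v2 tokenizer used in train.py; the toy document and reduced label set below are illustrative only:

from transformers import AutoTokenizer

from data import tokenize

# toy label set and document (assumptions for illustration, not from the dataset)
label2id = {"O": 0, "B-NAME_STUDENT": 1, "I-NAME_STUDENT": 2}
example = {
    "tokens": ["My", "name", "is", "Jane", "Doe", "."],
    "provided_labels": ["O", "O", "O", "B-NAME_STUDENT", "I-NAME_STUDENT", "O"],
    "trailing_whitespace": [True, True, True, True, False, False],
}

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
out = tokenize(example, tokenizer, label2id, max_length=64)

# one label id per sub-word token, aligned through the character offsets
print(tokenizer.convert_ids_to_tokens(out["input_ids"]))
print(out["labels"])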

data/.gitkeep

Whitespace-only changes.

inference.py

+118
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from pathlib import Path

import pandas as pd
import numpy as np
import json

INFERENCE_MAX_LENGTH = 256
model_path = "output/albert-base-v2_128"


def tokenize(example, tokenizer):
    text = []
    token_map = []  # maps each character back to its original word index (-1 for inserted spaces)
    idx = 0

    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
        idx += 1

    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }


data = json.load(open("data/test.json"))

ds = Dataset.from_dict(
    {
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    }
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)


model = AutoModelForTokenClassification.from_pretrained(model_path)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
args = TrainingArguments(".", per_device_eval_batch_size=1, report_to="none")

trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

predictions = trainer.predict(ds).predictions
# softmax over the label dimension
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
    predictions.shape[0], predictions.shape[1], 1
)

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = predictions.argmax(-1)
# indices 0-11 are the entity labels; index 12 is "O" (labels are sorted alphabetically)
preds_without_O = pred_softmax[:, :, :12].argmax(-1)
O_preds = pred_softmax[:, :, 12]

# keep the best non-"O" label whenever the "O" probability falls below the threshold
threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O, preds)


triplets = []
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]
        if start_idx + end_idx == 0:
            # special token (offset (0, 0)), e.g. CLS
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])
            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)


df = pd.DataFrame(
    {"document": document, "token": token, "label": label, "token_str": token_str}
)
df["row_id"] = list(range(len(df)))
# display(df.head(100))


df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)
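
The thresholding step above is easier to see on made-up numbers. A small sketch with three labels instead of thirteen, where the last column plays the role of "O" (all values are illustrative):

import numpy as np

# toy softmax output: 1 document, 4 tokens, 3 labels; column 2 acts as "O"
pred_softmax = np.array([[
    [0.03, 0.02, 0.95],  # "O" prob >= 0.9 -> plain argmax keeps "O"
    [0.60, 0.10, 0.30],  # "O" prob < 0.9  -> best entity label (0)
    [0.05, 0.55, 0.40],  # "O" prob < 0.9  -> best entity label (1)
    [0.10, 0.05, 0.85],  # even a fairly confident "O" is overridden below 0.9
]])

preds = pred_softmax.argmax(-1)                       # argmax over all labels
preds_without_O = pred_softmax[:, :, :2].argmax(-1)   # argmax over entity labels only
O_preds = pred_softmax[:, :, 2]                       # probability of "O"

threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O, preds)
print(preds_final)  # [[2 0 1 0]]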

train.py

+142
from itertools import chain
from functools import partial

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

from data import tokenize
from utils import compute_metrics

import json
import numpy as np


TRAINING_MODEL_PATH = "albert/albert-base-v2"
TRAINING_MAX_LENGTH = 128
OUTPUT_DIR = "output"


data = json.load(open("data/train.json"))

# downsampling of negative examples
p = []  # positive samples (contain at least one non-"O" label)
n = []  # negative samples (only "O" labels; some may still contain entities that were wrongly labelled "O")
for d in data:
    if any(np.array(d["labels"]) != "O"):
        p.append(d)
    else:
        n.append(d)
print("original datapoints: ", len(data))

external = json.load(open("data/pii_dataset_fixed.json"))
print("external datapoints: ", len(external))

moredata = json.load(open("data/moredata_dataset_fixed.json"))
print("moredata datapoints: ", len(moredata))

# keep all positives and external data, but only a third of the negatives
data = moredata + external + p + n[: len(n) // 3]
print("combined: ", len(data))


all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}

# full set of PII entity labels (all labels except "O")
target = [
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
]
print(id2label)


tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

ds = Dataset.from_dict(
    {
        "full_text": [x["full_text"] for x in data],
        "document": [str(x["document"]) for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        "provided_labels": [x["labels"] for x in data],
    }
)
ds = ds.map(
    tokenize,
    fn_kwargs={
        "tokenizer": tokenizer,
        "label2id": label2id,
        "max_length": TRAINING_MAX_LENGTH,
    },
    num_proc=3,
)
# ds = ds.class_encode_column("group")


model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)


# split computed for inspection only; the full dataset is passed to the Trainer below
final_ds = ds.train_test_split(
    test_size=0.2, seed=42
)  # cannot use stratify_by_column='group'
print(final_ds)


args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="no",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type="cosine",
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)


trainer.train()


trainer.save_model("output/albert-base-v2_128")
tokenizer.save_pretrained("output/albert-base-v2_128")
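
A rough smoke test for the saved checkpoint might look like the sketch below; it is not part of the commit, and the sample sentence is an assumption:

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_dir = "output/albert-base-v2_128"  # checkpoint written by train.py
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)

text = "My name is Jane Doe and my email is jane.doe@example.com."  # made-up example
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

with torch.no_grad():
    logits = model(**enc).logits  # shape: (1, num_tokens, num_labels)

pred_ids = logits.argmax(-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
for tok, pid in zip(tokens, pred_ids):
    print(tok, model.config.id2label[pid])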

utils.py

+25
from seqeval.metrics import recall_score, precision_score

import numpy as np


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    # F-beta score with beta = 5 (recall weighted much more heavily than precision)
    f1_score = (1 + 5 * 5) * recall * precision / (5 * 5 * precision + recall)

    results = {"recall": recall, "precision": precision, "f1": f1_score}
    return results
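
A minimal check of compute_metrics with made-up logits and labels (the reduced label set is an illustrative assumption); the -100 position mimics a special token that gets masked out:

import numpy as np

from utils import compute_metrics

all_labels = ["B-NAME_STUDENT", "I-NAME_STUDENT", "O"]

# one sequence, four tokens, three labels; the first position simulates a special token
logits = np.array([[
    [0.1, 0.1, 0.8],  # ignored via the -100 label
    [2.0, 0.1, 0.1],  # predicted B-NAME_STUDENT
    [0.1, 2.0, 0.1],  # predicted I-NAME_STUDENT
    [0.1, 0.1, 2.0],  # predicted O
]])
labels = np.array([[-100, 0, 1, 2]])

# perfect predictions -> recall, precision and f1 all equal 1.0
print(compute_metrics((logits, labels), all_labels))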

0 commit comments
