Commit d161cc3

first commit
0 parents  commit d161cc3

File tree

7 files changed (+329, -0 lines)


.DS_Store

6 KB
Binary file not shown.

README.md

Whitespace-only changes.

data.py

+44
from itertools import chain

import numpy as np


def tokenize(example, tokenizer, label2id, max_length):
    # rebuild text from tokens and build a character-level label array
    text = []
    labels = []

    for t, l, ws in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        text.append(t)
        labels.extend([l] * len(t))

        if ws:
            text.append(" ")
            labels.append("O")

    # actual tokenization
    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    labels = np.array(labels)

    text = "".join(text)
    token_labels = []

    # map each sub-word token back to a label via its character offsets
    for start_idx, end_idx in tokenized.offset_mapping:
        # CLS token
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # case when token starts with whitespace
        if text[start_idx].isspace():
            start_idx += 1

        token_labels.append(label2id[labels[start_idx]])

    length = len(tokenized.input_ids)

    return {**tokenized, "labels": token_labels, "length": length}
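
A minimal sketch of how this alignment could be exercised in isolation, assuming the albert-base-v2 tokenizer used in train.py; the toy document and reduced label set below are illustrative only:

from transformers import AutoTokenizer

from data import tokenize

# toy label set and document (assumptions for illustration, not from the dataset)
label2id = {"O": 0, "B-NAME_STUDENT": 1, "I-NAME_STUDENT": 2}
example = {
    "tokens": ["My", "name", "is", "Jane", "Doe", "."],
    "provided_labels": ["O", "O", "O", "B-NAME_STUDENT", "I-NAME_STUDENT", "O"],
    "trailing_whitespace": [True, True, True, True, False, False],
}

tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
out = tokenize(example, tokenizer, label2id, max_length=64)

# one label id per sub-word token, aligned through the character offsets
print(tokenizer.convert_ids_to_tokens(out["input_ids"]))
print(out["labels"])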

data/.gitkeep

Whitespace-only changes.

inference.py

+118
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from pathlib import Path

import pandas as pd
import numpy as np
import json

INFERENCE_MAX_LENGTH = 256
model_path = "output/albert-base-v2_128"


def tokenize(example, tokenizer):
    text = []
    token_map = []  # maps each character back to its original word index (-1 for inserted spaces)
    idx = 0

    for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
        text.append(t)
        token_map.extend([idx] * len(t))
        if ws:
            text.append(" ")
            token_map.append(-1)
        idx += 1

    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=True,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return {
        **tokenized,
        "token_map": token_map,
    }


data = json.load(open("data/test.json"))

ds = Dataset.from_dict(
    {
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    }
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)


model = AutoModelForTokenClassification.from_pretrained(model_path)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)
args = TrainingArguments(".", per_device_eval_batch_size=1, report_to="none")

trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

predictions = trainer.predict(ds).predictions
# softmax over the label dimension
pred_softmax = np.exp(predictions) / np.sum(np.exp(predictions), axis=2).reshape(
    predictions.shape[0], predictions.shape[1], 1
)

config = json.load(open(Path(model_path) / "config.json"))
id2label = config["id2label"]
preds = predictions.argmax(-1)
# indices 0-11 are the entity labels; index 12 is "O" (labels are sorted alphabetically)
preds_without_O = pred_softmax[:, :, :12].argmax(-1)
O_preds = pred_softmax[:, :, 12]

# keep the best non-"O" label whenever the "O" probability falls below the threshold
threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O, preds)


triplets = []
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(
    preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]
):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]
        if start_idx + end_idx == 0:
            # special token (offset (0, 0)), e.g. CLS
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            triplet = (label_pred, token_id, tokens[token_id])
            if triplet not in triplets:
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append(triplet)


df = pd.DataFrame(
    {"document": document, "token": token, "label": label, "token_str": token_str}
)
df["row_id"] = list(range(len(df)))
# display(df.head(100))


df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)
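
The thresholding step above is easier to see on made-up numbers. A small sketch with three labels instead of thirteen, where the last column plays the role of "O" (all values are illustrative):

import numpy as np

# toy softmax output: 1 document, 4 tokens, 3 labels; column 2 acts as "O"
pred_softmax = np.array([[
    [0.03, 0.02, 0.95],  # "O" prob >= 0.9 -> plain argmax keeps "O"
    [0.60, 0.10, 0.30],  # "O" prob < 0.9  -> best entity label (0)
    [0.05, 0.55, 0.40],  # "O" prob < 0.9  -> best entity label (1)
    [0.10, 0.05, 0.85],  # even a fairly confident "O" is overridden below 0.9
]])

preds = pred_softmax.argmax(-1)                       # argmax over all labels
preds_without_O = pred_softmax[:, :, :2].argmax(-1)   # argmax over entity labels only
O_preds = pred_softmax[:, :, 2]                       # probability of "O"

threshold = 0.9
preds_final = np.where(O_preds < threshold, preds_without_O, preds)
print(preds_final)  # [[2 0 1 0]]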

train.py

+142
from itertools import chain
from functools import partial

from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset

from data import tokenize
from utils import compute_metrics

import json
import numpy as np


TRAINING_MODEL_PATH = "albert/albert-base-v2"
TRAINING_MAX_LENGTH = 128
OUTPUT_DIR = "output"


data = json.load(open("data/train.json"))

# downsampling of negative examples
p = []  # positive samples (contain at least one non-"O" label)
n = []  # negative samples (only "O" labels; some may still contain entities that were wrongly labelled "O")
for d in data:
    if any(np.array(d["labels"]) != "O"):
        p.append(d)
    else:
        n.append(d)
print("original datapoints: ", len(data))

external = json.load(open("data/pii_dataset_fixed.json"))
print("external datapoints: ", len(external))

moredata = json.load(open("data/moredata_dataset_fixed.json"))
print("moredata datapoints: ", len(moredata))

# keep all positives and external data, but only a third of the negatives
data = moredata + external + p + n[: len(n) // 3]
print("combined: ", len(data))


all_labels = sorted(list(set(chain(*[x["labels"] for x in data]))))
label2id = {l: i for i, l in enumerate(all_labels)}
id2label = {v: k for k, v in label2id.items()}

# full set of PII entity labels (all labels except "O")
target = [
    "B-EMAIL",
    "B-ID_NUM",
    "B-NAME_STUDENT",
    "B-PHONE_NUM",
    "B-STREET_ADDRESS",
    "B-URL_PERSONAL",
    "B-USERNAME",
    "I-ID_NUM",
    "I-NAME_STUDENT",
    "I-PHONE_NUM",
    "I-STREET_ADDRESS",
    "I-URL_PERSONAL",
]
print(id2label)


tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

ds = Dataset.from_dict(
    {
        "full_text": [x["full_text"] for x in data],
        "document": [str(x["document"]) for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
        "provided_labels": [x["labels"] for x in data],
    }
)
ds = ds.map(
    tokenize,
    fn_kwargs={
        "tokenizer": tokenizer,
        "label2id": label2id,
        "max_length": TRAINING_MAX_LENGTH,
    },
    num_proc=3,
)
# ds = ds.class_encode_column("group")


model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)


# split computed for inspection only; the full dataset is passed to the Trainer below
final_ds = ds.train_test_split(
    test_size=0.2, seed=42
)  # cannot use stratify_by_column='group'
print(final_ds)


args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="no",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type="cosine",
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)


trainer.train()


trainer.save_model("output/albert-base-v2_128")
tokenizer.save_pretrained("output/albert-base-v2_128")
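
A rough smoke test for the saved checkpoint might look like the sketch below; it is not part of the commit, and the sample sentence is an assumption:

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_dir = "output/albert-base-v2_128"  # checkpoint written by train.py
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForTokenClassification.from_pretrained(model_dir)

text = "My name is Jane Doe and my email is jane.doe@example.com."  # made-up example
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

with torch.no_grad():
    logits = model(**enc).logits  # shape: (1, num_tokens, num_labels)

pred_ids = logits.argmax(-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
for tok, pid in zip(tokens, pred_ids):
    print(tok, model.config.id2label[pid])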

utils.py

+25
from seqeval.metrics import recall_score, precision_score

import numpy as np


def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    # F-beta score with beta = 5 (recall weighted much more heavily than precision)
    f1_score = (1 + 5 * 5) * recall * precision / (5 * 5 * precision + recall)

    results = {"recall": recall, "precision": precision, "f1": f1_score}
    return results
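
A minimal check of compute_metrics with made-up logits and labels (the reduced label set is an illustrative assumption); the -100 position mimics a special token that gets masked out:

import numpy as np

from utils import compute_metrics

all_labels = ["B-NAME_STUDENT", "I-NAME_STUDENT", "O"]

# one sequence, four tokens, three labels; the first position simulates a special token
logits = np.array([[
    [0.1, 0.1, 0.8],  # ignored via the -100 label
    [2.0, 0.1, 0.1],  # predicted B-NAME_STUDENT
    [0.1, 2.0, 0.1],  # predicted I-NAME_STUDENT
    [0.1, 0.1, 2.0],  # predicted O
]])
labels = np.array([[-100, 0, 1, 2]])

# perfect predictions -> recall, precision and f1 all equal 1.0
print(compute_metrics((logits, labels), all_labels))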

0 commit comments
