ritaranx
diff --git a/‎README.md
Lines changed: 65 additions & 1 deletion b/‎README.md
Lines changed: 65 additions & 1 deletion
diff --git a/‎commands/run_agnews.sh
Lines changed: 56 additions & 0 deletions b/‎commands/run_agnews.sh
Lines changed: 56 additions & 0 deletions
diff --git a/‎eval.py
Lines changed: 107 additions & 0 deletions b/‎eval.py
Lines changed: 107 additions & 0 deletions
@@ -1,2 +1,66 @@
 # NeST
-[AAAI 2023] This is the code for our paper `Neighborhood-Regularized Self-Training for Learning with Few Labels'.
+
+This is the code for the paper "Neighborhood-regularized Self-training for Learning with Few Labels" (in Proceedings of AAAI 2023).
+
+# Requirements
+```
+python 3.7
+transformers==4.2.0
+pytorch==1.8.0
+tqdm
+scikit-learn
+faiss-cpu==1.6.4
+```
+
+# Datasets
+## Datasets
+
+The datasets used in this study can be found at the following links.
+
+|   Dataset   | Task  | Number of Classes | Number of Train/Test |
+|---------------- | -------------- |-------------- | -------------- |
+| [Elec](http://riejohnson.com/cnn_data.html) |    Sentiment    |      2      |  25K / 25K   |
+| [AG News](https://huggingface.co/datasets/ag_news)    |    News Topic          |     4   |  120K / 7.6K  |
+| [NYT](https://github.com/yumeng5/CatE/tree/master/datasets/nyt)  |  News Topic   |    4     |     30K / 3.0K    |
+| [Chemprot](https://github.com/yueyu1030/COSINE/tree/main/data/chemprot)     |     Chemical Relation      |    10    |    12K / 1.6K     |
+
+## Input Format
+"_id" stands for the class id, and "text" is the content of the document.
+```
+    {"_id": 0, "text": "Congo Official: Rwanda Troops Attacking (AP) AP - A senior Congolese official said Tuesday his nation had been invaded by neighboring Rwanda, and U.N. officials said they were investigating claims of Rwandan forces clashing with militias in the east."}
+    {"_id": 1, "text": "Stadler Leads First Tee Open (AP) AP - Craig Stadler moved into position for his second straight victory Saturday, shooting a 9-under 63 to take a one-stroke lead over Jay Haas after the second round of the inaugural First Tee Open."}
+    {"_id": 2, "text": "Intel Shares Edge Lower After Downgrade  NEW YORK (Reuters) - Intel Corp shares slipped on  Tuesday after Credit Suisse First Boston downgraded the stock,  forecasting that the computer chip maker will have difficulty  outperforming the overall semiconductor sector next year."}
+    {"_id": 3, "text": "Debating the Dinosaur Extinction At least 50 percent of the world's species, including the dinosaurs, went extinct 65 million years ago. While most scientists now blame this catastrophe on a large meteorite impact, others wonder if there is more to the story."}
+    ...
+```
+
+## Training
+Please use the scripts in the `commands` folder to run the experiments.
+Taking the AG News dataset as an example, `run_agnews.sh` runs the self-training experiment.
+
+
+
+# Hyperparameter Tuning
+Some key hyperparameters are listed below:
+- `k`: The number of nearest neighbors used in KNN.
+- `learning_rate`: The learning rate for initialization.
+- `learning_rate_st`: The learning rate for self-training.
+- `self_training_update_period`:  The update period of self-training. 
+- `self_training_weight`: The weight to balance labeled data and unlabeled data during self-training.
+- `num_unlabeled`:  The number of unlabeled data in the beginning.
+- `num_unlabeled_add`: The number of added unlabeled data in each self-training round. 
+
+
+# Citation 
+
+Please kindly cite the following paper if you are using our datasets/codebase. Thanks!
+
+```
+@inproceedings{xu2023neighborhood,
+    title = "Neighborhood-regularized Self-training for Learning with Few Labels",
+    author = "Ran Xu and Yue Yu and Hejie Cui and Xuan Kan and Yanqiao Zhu and Joyce C. Ho and Chao Zhang and Carl Yang",
+    booktitle = "Proceedings of the Thirty-Seventh AAAI Conference on Artificial Intelligence",
+    year = "2023",
+}
+```
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# Self-training experiment on AG News: assembles the main.py command line from
+# the settings below, echoes it, then runs it.
+
+task=agnews
+gpu=0
+# NOTE(review): CUDA_VISIBLE_DEVICES below exposes only device ${gpu}, yet
+# n_gpu=2 is forwarded to main.py -- confirm which of the two is intended.
+n_gpu=2
+
+train_seed=42
+label_per_class=30
+# Other model names left here by the authors:
+#   dmis-lab/biobert-v1.1, allenai/scibert_scivocab_uncased
+model_type=roberta-base
+method=train
+max_seq_len=128
+eval_batch_size=256
+dev_labels=100
+steps=100
+logging_steps=10
+st_logging_steps=20
+epochs=18
+k=5
+
+lr=2e-5
+eps=0.9
+self_training_weight=1
+gce_loss_q=0.6
+lr_st=1e-5
+batch_size=8
+self_training_batch_size=32
+self_training_update_period=1000
+self_training_max_step=2000
+num_unlabeled=2000
+num_unlabeled_add=2000
+ssl_cmd="--learning_rate_st=${lr_st} --self_training_eps=${eps} --self_training_weight=${self_training_weight} --self_training_update_period=${self_training_update_period} --gce_loss_q=${gce_loss_q} --num_unlabeled=${num_unlabeled} --num_unlabeled_add=${num_unlabeled_add}"
+
+# train_label is not assigned anywhere in this script, so the flag used to be
+# passed as an empty "--sample_labels=". Forward it only when the caller has
+# set train_label in the environment.
+# NOTE(review): confirm against main.py whether a concrete value is required.
+sample_labels_arg=${train_label:+--sample_labels=${train_label}}
+
+data_dir=../datasets/${task}-${label_per_class}
+cache_dir=${data_dir}/cache
+output_dir=${task}/${label_per_class}/model
+mkdir -p "${output_dir}" "${cache_dir}"
+echo "${method}"
+
+# The command is kept as one string so it can be echoed verbatim before eval.
+# Paths are interpolated without inner quotes: a bare '"' here would close the
+# surrounding string instead of quoting the path.
+train_cmd="CUDA_VISIBLE_DEVICES=${gpu} python3 main.py --do_train --do_eval --task=${task} \
+	--train_file=train.json --dev_file=valid.json --test_file=test.json \
+	--unlabel_file=unlabeled.json \
+	--data_dir=${data_dir} --train_seed=${train_seed} \
+	--cache_dir=${cache_dir} \
+	--output_dir=${output_dir} \
+	--logging_steps=${logging_steps} --self_train_logging_steps=${st_logging_steps} --dev_labels=${dev_labels} \
+	--gpu=${gpu} --n_gpu=${n_gpu} --num_train_epochs=${epochs} --weight_decay=1e-8 \
+	--learning_rate=${lr}  \
+	--method=${method} --batch_size=${batch_size} --eval_batch_size=${eval_batch_size} \
+	--self_training_batch_size=${self_training_batch_size} \
+	--max_seq_len=${max_seq_len} --auto_load=1 \
+	--max_steps=${steps} --model_type=${model_type} \
+	--self_training_max_step=${self_training_max_step} \
+	${sample_labels_arg} ${ssl_cmd} --k=${k} --label_per_class=${label_per_class}"
+echo "$train_cmd"
+eval "$train_cmd"
@@ -0,0 +1,107 @@
+import faiss 
+import numpy as np 
+import os 
+
+def inference_knn(train_pred, train_feat, train_label,  unlabeled_pred, unlabeled_feat, unlabeled_label, unlabeled_pseudo,k,  gamma = 0.1, beta=0.1, prev_val = None):
+    """Rank unlabeled samples by agreement with their nearest labeled neighbors.
+
+    An exact L2 faiss index is built over ``train_feat``; for every row of
+    ``unlabeled_feat`` the ``k`` closest training rows are looked up and the
+    sample is scored as::
+
+        mean over neighbors of KL(neighbor_pred || unlabeled_pred)
+            + beta * spread of the neighbors' predictions
+
+    Args:
+        train_pred: Predictions for the labeled set, indexed by training row.
+            Presumably class-probability rows of shape (n_train, n_class), as
+            the original inline comments state -- TODO confirm with the caller.
+        train_feat: Feature matrix of the labeled set, one row per sample.
+        train_label: Not used by this function.
+        unlabeled_pred: Predictions for the unlabeled set, same layout as
+            ``train_pred``.
+        unlabeled_feat: Feature matrix of the unlabeled set.
+        unlabeled_label: Not used by this function.
+        unlabeled_pseudo: Not used by this function.
+        k: Number of nearest neighbors to retrieve.
+        gamma: Weight given to ``prev_val`` when it is supplied.
+        beta: Weight of the neighbor-spread term.
+        prev_val: Optional scores from an earlier call; when given, the result
+            is ``prev_val * gamma + (1 - gamma) * new_score``.
+
+    Returns:
+        Indices of the unlabeled samples sorted by ascending score.
+    """
+    train_pred = np.asarray(train_pred)
+    unlabeled_pred = np.asarray(unlabeled_pred)
+    # faiss only accepts C-contiguous float32 matrices.
+    train_feat = np.ascontiguousarray(train_feat, dtype=np.float32)
+    unlabeled_feat = np.ascontiguousarray(unlabeled_feat, dtype=np.float32)
+
+    index = faiss.IndexFlatL2(train_feat.shape[-1])
+    index.add(train_feat)
+    # Requesting more neighbors than indexed vectors makes faiss pad the result
+    # with -1, which would silently select the last training row.
+    k = min(k, len(train_feat))
+    _, neighbor_idx = index.search(unlabeled_feat, k)
+
+    # One prediction per (unlabeled sample, neighbor) pair.
+    neighbor_pred = train_pred[neighbor_idx]
+    query_pred = np.expand_dims(unlabeled_pred, axis=1)
+
+    eps = 1e-10  # keeps the log finite for zero probabilities
+    score = np.log((eps + neighbor_pred) / (eps + query_pred)) * neighbor_pred
+    mean_kl = np.mean(np.sum(score, axis=-1), axis=-1)
+
+    # Spread is measured across the neighbor axis so that it yields one value
+    # per unlabeled sample. Reducing over the class axis instead leaves an
+    # (n_unlabeled, k) array that cannot be combined with the per-sample KL
+    # term above.
+    # NOTE(review): averaging the per-class variances is a choice made here --
+    # confirm it matches the intended objective.
+    neighbor_var = np.var(neighbor_pred, axis=1)
+    if neighbor_var.ndim > 1:
+        neighbor_var = neighbor_var.mean(axis=-1)
+
+    current_val = mean_kl + neighbor_var * beta
+    if prev_val is not None:
+        current_val = prev_val * gamma + (1 - gamma) * current_val
+
+    return np.argsort(current_val)
+
+def inference_conf(train_pred, train_feat, train_label,  unlabeled_pred, unlabeled_feat, unlabeled_label, unlabeled_pseudo, gamma = 0.1, prev_val = None):
+    """Order unlabeled samples from the highest to the lowest top prediction.
+
+    The score of a sample is the negated maximum of its row in
+    ``unlabeled_pred``. When ``prev_val`` is given, the score becomes
+    ``prev_val * gamma + (1 - gamma) * score``. Only ``unlabeled_pred``,
+    ``gamma`` and ``prev_val`` influence the result.
+
+    Returns:
+        Indices that sort the scores in ascending order.
+    """
+    train_pred = np.array(train_pred)  # converted, but not used afterwards
+    neg_confidence = -np.array(unlabeled_pred).max(axis=-1)
+
+    if prev_val is None:
+        return np.argsort(neg_confidence)
+
+    smoothed = prev_val * gamma + (1 - gamma) * neg_confidence
+    return np.argsort(smoothed)
+
+def inference_uncertainty(unlabeled_label, unlabeled_pseudo, mutual_info, gamma = 0.1, prev_val = None):
+    """Return the indices that sort ``mutual_info`` in ascending order.
+
+    When ``prev_val`` is given, the values sorted are
+    ``prev_val * gamma + (1 - gamma) * mutual_info`` instead.
+    ``unlabeled_label`` and ``unlabeled_pseudo`` are not used.
+    """
+    if prev_val is None:
+        ranking_score = mutual_info
+    else:
+        ranking_score = prev_val * gamma + (1 - gamma) * mutual_info
+    return np.argsort(ranking_score)
+
+def save_data(train_pred, train_feat, train_label,  unlabeled_pred, unlabeled_feat, unlabeled_label, unlabeled_pseudo, dataset = 'agnews',  n_labels = 10, n_iter = 0):
+    """Write the seven arrays to ``.npy`` files, one file per argument.
+
+    Files land in ``{dataset}/{n_labels}`` when ``n_iter`` is 0 and in
+    ``{dataset}/{n_labels}_{n_iter}`` otherwise; the directory is created if
+    it does not exist. Each file is named after the argument it stores.
+    """
+    path = f"{dataset}/{n_labels}" if n_iter == 0 else f"{dataset}/{n_labels}_{n_iter}"
+    os.makedirs(path, exist_ok = True)
+
+    # Written in the same order as the parameters appear in the signature.
+    named_arrays = (
+        ("train_pred", train_pred),
+        ("train_feat", train_feat),
+        ("train_label", train_label),
+        ("unlabeled_pred", unlabeled_pred),
+        ("unlabeled_feat", unlabeled_feat),
+        ("unlabeled_label", unlabeled_label),
+        ("unlabeled_pseudo", unlabeled_pseudo),
+    )
+    for stem, values in named_arrays:
+        with open(f"{path}/{stem}.npy", 'wb') as f:
+            np.save(f, values)
+
+
+
+
+def load_pred_data(dataset = 'agnews', n_labels = 10, n_iter = 0):
+    """Load the seven ``.npy`` arrays stored for one dataset/label setting.
+
+    Files are read from ``{dataset}/{n_labels}`` when ``n_iter`` is 0 and from
+    ``{dataset}/{n_labels}_{n_iter}`` otherwise.
+
+    Returns:
+        A tuple ``(train_pred, train_feat, train_label, unlabeled_pred,
+        unlabeled_feat, unlabeled_label, unlabeled_pseudo)``.
+    """
+    path = f"{dataset}/{n_labels}" if n_iter == 0 else f"{dataset}/{n_labels}_{n_iter}"
+
+    # Order here fixes the order of the returned tuple.
+    stems = (
+        "train_pred",
+        "train_feat",
+        "train_label",
+        "unlabeled_pred",
+        "unlabeled_feat",
+        "unlabeled_label",
+        "unlabeled_pseudo",
+    )
+    return tuple(np.load(f"{path}/{stem}.npy") for stem in stems)