-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathbeir.sh
More file actions
executable file
·112 lines (90 loc) · 3.23 KB
/
Copy pathbeir.sh
File metadata and controls
executable file
·112 lines (90 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/bin/bash
# Reproduce BEIR experiments end-to-end:
# 1. segment documents into passages
# 2. train per-corpus centroids and build the sparse index
# 3. search queries and write a TREC run
#
# Adjust the variables in the configuration block below to suit your setup.
# Strict mode:
#   -e           abort on the first command that fails
#   -u           treat a reference to an unset variable as an error; almost every
#                path below is assembled from these variables, so a misspelt
#                name would otherwise silently expand to an empty string
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

# ---------- configuration ----------
# Checkpoint passed to index.py as --colbert_checkpoint
COLBERT_CHECKPOINT="answerdotai/answerai-colbert-small-v1"

# Where to place the segmented passages collection
PASSAGES_ROOT="./collections/beir"
# Passed to passaging.py as --passage_length / --passage_stride; the pair also
# names the per-subset passages directory (<length>-<stride>)
PASSAGE_LENGTH=512
PASSAGE_STRIDE=512

# Where to place experiment indexes and run files
EXP_PREFIX="./experiments/beir"

# Subsets to run
SUBSETS=(
  arguana
  climate-fever
  dbpedia-entity
  fever
  fiqa
  hotpotqa
  msmarco
  nfcorpus
  nq
  quora
  scidocs
  scifact
  trec-covid
  webis-touche2020
)

# nprobe values to search with
NPROBES=(2 4 8 16)

# Hardware
# NGPU: torchrun --nproc_per_node for index.py
NGPU=8
# NCPU: --num_workers for passaging.py and torchrun --nproc_per_node for search.py
NCPU=32
# -----------------------------------
# 1) Passaging -- segment documents into fixed-length passages
#######################################
# Run passaging.py once for every subset listed in SUBSETS.
# Globals:   SUBSETS, PASSAGES_ROOT, PASSAGE_LENGTH, PASSAGE_STRIDE, NCPU (read)
# Arguments: none
# Outputs:   whatever passaging.py prints; each run receives
#            $PASSAGES_ROOT/<subset>/passages/<length>-<stride> as --output_dir
# Returns:   status of the last passaging.py run (with `set -e` active the
#            first failing run aborts the script instead)
#######################################
segment_passages() {
  local subset passages_dir
  for subset in "${SUBSETS[@]}"; do
    passages_dir="${PASSAGES_ROOT}/${subset}/passages/${PASSAGE_LENGTH}-${PASSAGE_STRIDE}"
    # Every expansion is quoted so that a root directory or subset name
    # containing whitespace or glob characters stays a single argument.
    python passaging.py \
      --output_dir "$passages_dir" \
      --passage_length "$PASSAGE_LENGTH" \
      --passage_stride "$PASSAGE_STRIDE" \
      --doc_collections "irds:beir/${subset}" \
      --docid_field doc_id \
      --num_workers "$NCPU" \
      --overwrite
  done
}
segment_passages
# 2) Train centroids and build the sparse index
#######################################
# Launch index.py through torchrun once for every subset listed in SUBSETS.
# Globals:   SUBSETS, EXP_PREFIX, PASSAGES_ROOT, PASSAGE_LENGTH, PASSAGE_STRIDE,
#            COLBERT_CHECKPOINT, NGPU (read)
# Arguments: none
# Outputs:   whatever index.py prints; each run receives
#            $EXP_PREFIX/beir-<subset>_in-batch_cen<n_centroids>_step<max_steps>
#            as --output_dir
# Returns:   status of the last torchrun invocation (with `set -e` active the
#            first failing run aborts the script instead)
#######################################
build_indexes() {
  # Single source for the step count: it is passed as --max_steps and is also
  # part of the output directory name, so the two must never drift apart.
  local -r max_steps=100000
  local subset nc passages_dir output_dir
  for subset in "${SUBSETS[@]}"; do
    # large corpora use more centroids
    case "$subset" in
      trec-covid | climate-fever | dbpedia-entity | fever | hotpotqa | msmarco | nq)
        nc=1000000
        ;;
      *)
        nc=500000
        ;;
    esac
    passages_dir="${PASSAGES_ROOT}/${subset}/passages/${PASSAGE_LENGTH}-${PASSAGE_STRIDE}"
    output_dir="${EXP_PREFIX}/beir-${subset}_in-batch_cen${nc}_step${max_steps}"
    # Every expansion is quoted so that paths or a checkpoint location
    # containing whitespace or glob characters stay a single argument.
    torchrun "--nproc_per_node=${NGPU}" index.py \
      --fp16 \
      --n_centroids "$nc" \
      --colbert_checkpoint "$COLBERT_CHECKPOINT" \
      --max_steps "$max_steps" \
      --learning_rate 1e-4 \
      --per_device_train_batch_size 2048 \
      --per_device_eval_batch_size 32 \
      --save_total_limit 2 \
      --save_steps 1000 \
      --chunk_size 100000 \
      --output_dir "$output_dir" \
      --collection "${passages_dir}/collection_passages.tsv" \
      --passage_mapping "${passages_dir}/mapping.tsv" \
      --clean_up_samples \
      --resume
  done
}
build_indexes
# 3) Search
#######################################
# Launch search.py through torchrun for every (subset, nprobe) combination.
# Globals:   SUBSETS, NPROBES, EXP_PREFIX, NCPU (read)
# Arguments: none
# Outputs:   whatever search.py prints; each run receives
#            <index_dir>_np<nprobe>.trec as --search_output
# Returns:   status of the last torchrun invocation (with `set -e` active the
#            first failing run aborts the script instead)
#######################################
run_searches() {
  local subset nc qsubset index_dir nprobe
  for subset in "${SUBSETS[@]}"; do
    # Same centroid-count rule as the indexing stage; it is part of the index
    # directory name, so both stages have to agree on it.
    case "$subset" in
      trec-covid | climate-fever | dbpedia-entity | fever | hotpotqa | msmarco | nq)
        nc=1000000
        ;;
      *)
        nc=500000
        ;;
    esac
    # not every BEIR subset has a /test split in ir_datasets
    case "$subset" in
      nfcorpus | scifact | fiqa | quora | dbpedia-entity | fever | hotpotqa | msmarco)
        qsubset="${subset}/test"
        ;;
      *)
        qsubset="$subset"
        ;;
    esac
    # Must spell out the same directory name the indexing stage writes to.
    index_dir="${EXP_PREFIX}/beir-${subset}_in-batch_cen${nc}_step100000"
    for nprobe in "${NPROBES[@]}"; do
      # NOTE(review): the process count comes from NCPU here, whereas the
      # indexing stage uses NGPU -- confirm this is intended.
      # Every expansion is quoted so that paths containing whitespace or glob
      # characters stay a single argument.
      torchrun "--nproc_per_node=${NCPU}" search.py \
        --fp16 \
        --index_dir "$index_dir" \
        --queries "irds:beir/${qsubset}" \
        --qrels "irds:beir/${qsubset}" \
        --per_device_eval_batch_size 64 \
        --nprobe "$nprobe" \
        --use_forward_index \
        --search_output "${index_dir}_np${nprobe}.trec"
    done
  done
}
run_searches