#!/bin/bash
# beir.sh -- from the hltcoe/ColBERTSaR repository (main branch).
# Reproduce BEIR experiments end-to-end:
#   1. segment documents into passages
#   2. train per-corpus centroids and build the sparse index
#   3. search queries and write a TREC run
#
# Adjust the variables in the configuration block below to suit your setup.

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# ---------- configuration ----------
# Scalar settings keep their original defaults but may be overridden from the
# environment (e.g. `NGPU=4 bash beir.sh`). Arrays must be edited in place.

# ColBERT checkpoint passed to index.py via --colbert_checkpoint
COLBERT_CHECKPOINT="${COLBERT_CHECKPOINT:-answerdotai/answerai-colbert-small-v1}"

# Where to place the segmented passages collection
PASSAGES_ROOT="${PASSAGES_ROOT:-./collections/beir}"
# Forwarded to passaging.py as --passage_length / --passage_stride; both values
# also appear in the passages directory name (<length>-<stride>).
PASSAGE_LENGTH="${PASSAGE_LENGTH:-512}"
PASSAGE_STRIDE="${PASSAGE_STRIDE:-512}"

# Where to place experiment indexes and run files
EXP_PREFIX="${EXP_PREFIX:-./experiments/beir}"

# Subsets to run (each is processed by all three steps)
SUBSETS=(
    arguana climate-fever dbpedia-entity fever fiqa hotpotqa msmarco
    nfcorpus nq quora scidocs scifact trec-covid webis-touche2020
)

# nprobe values to search with (one TREC run file is written per value)
NPROBES=(2 4 8 16)

# Hardware
# NGPU: number of processes torchrun starts for indexing.
# NCPU: worker count for passaging and number of processes torchrun starts
#       for searching.
NGPU="${NGPU:-8}"
NCPU="${NCPU:-32}"
# -----------------------------------


# 1) Passaging -- segment documents into fixed-length passages
#
# Runs passaging.py once per subset, reading the documents from the
# ir_datasets collection irds:beir/<subset>. Every expansion is quoted so a
# PASSAGES_ROOT containing spaces or glob characters is passed as one argument.
for subset in "${SUBSETS[@]}"; do
    # The directory name encodes passage length and stride; step 2 reads its
    # collection and mapping files from this same path.
    passages_dir="${PASSAGES_ROOT}/${subset}/passages/${PASSAGE_LENGTH}-${PASSAGE_STRIDE}"

    python passaging.py \
        --output_dir "$passages_dir" \
        --passage_length "$PASSAGE_LENGTH" \
        --passage_stride "$PASSAGE_STRIDE" \
        --doc_collections "irds:beir/${subset}" \
        --docid_field doc_id \
        --num_workers "$NCPU" \
        --overwrite
done


# 2) Train centroids and build the sparse index

# Print the number of centroids to train for the BEIR subset named in $1:
# 1000000 for the subsets treated as large corpora, 500000 for all others.
centroids_for() {
    case "$1" in
        trec-covid|climate-fever|dbpedia-entity|fever|hotpotqa|msmarco|nq)
            printf '%s\n' 1000000 ;;
        *)
            printf '%s\n' 500000 ;;
    esac
}

# Number of training steps. It is passed to --max_steps and is also part of
# the experiment directory name, so both are derived from this one value.
# The search step hard-codes the matching "step100000" suffix; change both
# together.
max_steps=100000

for subset in "${SUBSETS[@]}"; do

    nc=$(centroids_for "$subset")

    # Output of the passaging step for this subset
    passages_dir="${PASSAGES_ROOT}/${subset}/passages/${PASSAGE_LENGTH}-${PASSAGE_STRIDE}"
    output_dir="${EXP_PREFIX}/beir-${subset}_in-batch_cen${nc}_step${max_steps}"

    # One process per GPU. All expansions are quoted so paths containing
    # whitespace or glob characters reach index.py as single arguments.
    torchrun --nproc_per_node="$NGPU" index.py \
        --fp16 \
        --n_centroids "$nc" \
        --colbert_checkpoint "$COLBERT_CHECKPOINT" \
        --max_steps "$max_steps" \
        --learning_rate 1e-4 \
        --per_device_train_batch_size 2048 \
        --per_device_eval_batch_size 32 \
        --save_total_limit 2 \
        --save_steps 1000 \
        --chunk_size 100000 \
        --output_dir "$output_dir" \
        --collection "${passages_dir}/collection_passages.tsv" \
        --passage_mapping "${passages_dir}/mapping.tsv" \
        --clean_up_samples \
        --resume

done


# 3) Search

# Print the ir_datasets id (relative to "irds:beir/") that holds the queries
# and qrels for the BEIR subset named in $1. Not every BEIR subset has a /test
# split in ir_datasets: the listed subsets use "<subset>/test", the rest use
# the bare subset name.
query_dataset_for() {
    case "$1" in
        nfcorpus|scifact|fiqa|quora|dbpedia-entity|fever|hotpotqa|msmarco)
            printf '%s\n' "$1/test" ;;
        *)
            printf '%s\n' "$1" ;;
    esac
}

for subset in "${SUBSETS[@]}"; do

    # Centroid count is part of the index directory name, so it has to be
    # chosen exactly as in the indexing step.
    case "$subset" in
        trec-covid|climate-fever|dbpedia-entity|fever|hotpotqa|msmarco|nq)
            nc=1000000 ;;
        *)
            nc=500000 ;;
    esac

    qsubset=$(query_dataset_for "$subset")

    # Directory written by the indexing step for this subset
    index_dir="${EXP_PREFIX}/beir-${subset}_in-batch_cen${nc}_step100000"

    # One search per nprobe value; each writes its own TREC run file next to
    # the index directory (<index_dir>_np<nprobe>.trec).
    for nprobe in "${NPROBES[@]}"; do
        # NOTE(review): this launches NCPU processes, whereas indexing launches
        # NGPU -- confirm that search is meant to run one process per CPU.
        torchrun --nproc_per_node="$NCPU" search.py \
            --fp16 \
            --index_dir "$index_dir" \
            --queries "irds:beir/${qsubset}" \
            --qrels "irds:beir/${qsubset}" \
            --per_device_eval_batch_size 64 \
            --nprobe "$nprobe" \
            --use_forward_index \
            --search_output "${index_dir}_np${nprobe}.trec"
    done

done