-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_data.sh
More file actions
executable file
·63 lines (47 loc) · 1.42 KB
/
get_data.sh
File metadata and controls
executable file
·63 lines (47 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env bash
#
# Download player PGN archives linked from the pgnmentor.com file index,
# extract them into a scratch directory, run the project's `filter_games`
# step over the extracted files, then remove the scratch directory.
#
# Environment:
#   MAX_FILES  Maximum number of archives to download in one run.
#              Defaults to 9, the limit that used to be hard-coded in the
#              download loop.
#
# Requires: curl, unzip, python3 and a grep with -P (PCRE) support.
# Exit status: 0 on success, 1 if the index cannot be fetched or parsed,
# or if the filter step fails.

# `-e` is deliberately not set: a single failed archive is reported and
# skipped instead of aborting the whole run. Critical steps are checked
# explicitly below.
set -uo pipefail

readonly TARGET_DIR="./data/temp"
readonly BASE_URL="https://www.pgnmentor.com/files.html"
readonly SITE_ROOT="https://www.pgnmentor.com"
readonly BAR_WIDTH=50
MAX_FILES="${MAX_FILES:-9}"

# Path of the downloaded index page; set in main, removed by cleanup.
HTML_TMP=""

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Remove the temporary index file on every exit path.
cleanup() {
  if [[ -n "$HTML_TMP" ]]; then
    rm -f -- "$HTML_TMP"
  fi
}
trap cleanup EXIT

# Print a bar of exactly $2 characters: $1 '#' followed by '-' padding.
# Built with padded printf + substitution so that a zero-length segment
# really is empty (a `printf fmt $(seq 1 0)` call still prints fmt once).
render_bar() {
  local filled=$1 width=$2
  local done_part rest_part
  printf -v done_part '%*s' "$filled" ''
  printf -v rest_part '%*s' "$((width - filled))" ''
  printf '%s%s' "${done_part// /#}" "${rest_part// /-}"
}

main() {
  [[ "$MAX_FILES" =~ ^[1-9][0-9]*$ ]] \
    || die "MAX_FILES must be a positive integer (got '$MAX_FILES')"

  mkdir -p -- "$TARGET_DIR" || die "Cannot create $TARGET_DIR"
  HTML_TMP=$(mktemp) || die "mktemp failed"

  echo "Fetching file list from $BASE_URL..."
  curl -sSf "$BASE_URL" -o "$HTML_TMP" || die "Failed to fetch $BASE_URL"

  echo "Parsing ZIP links..."
  local -a links=()
  # grep exits 1 when nothing matches; `|| true` keeps that from being
  # treated as a pipeline error. The empty result is handled just below.
  mapfile -t links < <(grep -oP 'href="\K[^"]*players/[^"]+\.zip' "$HTML_TMP" | sort -u || true)
  if (( ${#links[@]} == 0 )); then
    die "No .zip links found — site format may have changed."
  fi

  local total=${#links[@]}
  local to_fetch=$(( total < MAX_FILES ? total : MAX_FILES ))
  echo "Found ${total} ZIP files. Downloading and extracting..."
  if (( to_fetch < total )); then
    echo "Limiting this run to the first ${to_fetch} (MAX_FILES=${MAX_FILES})."
  fi

  local count=0 problems=0
  local link url fname dl_path progress bar
  for link in "${links[@]:0:to_fetch}"; do
    count=$((count + 1))

    # Resolve site-relative links against the site root.
    if [[ "$link" =~ ^http ]]; then
      url="$link"
    else
      url="$SITE_ROOT/$link"
    fi
    fname=${link##*/}
    dl_path="$TARGET_DIR/$fname"

    # Progress is measured against the number of files actually fetched.
    progress=$(( count * 100 / to_fetch ))
    bar=$(render_bar "$(( progress * BAR_WIDTH / 100 ))" "$BAR_WIDTH")
    printf '\r[%s] %d%% (%d/%d)' "$bar" "$progress" "$count" "$to_fetch"

    # Download, extract (flattening paths), and drop the archive.
    if ! curl -sSf "$url" -o "$dl_path"; then
      printf '\nwarning: download failed, skipping: %s\n' "$url" >&2
      rm -f -- "$dl_path"
      problems=$((problems + 1))
      continue
    fi
    if ! unzip -qqj "$dl_path" -d "$TARGET_DIR"; then
      printf '\nwarning: unzip reported a problem with: %s\n' "$dl_path" >&2
      problems=$((problems + 1))
    fi
    rm -f -- "$dl_path"
  done
  echo

  # Run the filter, remember its status, and clean the scratch directory
  # either way so a failed run does not leave partial data behind.
  local status=0
  python3 -c "from src.data_preprocessing import filter_games; filter_games()" || status=$?
  rm -rf -- "${TARGET_DIR:?}"
  if (( status != 0 )); then
    die "filter_games failed with exit status ${status}."
  fi

  if (( problems > 0 )); then
    printf 'Finished, but %d archive(s) had problems (see warnings above).\n' "$problems" >&2
  else
    echo "All files downloaded and extracted."
  fi
}

main "$@"