Skip to content

Commit d38397b

Browse files
committed
wikipedia n-grams for fast corpus testvoc-ing
1 parent ff41f44 commit d38397b

File tree

7 files changed

+50043
-0
lines changed

7 files changed

+50043
-0
lines changed

.gitattributes

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,14 @@
1212
/apertium-nno.pc.in -text
1313
/autogen.sh -text
1414
/configure.ac -text
15+
dev/make-ngrams.sh -text
1516
/modes.xml -text
1617
/nno.prob -text svneol=unset#unset
1718
/rem-compounds.xsl -text svneol=unset#application/xml
1819
texts/frode_grytten.txt -text
20+
texts/ngrams.1 -text
21+
texts/ngrams.2 -text
22+
texts/ngrams.3 -text
23+
texts/ngrams.4 -text
24+
texts/ngrams.5 -text
1925
texts/udhr.txt -text

dev/make-ngrams.sh

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/bash

# Abort on any unhandled command failure (-e) and on use of unset variables (-u).
set -e -u

# When the caller did not choose a TMPDIR, point it at ./tmp so that tools
# honouring TMPDIR (notably sort) spill their temporary files onto the current
# disk; this assumes the current disk has more free space than /tmp.
if [[ -z ${TMPDIR:-} ]]; then
  echo "Putting temporary files in $(pwd)/tmp/ ..."
  test -d tmp || mkdir tmp
  # Assign first, export second, so a failing pwd is not masked by export.
  TMPDIR="$(pwd)"/tmp
  export TMPDIR
  # Single quotes on purpose: $TMPDIR is expanded when the trap fires, not
  # spliced into the command text now, so a path containing quotes, '$' or
  # backticks cannot break (or inject commands into) the cleanup.
  # rmdir only removes the directory if it is empty.
  trap 'rmdir -- "$TMPDIR"' EXIT
fi
12+
13+
# corp CMD [ARG...]
#
# Runs CMD with its arguments (expected to write a corpus to stdout) and
# re-emits that text with every space turned into a newline; tr -s then
# collapses each run of newlines into one, so the output carries one
# space/newline-delimited token per line. Tabs are left untouched.
corp () {
  "$@" \
    | tr -s ' ' '\n'
}
16+
17+
# hitparade [LIMIT]
#
# Reads lines on stdin and writes a frequency table to stdout: one
# "COUNT<TAB>LINE" row per distinct input line, most frequent first, cut off
# after LIMIT rows (default 10000).
hitparade () {
  local limit=${1:-10000}
  # Subshell: the C locale (byte-wise sort/uniq/sed) applies to this pipeline
  # only and never leaks into the caller's environment, even when the function
  # is invoked outside of a pipeline.
  (
    export LC_ALL=C
    # uniq -c left-pads the count with spaces; the sed strips that padding and
    # turns the single space between count and line into a tab.
    sort | uniq -c | sort -nr | sed $'s/^ *//;s/ /\t/' | head -n "$limit"
  )
}
21+
22+
# The script's arguments are the command line that prints the corpus; with no
# arguments there is nothing to run, so print a usage note to stderr and exit 1.
if (( $# == 0 )); then
  {
    printf '\n'
    printf '%s\n' \
      "Error: Expecting some command that cats a corpus as argument(s)" \
      "For example:" \
      "$ $0 xzcat ~/corpora/nno.corp.xz"
    printf '\n'
    printf '%s\n' "Will create files ngrams.[1-5]"
  } >&2
  exit 1
fi
31+
32+
# save_hitparade FILE
#
# Pipes stdin through hitparade into FILE.part and renames it to FILE only
# once hitparade has finished successfully. A plain ">FILE" would create the
# target immediately, so an interrupted or failed run would leave an empty or
# truncated FILE behind that the "already exists" checks below would then
# skip forever.
save_hitparade () {
  local dest=$1
  hitparade >"$dest.part" && mv -- "$dest.part" "$dest"
}

# Build ngrams.1 .. ngrams.5, skipping any file that already exists.
# For order n, the corpus command is run n times; the k-th copy is shifted by
# k-1 tokens (tail -n+k) and paste joins the copies with tabs, so each line
# holds n consecutive tokens. Where the shifted copies run out at the end of
# the corpus, paste fills the missing fields with empty strings.
[[ -f ngrams.1 ]] || paste <(corp "$@") \
  | save_hitparade ngrams.1
[[ -f ngrams.2 ]] || paste <(corp "$@") <(corp "$@"|tail -n+2) \
  | save_hitparade ngrams.2
[[ -f ngrams.3 ]] || paste <(corp "$@") <(corp "$@"|tail -n+2) <(corp "$@"|tail -n+3) \
  | save_hitparade ngrams.3
[[ -f ngrams.4 ]] || paste <(corp "$@") <(corp "$@"|tail -n+2) <(corp "$@"|tail -n+3) <(corp "$@"|tail -n+4) \
  | save_hitparade ngrams.4
[[ -f ngrams.5 ]] || paste <(corp "$@") <(corp "$@"|tail -n+2) <(corp "$@"|tail -n+3) <(corp "$@"|tail -n+4) <(corp "$@"|tail -n+5) \
  | save_hitparade ngrams.5
37+

0 commit comments

Comments
 (0)