File tree Expand file tree Collapse file tree 7 files changed +50043
-0
lines changed Expand file tree Collapse file tree 7 files changed +50043
-0
lines changed Original file line number Diff line number Diff line change 12
12
/apertium-nno.pc.in - text
13
13
/autogen.sh - text
14
14
/configure.ac - text
15
+ dev /make-ngrams.sh - text
15
16
/modes.xml - text
16
17
/nno.prob - text svneol =unset#unset
17
18
/rem-compounds.xsl - text svneol =unset#application/xml
18
19
texts /frode_grytten.txt - text
20
+ texts /ngrams.1 - text
21
+ texts /ngrams.2 - text
22
+ texts /ngrams.3 - text
23
+ texts /ngrams.4 - text
24
+ texts /ngrams.5 - text
19
25
texts /udhr.txt - text
Original file line number Diff line number Diff line change
1
#!/bin/bash
#
# Builds frequency tables of the most common 1- to 5-grams of a corpus.
#
# Usage: $0 COMMAND [ARG...]
#   COMMAND [ARG...] must write the corpus to stdout, e.g. `xzcat corpus.xz`.
#
# Output: files ngrams.1 ... ngrams.5 in the current directory.
#
# Environment:
#   TMPDIR  where temporary files go; if unset or empty, ./tmp is used.

# NOTE: `pipefail` is intentionally NOT enabled. The counting pipeline ends in
# `head`, which stops reading early; with pipefail the upstream stages being
# cut off could be reported as a failure and abort the script under `set -e`.
set -e -u

if [[ -z "${TMPDIR:-}" ]]; then
  printf 'Putting temporary files in %s/tmp/ ...\n' "${PWD}"
  mkdir -p -- tmp
  # for sort, assumes current disk has more space than /tmp
  export TMPDIR="${PWD}/tmp"

  # Remember the path separately and expand it only when the trap fires, so
  # that unusual characters in the path cannot corrupt the trap command.
  ngram_tmpdir="${TMPDIR}"
  remove_ngram_tmpdir() {
    # rmdir only removes an empty directory; its error message stays visible,
    # but a leftover file must not change the script's exit status.
    rmdir -- "${ngram_tmpdir}" || true
  }
  trap remove_ngram_tmpdir EXIT
fi
12
+
13
#######################################
# Runs a corpus-producing command and emits its output one token per line.
# Arguments: the command (and its arguments) that writes the corpus to stdout
# Outputs:   the command's stdout with every space turned into a newline and
#            runs of newlines squeezed to one (so blank lines disappear).
#            Only the space character separates tokens; tabs are left as-is.
# Returns:   exit status of `tr` (last stage of the pipeline)
#######################################
corp() {
  "$@" | tr -s ' ' '\n'
}
16
+
17
#######################################
# Reads lines on stdin and prints the most frequent ones, most frequent first.
# Arguments: $1 - optional maximum number of lines to print (default 10000)
# Outputs:   "COUNT<TAB>LINE" per distinct input line, sorted by descending
#            count; byte-wise (C locale) ordering is used throughout.
# Returns:   exit status of `head` (last stage of the pipeline)
#######################################
hitparade() (
  # The function body is a subshell so the locale override never leaks into
  # the caller's environment, even when hitparade is not run in a pipeline.
  export LC_ALL=C
  # uniq -c prints "   COUNT LINE": strip the padding, then turn the first
  # remaining space (the one after the count) into a tab.
  sort \
    | uniq -c \
    | sort -nr \
    | sed $'s/^ *//;s/ /\t/' \
    | head -n "${1:-10000}"
)
21
+
22
# Without a corpus command there is nothing to count: explain the expected
# invocation on stderr and stop with a non-zero status.
if [[ $# -eq 0 ]]; then
  # Unquoted here-doc on purpose: $0 is expanded to show the real script
  # name, while "~" stays literal just as it would inside double quotes.
  cat >&2 <<EOF

Error: Expecting some command that cats a corpus as argument(s)
For example:
  \$ $0 xzcat ~/corpora/nno.corp.xz

Will create files ngrams.[1-5]
EOF
  exit 1
fi
31
+
32
# Build ngrams.1 ... ngrams.5, skipping any table that already exists.
#
# An N-gram row is made by pasting the token stream next to copies of itself
# that start at token 2, 3, ... N; each copy re-runs the corpus command given
# on the command line. Rows are tab-separated (paste's default delimiter).
#
# Each table is written to "ngrams.N.part" and renamed only after the pipeline
# finished, so an interrupted run never leaves a truncated ngrams.N behind
# that a later run would mistake for a finished table.
for order in 1 2 3 4 5; do
  out="ngrams.${order}"
  if [[ -f "${out}" ]]; then
    continue
  fi

  case "${order}" in
    1)
      paste <(corp "$@")
      ;;
    2)
      paste <(corp "$@") \
        <(corp "$@" | tail -n +2)
      ;;
    3)
      paste <(corp "$@") \
        <(corp "$@" | tail -n +2) \
        <(corp "$@" | tail -n +3)
      ;;
    4)
      paste <(corp "$@") \
        <(corp "$@" | tail -n +2) \
        <(corp "$@" | tail -n +3) \
        <(corp "$@" | tail -n +4)
      ;;
    5)
      paste <(corp "$@") \
        <(corp "$@" | tail -n +2) \
        <(corp "$@" | tail -n +3) \
        <(corp "$@" | tail -n +4) \
        <(corp "$@" | tail -n +5)
      ;;
  esac | hitparade > "${out}.part"

  mv -- "${out}.part" "${out}"
done
37
+
You can’t perform that action at this time.
0 commit comments