Skip to content

Commit c524ad1

Browse files
committed
start to upload script for korp
1 parent 0c578d4 commit c524ad1

File tree

7 files changed

+494
-0
lines changed

7 files changed

+494
-0
lines changed

korp_scripts/timedata/README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Steps to create/update MySQL tables for time data.
2+
3+
1. Extract time stamps from the vrt corpus
4+
Adjust the `cLang`, `cDomain` and `date` parameters in `extract_time_stamp.xsl`.
5+
The script expects the vrt files in the folder `vrt_<cLang>_<date>`.
6+
7+
2. Run the script
8+
`java -Xmx16800m -Dfile.encoding=UTF8 net.sf.saxon.Transform -it:main extract_time_stamp.xsl`
9+
If the above doesn't work, specify the path to saxon9.jar explicitly with `-cp`:
10+
`java -Xmx2048m -cp ~/main/tools/TermWikiExporter/lib/saxon9.jar -Dfile.encoding=UTF8 net.sf.saxon.Transform -it:main extract_time_stamp.xsl`
11+
output:
12+
`timestamp_<cLang>_<date>/metacheck_<cLang>_<cDomain>_<date>.txt`
13+
14+
3. Sort-uniq all extracted years
15+
`awk '{print $2}' metacheck_<cLang>_<cDomain>_<date>.txt | sort | uniq > all_years_<cLang>_<cDomain>.txt`
16+
17+
4. Adjust `lang, domain, date` variables in generate_tables.sh
18+
19+
5. Run the script
20+
`sh generate_tables.sh`
21+
output: `timespan_<cLang>_<cDomain>_<date>.sql`
22+
23+
6. Open `timespan_<cLang>_<cDomain>_<date>.sql` and replace `TESTCORPUS` by `<cLang>_<cDomain>_<date>` (in capitals)
24+
at `timespan_<cLang>_<cDomain>_<date>.sql`
25+
26+
7. Import the timespan data into MySQL:
27+
`cat timespan_<cLang>_<cDomain>_<date>.sql | mysql -u korp -p korp_DB`
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
<?xml version="1.0"?>
2+
<!--+
3+
|
4+
| extract the datefrom, dateto and token_count attributes of every text element in the vrt files under inDir into a single text file
5+
| Usage: java net.sf.saxon.Transform -it main STYLESHEET_NAME.xsl inDir=INPUT_DIR
6+
+-->
7+
8+
<xsl:stylesheet version="2.0"
9+
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
10+
xmlns:xs="http://www.w3.org/2001/XMLSchema"
11+
exclude-result-prefixes="xsl xs">
12+
13+
<xsl:strip-space elements="*"/>
14+
15+
<xsl:output method="xml" name="xml"
16+
encoding="UTF-8"
17+
omit-xml-declaration="yes"
18+
indent="no"/>
19+
<xsl:output method="xml" name="html"
20+
encoding="UTF-8"
21+
omit-xml-declaration="yes"
22+
indent="yes"/>
23+
<xsl:output method="text" name="txt"
24+
encoding="UTF-8"/>
25+
26+
<xsl:param name="inDir" select="concat('vrt_', $cLang, '_', $date)"/>
27+
<xsl:param name="date" select="'20210520'"/>
28+
<xsl:param name="cDomain" select="'science'"/>
29+
<xsl:param name="cLang" select="'smj'"/>
30+
31+
<xsl:variable name="cID" select="concat($cLang,'_',$cDomain,'_',$date)"/>
32+
<xsl:variable name="outDir" select="concat('timestamp_',$cLang,'_',$date)"/>
33+
<xsl:variable name="fileName" select="concat('metacheck_',$cLang,'_',$cDomain,'_',$date)"/>
34+
35+
<xsl:variable name="oe" select="'txt'"/>
36+
<xsl:variable name="tb" select="'&#9;'"/>
37+
<xsl:variable name="nl" select="'&#xA;'"/>
38+
<xsl:variable name="debug" select="false()"/>
39+
<xsl:variable name="ws" select="'&#x20;&#xD;&#xA;&#x9;'"/>
40+
41+
<xsl:template match="/" name="main">
42+
<xsl:message terminate="no">
43+
<xsl:value-of select="concat('Processing data from dir: ', $inDir)"/>
44+
</xsl:message>
45+
46+
<!-- output -->
47+
<xsl:result-document href="{$outDir}/{$fileName}.txt" format="{$oe}">
48+
49+
<xsl:for-each select="for $f in collection(concat($inDir, '?select=*.vrt;recurse=yes;on-error=warning')) return $f">
50+
51+
<xsl:variable name="current_file" select="substring-before((tokenize(document-uri(.), '/'))[last()], '.vrt')"/>
52+
<xsl:variable name="current_dir" select="substring-before(document-uri(.), $current_file)"/>
53+
<xsl:variable name="current_location" select="concat($inDir, substring-after($current_dir, $inDir))"/>
54+
55+
<xsl:call-template name="processFile">
56+
<xsl:with-param name="file" select="."/>
57+
<xsl:with-param name="name" select="$current_file"/>
58+
<xsl:with-param name="ie" select="'vrt'"/>
59+
<xsl:with-param name="relPath" select="$current_location"/>
60+
</xsl:call-template>
61+
</xsl:for-each>
62+
</xsl:result-document>
63+
64+
</xsl:template>
65+
66+
<!-- process file -->
67+
<xsl:template name="processFile">
68+
<xsl:param name="file"/>
69+
<xsl:param name="name"/>
70+
<xsl:param name="ie"/>
71+
<xsl:param name="relPath"/>
72+
73+
<xsl:message terminate="no">
74+
<xsl:value-of select="concat('file: ', $relPath, $name, '.', $ie)"/>
75+
</xsl:message>
76+
77+
<xsl:for-each select="$file//text">
78+
<xsl:value-of select="concat(./@datefrom, ' ', ./@dateto, ' ', ./@token_count, ' ', $nl)"/>
79+
</xsl:for-each>
80+
81+
<xsl:if test="$debug">
82+
<xsl:message terminate="no">
83+
<xsl:value-of select="concat(' Done!',' Output file ',$name,' in: ', $outDir)"/>
84+
</xsl:message>
85+
</xsl:if>
86+
87+
</xsl:template>
88+
89+
</xsl:stylesheet>
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/bin/sh
2+
3+
# ${string,,} toLower
4+
# ${string^^} toUpper
5+
# ${string,,[AEIUO]}
6+
# ${string^^[aeiou]}
7+
8+
#l_corpus_name="sms_wikipedia_20161208"
9+
#u_corpus_name=${l_corpus_name^^}
10+
11+
lang="smj"
12+
ulang=$(echo $lang | tr '[a-z]' '[A-Z]')
13+
domain="science"
14+
udomain=$(echo $domain | tr '[a-z]' '[A-Z]')
15+
date="20210520"
16+
metafile="timestamp_"${lang}"_"${date}"/metacheck_"${lang}"_"${domain}"_"${date}".txt"
17+
db=${ulang}"_"${udomain}"_"${date}
18+
ylist='all_years_'${ulang}'_'${udomain}'.txt'
19+
target="timespan_${lang}_${domain}_${date}.sql"
20+
21+
cp form_timespan.sql $target
22+
23+
#sed -i 's/TESTCORPUS/${db}/g' $target
24+
25+
awk '{print $2}' $metafile |sort|uniq> $ylist
26+
27+
for y in $(cat $ylist)
28+
do
29+
echo "year is $y"
30+
year=$(echo $y |cut -c 1-4)
31+
sum=$(grep "^$y" $metafile|awk '{count=count+$NF}END{print count}')
32+
echo "INSERT INTO \`timedata\` (corpus, datefrom, dateto, tokens) VALUES">>$target
33+
echo "('${db}', '${year}0101000000', '${year}1231235959', $sum);">>$target
34+
echo "INSERT INTO \`timedata_date\` (corpus, datefrom, dateto, tokens) VALUES">>$target
35+
echo "('${db}', '${year}0101', '${year}1231', $sum);">>$target
36+
echo "==============="
37+
done

korp_scripts/word_picture/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Steps to create/update MySQL tables for word picture.
2+
Skip step 0 if korp database and korp user already exist.
3+
4+
0. Create the korp database `korp_DB`, create the user `korp` and grant it full access to the database
5+
`mysql -u root -p`
6+
In mysql shell run:
7+
`CREATE DATABASE korp_DB character set utf8 collate utf8_bin;`
8+
`CREATE USER 'korp'@'localhost' IDENTIFIED BY '<password>';`
9+
`GRANT ALL ON korp_DB.* TO korp@localhost;`
10+
NB. The database name, user name and password must be the same as in settings_not_in_svn.py (see point 2)
11+
12+
1. Create/reset tables in korp_DB
13+
`mysql -u root -p`
14+
NB. First replace corpus name as needed in `_relations.sql`, i.e. replace "SME_ADMIN_20181106".
15+
In mysql shell run:
16+
`source ~/main/apps/korp/word_picture/_relations.sql`
17+
18+
2. Fill mysql tables
19+
NB. All paths and passwords are stored in settings_not_in_svn.py, which is not in svn!
20+
Copy settings_not_in_svn.template to settings_not_in_svn.py and replace paths and passwords as needed.
21+
`python insert.py`
22+
NB. In the current version, the script will fill tables with strings and relations present in corpus.
23+
It is possible to search for both base forms and word forms, but results are collected by base form.
24+
This means that if "lean" is the search word (which has base form "leat"), all relations for word forms with lemma "leat" are presented.
25+
The results are always displayed as base forms (same as for the Swedish version).
26+
27+
3. Whenever the corpus or the tables are updated, run:
28+
`rm -rf /tmp/gt_korp_2018_WSGI/*`
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
SET @@session.long_query_time = 1000;
2+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106`;
3+
CREATE TABLE `temp_relations_SME_ADMIN_20181106` (
4+
`id` int(11) NOT NULL DEFAULT 0,
5+
`head` int(11) NOT NULL DEFAULT 0,
6+
`rel` varchar(15) NOT NULL DEFAULT 'V',
7+
`dep` int(11) NOT NULL DEFAULT 0,
8+
`freq` int(11) NOT NULL DEFAULT 0,
9+
`bfhead` BOOL NOT NULL,
10+
`bfdep` BOOL NOT NULL,
11+
`wfhead` BOOL NOT NULL,
12+
`wfdep` BOOL NOT NULL,
13+
PRIMARY KEY (`head`, `wfhead`, `dep`, `rel`),
14+
INDEX `dep-wfdep-head-rel-freq-id` (`dep`, `wfdep`, `head`, `rel`, `freq`, `id`),
15+
INDEX `head-dep-bfhead-bfdep-rel-freq-id` (`head`, `dep`, `bfhead`, `bfdep`, `rel`, `freq`, `id`),
16+
INDEX `dep-head-bfhead-bfdep-rel-freq-id` (`dep`, `head`, `bfhead`, `bfdep`, `rel`, `freq`, `id`)) default charset = utf8 row_format = compressed ;
17+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106_strings`;
18+
CREATE TABLE `temp_relations_SME_ADMIN_20181106_strings` (
19+
`id` int(11) NOT NULL DEFAULT 0,
20+
`string` varchar(500) NOT NULL DEFAULT '',
21+
`stringextra` varchar(32) NOT NULL DEFAULT '',
22+
`pos` varchar(15) NOT NULL DEFAULT '',
23+
`lemma` varchar(500) NOT NULL DEFAULT '',
24+
PRIMARY KEY (`string`, `id`, `pos`, `stringextra`),
25+
INDEX `id-string-pos-stringextra` (`id`, `string`, `pos`, `stringextra`)) default charset = utf8 collate = utf8_bin row_format = compressed ;
26+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106_rel`;
27+
CREATE TABLE `temp_relations_SME_ADMIN_20181106_rel` (
28+
`rel` varchar(15) NOT NULL DEFAULT 'V',
29+
`freq` int(11) NOT NULL DEFAULT 0,
30+
PRIMARY KEY (`rel`)) default charset = utf8 collate = utf8_bin row_format = compressed ;
31+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106_head_rel`;
32+
CREATE TABLE `temp_relations_SME_ADMIN_20181106_head_rel` (
33+
`head` int(11) NOT NULL DEFAULT 0,
34+
`rel` varchar(15) NOT NULL DEFAULT 'V',
35+
`freq` int(11) NOT NULL DEFAULT 0,
36+
PRIMARY KEY (`head`, `rel`)) default charset = utf8 collate = utf8_bin row_format = compressed ;
37+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106_dep_rel`;
38+
CREATE TABLE `temp_relations_SME_ADMIN_20181106_dep_rel` (
39+
`dep` int(11) NOT NULL DEFAULT 0,
40+
`rel` varchar(15) NOT NULL DEFAULT 'V',
41+
`freq` int(11) NOT NULL DEFAULT 0,
42+
PRIMARY KEY (`dep`, `rel`)) default charset = utf8 collate = utf8_bin row_format = compressed ;
43+
DROP TABLE IF EXISTS `temp_relations_SME_ADMIN_20181106_sentences`;
44+
CREATE TABLE `temp_relations_SME_ADMIN_20181106_sentences` (
45+
`id` int(11) DEFAULT NULL,
46+
`sentence` varchar(64) NOT NULL DEFAULT '',
47+
`start` int(11) DEFAULT NULL,
48+
`end` int(11) DEFAULT NULL,
49+
INDEX `id` (`id`)) default charset = utf8 collate = utf8_bin row_format = compressed ;
50+
ALTER TABLE `temp_relations_SME_ADMIN_20181106` DISABLE KEYS;
51+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_strings` DISABLE KEYS;
52+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_rel` DISABLE KEYS;
53+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_head_rel` DISABLE KEYS;
54+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_dep_rel` DISABLE KEYS;
55+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_sentences` DISABLE KEYS;
56+
SET FOREIGN_KEY_CHECKS = 0;
57+
SET UNIQUE_CHECKS = 0;
58+
SET AUTOCOMMIT = 0;
59+
SET NAMES utf8;
60+
ALTER TABLE `temp_relations_SME_ADMIN_20181106` ENABLE KEYS;
61+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_strings` ENABLE KEYS;
62+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_rel` ENABLE KEYS;
63+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_head_rel` ENABLE KEYS;
64+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_dep_rel` ENABLE KEYS;
65+
ALTER TABLE `temp_relations_SME_ADMIN_20181106_sentences` ENABLE KEYS;
66+
DROP TABLE IF EXISTS `relations_SME_ADMIN_20181106`, `relations_SME_ADMIN_20181106_strings`, `relations_SME_ADMIN_20181106_rel`, `relations_SME_ADMIN_20181106_head_rel`, `relations_SME_ADMIN_20181106_dep_rel`, `relations_SME_ADMIN_20181106_sentences`;
67+
RENAME TABLE `temp_relations_SME_ADMIN_20181106` TO `relations_SME_ADMIN_20181106`, `temp_relations_SME_ADMIN_20181106_strings` TO `relations_SME_ADMIN_20181106_strings`, `temp_relations_SME_ADMIN_20181106_rel` TO `relations_SME_ADMIN_20181106_rel`, `temp_relations_SME_ADMIN_20181106_head_rel` TO `relations_SME_ADMIN_20181106_head_rel`, `temp_relations_SME_ADMIN_20181106_dep_rel` TO `relations_SME_ADMIN_20181106_dep_rel`, `temp_relations_SME_ADMIN_20181106_sentences` TO `relations_SME_ADMIN_20181106_sentences`;
68+
SET UNIQUE_CHECKS = 1;
69+
SET FOREIGN_KEY_CHECKS = 1;
70+
COMMIT;

0 commit comments

Comments
 (0)