Skip to content

Commit

Permalink
Extend Measurement detection in Swedish (#211)
Browse files Browse the repository at this point in the history
Focus on number+unit in one string, e.g. 20mg,
and on ranges, e.g. 218-263 GHz.
  • Loading branch information
ISC-SDE committed Mar 15, 2022
1 parent 9b37bf9 commit e6a5d46
Show file tree
Hide file tree
Showing 33 changed files with 22,643 additions and 21,348 deletions.
5 changes: 4 additions & 1 deletion language_models/sv/labels.csv
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,14 @@
;1,15,30,40,45,65,70,75,76,80,$;SVbCon;typeBeginConcept;first word of a Concept;0;;
;1,65,70,75,76,80,85,$;SVbRel;typeBeginRelation;first word of a Relation;0;;
;1,30;SVClocktime;typeConcept;hh.mm;0;;
;1,30;SVClocktime2;typeConcept;hh:mm -> can be time or price;0;;
;1,30,35;SVClocktime2;typeConcept;hh:mm -> can be time or price;0;;
;1,35;SVColonnumber;typeConcept;d(d+):d+, not in clock time range;0;;
;30,40,60,65,70,75,76,$;SVComma;typeOther;comma;0;;
;1,15,40,75,76,$;SVConIfCap;typeConcept;capitalized noun or name;0;;
;1,15,40,75,76,$;SVConIfAllCap;typeConcept;acronym in upper case;0;;
;1,15,40,45;SVConpart1;typeConcept;first part of a concept;0;;
;1,15,25,40,60,65,70,75,76,80,85,$;SVConj;typeRelation;conjunction;0;;
;1,35;SVCurrency;typeConcept;currency name;0;;
;1,45,60,65,70;SVCPron;typeConcept;extra label for Concept-pronouns;0;;
;1,15,30,45,50,75,76,$;SVDay;typeConcept;name of day;0;;
;1,30,40,75,76;SVDecinum;typeConcept;extra label for decimal numbers;0;;
Expand Down Expand Up @@ -98,6 +100,7 @@
;1,30;SV3dNum;typeConcept;3 digits, can be part of larger number;0;;
;1,25,30,35,40,45,65,70,75,76,80,$;SVNum;typeConcept;number written in digits;0;;
;1,15,25,30,35,40,45,65,75,76,$;SVNumber;typeConcept;number;0;;
;1,35,$;SVNumberPlusUnit;typeConcept;general label for all numbers plus units without space;0;;Entity(Measurement,Value,Unit)
;1,30;SVNumpart2;typeConcept;plural numbers like 'miljoner';0;;
;1,15,40,45,60,65,70,75,76,80,$;SVObjpron;typePathRelevant;object form of personal pronoun;0;;
;1,15,30,40,45,75,76,$;SVOrdnumber;typeConcept;ordinal number;0;;
Expand Down
53 changes: 38 additions & 15 deletions language_models/sv/lexreps.csv
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,6 @@
;;(1|2|3|4|5|6|7|8|9)(0|1|2|3|4|5|6|7|8|9)-årige man;;SVAdj;-;SVCon;




/* Lexreps for special characters

;;\(1\);;SVLayout;
Expand Down Expand Up @@ -96330,10 +96328,12 @@
;;centimeter;;SVUnit;SVCon;
;;cm;;SVUnit;SVCon;
;;deciliter;;SVUnit;SVCon;
;;dollar;;SVUnit;SVCon;
;;euro;;SVUnit;SVCon;
;;dollar;;SVCurrency;SVUnit;SVCon;
;;dollarn;;SVCurrency;SVCon;
;;euro;;SVCurrency;SVUnit;SVCon;
;;euron;;SVCurrency;SVCon;
;;fot;;SVUnit;SVCon;
;;franc;;SVUnit;SVCon;
;;franc;;SVCurrency;SVUnit;SVCon;
;;g;;SVUnit;SVCon;
;;g/l;;SVUnit;SVCon;
;;gbps;;SVUnit;SVCon;
Expand All @@ -96348,7 +96348,8 @@
;;gram;;SVUnit;SVCon;
;;hektar;;SVUnit;SVCon;
;;hertz;;SVUnit;SVCon;
;;hongkongdollar;;SVUnit;SVCon;
;;hongkongdollar;;SVCurrency;SVUnit;SVCon;
;;hryvnia;;SVCurrency;SVUnit;SVCon;
;;hz;;SVUnit;SVCon;
;;kalorier;;SVUnit;SVCon;
;;kilo;;SVUnit;SVCon;
Expand All @@ -96364,8 +96365,8 @@
;;kr;;SVUnit;SVCon;
;;kr/hektar;;SVUnit;SVCon;
;;kr/kg;;SVUnit;SVCon;
;;krona;;SVUnit;SVCon;
;;kronor;;SVUnit;SVCon;
;;krona;;SVCurrency;SVUnit;SVCon;
;;kronor;;SVCurrency;SVUnit;SVCon;
;;kvadratmeter;;SVUnit;SVCon;
;;l;;SVUnit;SVCon;
;;l/min;;SVUnit;SVCon;
Expand Down Expand Up @@ -96407,13 +96408,13 @@
;;procent;;SVUnit;SVCon;
;;procentenhet;;SVUnit;SVCon;
;;procentenheter;;SVUnit;SVCon;
;;pund;;SVUnit;SVCon;
;;riksdaler;;SVUnit;SVCon;
;;pund;;SVCurrency;SVUnit;SVCon;
;;riksdaler;;SVCurrency;SVUnit;SVCon;
;;slag/minut;;SVUnit;SVCon;
;;ton;;SVUnit;SVCon;
;;yen;;SVUnit;SVCon;
;;öre;;SVUnit;SVCon;
;;ören;;SVUnit;SVCon;
;;yen;;SVCurrency;SVUnit;SVCon;
;;öre;;SVCurrency;SVUnit;SVCon;
;;ören;;SVCurrency;SVUnit;SVCon;
;;års ålder;;SVUnit;SVCon;SVAge;
;;µg;;SVUnit;SVCon;
;;µm;;SVUnit;SVCon;
Expand Down Expand Up @@ -98529,8 +98530,6 @@
;;{othernumber};;SVNum;

;;{range01};1 to 3 digits - 1 to 3 digits;SVNumber;
;;{range02};1digit-2digits;SVNumber;
;;{range03};2digit-2digits;SVNumber;

;;{noun01};-[aeo]rna;SVCon;SVPluralnoun;

Expand All @@ -98552,3 +98551,27 @@
;;{time02};hh:mm;SVCon;SVClocktime2;
;;{time03};hh[.:]mm-tiden;SVCon;SVClocktime;


/* Measurements
;;{meas01};;SVCon;SVNumberPlusUnit;
;;{meas02};;SVCon;SVNumberPlusUnit;
;;{meas03};;SVCon;SVNumberPlusUnit;
;;{meas04};;SVCon;SVNumberPlusUnit;
;;{meas05};percentage with decimal number with period;SVCon;SVNumberPlusUnit;
;;{othernumber}(°c|°|%|grader|g|k|l|m|s|t);;SVCon;SVNumberPlusUnit;
;;{othernumber}(k|m|n|µ)(g|k|l|m);;SVCon;SVNumberPlusUnit;
;;{othernumber}(k|m|n|µ)(g|k|l|m)/(grader|h|kg|km|l|m2|m3|m²|m³|m);;SVCon;SVNumberPlusUnit;
;;{othernumber}år;;SVCon;SVAge;SVNumberPlusUnit;
;;{numberdecimal}(°c|°|%|grader|g|k|l|m|s|t);;SVCon;SVNumberPlusUnit;
;;{numberdecimal}(k|m|n|µ)(g|k|l|m);;SVCon;SVNumberPlusUnit;
;;{numberdecimal}(k|m|n|µ)(g|k|l|m)/(grader|h|kg|km|l|m2|m3|m²|m³|m);;SVCon;SVNumberPlusUnit;
;;{numberdecimal}år;;SVCon;SVAge;SVNumberPlusUnit;
;;{range01}(°c|°|%|grader|g|k|l|m|s|t);;SVCon;SVNumberPlusUnit;
;;{range01}(k|m|n|µ)(g|k|l|m);;SVCon;SVNumberPlusUnit;
;;{range01}(k|m|n|µ)(g|k|l|m)/(grader|h|kg|km|l|m2|m3|m²|m³|m);;SVCon;SVNumberPlusUnit;
;;{range01}år;;SVCon;SVAge;SVNumberPlusUnit;

;;{meas06};dd:dd but not in clock time range;SVColonnumber;SVCon;
;;{meas07};dd:dd but not in clock time range;SVColonnumber;SVCon;
;;{meas08};ddd:d+ -> not in clock time range;SVColonnumber;SVCon;

1 change: 1 addition & 0 deletions language_models/sv/metadata.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
ConceptsToMergeMax;8;
LanguageCode;sv
PathConstruction;PR;
ValUnitRegexSplitter;([a-zA-Z$€£¥]+)?([0-9\.,\- /]*[0-9])[ -]?([a-zA-Zäöå%°]*);
18 changes: 14 additions & 4 deletions language_models/sv/regex.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@ numberordinal;\d+:e
/* Remaining numbers
othernumber;[\-\<\>\+]?\d+

/* Range < 100
range01;[0-9]\-[0-9]
range02;[0-9]\-[0-9][0-9]
range03;[0-9][0-9]\-[0-9][0-9]
/* Measurements
meas01;\d+[gklmst]
meas02;\d+[mkµn][gklm]
meas03;\d+år
meas04;\d+[mkµn][gklm]\/(h|kg|km|l|m2|m3|m²|m³|m)
meas05;\d+\.\d+%


/* Range < 1000
range01;\d\d?\d?\-\d\d?\d?

/* Big numbers
/*bignumber01;\d\d\d\d+
Expand All @@ -37,6 +43,10 @@ time01;[012]?[0-9]\.[0-5][0-9]
time02;[012]?[0-9]:[0-5][0-9]
time03;[012]?[0-9][\.:][0-5][0-9]-tiden

/* Amount of money
meas06;[3-9][0-9]:[0-9][0-9]
meas07;\d{1,2}:[6-9][0-9]
meas08;\d\d\d+:\d+

/* Layout
layout01;[#*\-+=][#*\-+=][#*\-+=][#*\-+=]+
Expand Down
7 changes: 7 additions & 0 deletions language_models/sv/rules.csv
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,10 @@
/* 1385. Man räknar med att 25 till 30 procent av världens utsläpp av växthusgaser kommer från jordbruk.
1377;35;SVNum:SVNumber|SVUnit;+Measurement+ValueProperty|+Measurement+UnitProperty;
/* 1377. Dagstemperaturer på över 25 °C (s.k. högsommardagar) inträffar i genomsnitt 16 gånger per år.
1388;35;SVColonnumber:SVClocktime2+^SVTime|SVCurrency;+SVNumber+Measurement+ValueProperty|+Measurement+UnitProperty;
/* 1388. Ukrainas hryvnia var på en historisk bottennivå i måndags, med en kurs på 11:65 hryvnia per dollar.
1389;35;SVColonnumber:SVClocktime2+^SVTime|SVPrep|SVCurrency;+SVNumber+Measurement+ValueProperty|*|*;
/* 1389. Under förmiddagen har kronan försvagats 3 öre till 6:55 mot dollarn, medan den har försvagats 4 öre till 8:91 mot euron.
1380;35;"över":"under"+SVAgeprep|SVNum:SVNumber|SVAge;JoinReverse|*|*;
/* 1380. Med barn avses varje människa under 18 år.
1384;35;SVPrep|"över":"under"|SVNum;*|JoinReverse|*;
Expand All @@ -441,6 +445,9 @@
/* 1379. Projektet saknar mätningar på barn i åldern mellan 5 och 6 månader.
1378;35;*SVNummodifier|ValueProperty;JoinReverse|+SVbCon;
/* 1378. I fjol inbringade välgörenhetssatsningen drygt 23,3 miljoner kronor för rent vatten i slumområden och i år växlade givarna upp ytterligare fem miljoner kronor.
/*1390;35;*SVNummodifier|SVNumberPlusUnit;JoinReverse|*; -> ValUnitRegexSplitter makes the Nummodifier part of the unit, e.g. value 30, unit ca%
1390;35;*SVNummodifier|SVNumberPlusUnit;-SVPrep+SVAdvDegree|*;
/* 1390. Ca 30% blir symtomfria efter en vecka utan antibiotikabehandling.
1383;35;SVNum:SVNumber|SVUnit|SVPostmeas;*|*|+Join;
/* 1383. Motsvarande siffra för Frankfurtbörsen ligger på en nedgång om 0,2 procent medan Parisbörsen ser ut att gå mot en öppning 0,2 procent högre.

Expand Down
Loading

0 comments on commit e6a5d46

Please sign in to comment.