Merge branch 'master' of https://github.com/intersystems/iknow

intersystems · Mar 4, 2022 · 7b58dce · 7b58dce
2 parents 78cffdb + 782cc2e
commit 7b58dce
Show file tree

Hide file tree

Showing 46 changed files with 110,540 additions and 107,331 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -29,7 +29,7 @@ env:
   JSON_INCLUDE: ${{ github.workspace }}/thirdparty/json/single_include
 
 jobs:
-  manylinux2010_x86_64:
+  manylinux2014_x86_64:
     runs-on: ubuntu-20.04
     outputs:
       REF_TESTING_PASSED: ${{ steps.tests.outputs.REF_TESTING_PASSED }}
@@ -57,7 +57,7 @@ jobs:
             ~/ccache
             ~/pipcache
       - name: build and run C++ unit tests
-        run: docker run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2010_x86_64:$MANYLINUX2010_X86_64_TAG /iknow/actions/build_manylinux.sh
+        run: docker container run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2014_x86_64:$MANYLINUX2014_X86_64_TAG /iknow/actions/build_manylinux.sh
       - name: upload wheel artifact
         uses: actions/upload-artifact@v2
         with:
@@ -107,7 +107,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y qemu-user-static binfmt-support
       - name: build and run C++ unit tests
-        run: docker run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2014_aarch64:$MANYLINUX2014_AARCH64_TAG /iknow/actions/build_manylinux.sh
+        run: docker container run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2014_aarch64:$MANYLINUX2014_AARCH64_TAG /iknow/actions/build_manylinux.sh
       - name: upload wheel artifact
         uses: actions/upload-artifact@v2
         with:
@@ -143,7 +143,7 @@ jobs:
           sudo apt-get update
           sudo apt-get install -y qemu-user-static binfmt-support
       - name: build and run C++ unit tests
-        run: docker run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2014_ppc64le:$MANYLINUX2014_PPC64LE_TAG /iknow/actions/build_manylinux.sh
+        run: docker container run --rm -e CCACHE_DIR=/ccache -e PIP_CACHE_DIR=/pipcache -e CCACHE_MAXSIZE=500M -e ICU_URL -e JSON_URL -e CYTHON_VERSION -v ~/ccache:/ccache -v ~/pipcache:/pipcache -v $GITHUB_WORKSPACE:/iknow quay.io/pypa/manylinux2014_ppc64le:$MANYLINUX2014_PPC64LE_TAG /iknow/actions/build_manylinux.sh
       - name: upload wheel artifact
         uses: actions/upload-artifact@v2
         with:
@@ -284,9 +284,9 @@ jobs:
             ${{ github.workspace }}/reference_materials/reports
 
   test_result:
-    needs: [manylinux2010_x86_64, manylinux2014_aarch64, manylinux2014_ppc64le, macosx_10_9_x86_64, windows_x86_64]
+    needs: [manylinux2014_x86_64, manylinux2014_aarch64, manylinux2014_ppc64le, macosx_10_9_x86_64, windows_x86_64]
     runs-on: ubuntu-20.04
-    if: needs.manylinux2010_x86_64.outputs.REF_TESTING_PASSED == '0' || needs.macosx_10_9_x86_64.outputs.REF_TESTING_PASSED == '0' || needs.windows_x86_64.outputs.REF_TESTING_PASSED == '0'
+    if: needs.manylinux2014_x86_64.outputs.REF_TESTING_PASSED == '0' || needs.macosx_10_9_x86_64.outputs.REF_TESTING_PASSED == '0' || needs.windows_x86_64.outputs.REF_TESTING_PASSED == '0'
     steps:
       - name: create comment
         uses: peter-evans/commit-comment@v1
@@ -298,7 +298,7 @@ jobs:
 
   deploy:
     if: (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.ref == 'refs/heads/master'
-    needs: [manylinux2010_x86_64, manylinux2014_aarch64, manylinux2014_ppc64le, macosx_10_9_x86_64, windows_x86_64]
+    needs: [manylinux2014_x86_64, manylinux2014_aarch64, manylinux2014_ppc64le, macosx_10_9_x86_64, windows_x86_64]
     runs-on: ubuntu-20.04
     env:
       PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
@@ -316,10 +316,10 @@ jobs:
           key: ${{ github.job }}-run-${{ github.run_number }}
           restore-keys: ${{ github.job }}-run-
           path: ~/.cache/pip
-      - name: download manylinux2010_x86_64-wheel
+      - name: download manylinux2014_x86_64-wheel
         uses: actions/download-artifact@v2
         with:
-          name: manylinux2010_x86_64-wheel
+          name: manylinux2014_x86_64-wheel
           path: ~/wheels
       - name: download manylinux2014_aarch64-wheel
         uses: actions/download-artifact@v2

diff --git a/actions/build_manylinux.sh b/actions/build_manylinux.sh
@@ -28,6 +28,8 @@ if [ "$PROCESSOR" = aarch64 ] || [ "$PROCESSOR" = ppc64le ]; then
   yum install -y epel-release
   # this mirror is often slow, so disable it
   echo "exclude=csc.mcs.sdsmt.edu" >> /etc/yum/pluginconf.d/fastestmirror.conf
+elif [ "$PROCESSOR" = x86_64 ]; then
+  echo "exclude=mirror.es.its.nyu.edu" >> /etc/yum/pluginconf.d/fastestmirror.conf
 fi
 yum install -y dos2unix ccache
 mkdir -p /opt/ccache
@@ -36,11 +38,6 @@ ln -s /usr/bin/ccache /opt/ccache/c++
 ln -s /usr/bin/ccache /opt/ccache/gcc
 ln -s /usr/bin/ccache /opt/ccache/g++
 export PATH="/opt/ccache:$PATH"
-if [ "$PROCESSOR" = x86_64 ]; then
-  # On manylinux2010_x86_64, the version of ccache is too old to recognize the
-  # CCACHE_MAXSIZE environment variable, so set the max cache size manually.
-  ccache --max-size "$CCACHE_MAXSIZE"
-fi
 
 
 ##### Build ICU if it's not cached #####
@@ -53,7 +50,7 @@ if ! [ -f "$ICUDIR/iknow_icu_url.txt" ] || [ $(cat "$ICUDIR/iknow_icu_url.txt")
 
   dos2unix -f *.m4 config.* configure* *.in install-sh mkinstalldirs runConfigureICU
   export CXXFLAGS="-std=c++11"
-  PYTHON=/opt/python/cp39-cp39/bin/python ./runConfigureICU Linux --prefix="$ICUDIR"
+  PYTHON=/opt/python/cp310-cp310/bin/python ./runConfigureICU Linux --prefix="$ICUDIR"
   gmake -j $(nproc)
   gmake install
   echo "$ICU_URL" > "$ICUDIR/iknow_icu_url.txt"

diff --git a/actions/dependencies.sh b/actions/dependencies.sh
@@ -20,9 +20,9 @@ PYVERSIONS_OSX="3.6.15 3.7.12 3.8.12 3.9.10 3.10.2"
 BUILDCACHE_NAME="Release v0.27.6"
 BUILDCACHE_URL_WIN=https://github.com/mbitsnbites/buildcache/releases/download/v0.27.6/buildcache-windows.zip
 CYTHON_VERSION=0.29.28
-MANYLINUX2010_X86_64_TAG=2022-02-13-594988e
-MANYLINUX2014_AARCH64_TAG=2022-02-13-594988e
-MANYLINUX2014_PPC64LE_TAG=2022-02-13-594988e
+MANYLINUX2014_X86_64_TAG=2022-02-27-769bdbd
+MANYLINUX2014_AARCH64_TAG=2022-02-27-769bdbd
+MANYLINUX2014_PPC64LE_TAG=2022-02-27-769bdbd
 # END DEPENDENCY-AUTOUPDATE SECTION
 
 
@@ -57,6 +57,6 @@ fi
 
 # set variables that will be needed in later steps
 echo "CYTHON_VERSION=$CYTHON_VERSION" >> $GITHUB_ENV
-echo "MANYLINUX2010_X86_64_TAG=$MANYLINUX2010_X86_64_TAG" >> $GITHUB_ENV
+echo "MANYLINUX2014_X86_64_TAG=$MANYLINUX2014_X86_64_TAG" >> $GITHUB_ENV
 echo "MANYLINUX2014_AARCH64_TAG=$MANYLINUX2014_AARCH64_TAG" >> $GITHUB_ENV
 echo "MANYLINUX2014_PPC64LE_TAG=$MANYLINUX2014_PPC64LE_TAG" >> $GITHUB_ENV
diff --git a/actions/update_manylinux.py b/actions/update_manylinux.py
@@ -8,13 +8,13 @@
 
 
 vars = updatelib.get_vars()
-manylinux2010_x86_64_tag = vars['MANYLINUX2010_X86_64_TAG']
+manylinux2014_x86_64_tag = vars['MANYLINUX2014_X86_64_TAG']
 manylinux2014_aarch64_tag = vars['MANYLINUX2014_AARCH64_TAG']
 manylinux2014_ppc64le_tag = vars['MANYLINUX2014_PPC64LE_TAG']
 
 # send Quay API requests and parse responses
 tags = []
-for repo in ('manylinux2010_x86_64', 'manylinux2014_aarch64', 'manylinux2014_ppc64le'):
+for repo in ('manylinux2014_x86_64', 'manylinux2014_aarch64', 'manylinux2014_ppc64le'):
     r = requests.get(f'https://quay.io/api/v1/repository/pypa/{repo}/image')
     json_data = r.json()
     for image in reversed(json_data['images']):
@@ -30,15 +30,15 @@
     tags.append(tag)
 
 # set variables to the latest manylinux image tags
-vars['MANYLINUX2010_X86_64_TAG'] = tags[0]
+vars['MANYLINUX2014_X86_64_TAG'] = tags[0]
 vars['MANYLINUX2014_AARCH64_TAG'] = tags[1]
 vars['MANYLINUX2014_PPC64LE_TAG'] = tags[2]
 updatelib.set_vars(vars)
 
 # set environment variables for next GitHub actions step
 message = []
-if manylinux2010_x86_64_tag != tags[0]:
-    message.append(['manylinux2010_x86_64', manylinux2010_x86_64_tag, tags[0]])
+if manylinux2014_x86_64_tag != tags[0]:
+    message.append(['manylinux2014_x86_64', manylinux2014_x86_64_tag, tags[0]])
 if manylinux2014_aarch64_tag != tags[1]:
     message.append(['manylinux2014_aarch64', manylinux2014_aarch64_tag, tags[1]])
 if manylinux2014_ppc64le_tag != tags[2]:

diff --git a/actions/updatelib.py b/actions/updatelib.py
@@ -82,7 +82,7 @@ def setenv(name, value):
     'BUILDCACHE_NAME',
     'BUILDCACHE_URL_WIN',
     'CYTHON_VERSION',
-    'MANYLINUX2010_X86_64_TAG',
+    'MANYLINUX2014_X86_64_TAG',
     'MANYLINUX2014_AARCH64_TAG',
     'MANYLINUX2014_PPC64LE_TAG'
 }

diff --git a/language_development/en b/language_development/en
diff --git a/language_development/find_examples_for_rule_with_udct.py b/language_development/find_examples_for_rule_with_udct.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Mar 1 2022
+@author: sdebergh
+
+# This Python file uses the following encoding: utf-8
+
+    find_examples_for_rule_with_udct.py is a tool to find sentences in which a given rule is applied, with a user dictionary loaded.
+    Usage: "python find_examples_for_rule_with_udct.py <text files directory> <output file> <language> <rule number> <user dictionary>"
+    Example (on Windows): "python find_examples_for_rule_with_udct.py C:\\TextCorpus\\English\\Financial\\ C:\\output\\rule_531_examples.txt en 531 C:\\repos\\iKnow\\reference_materials\\udct_test_dictionaries\\en_udct.txt"
+                          -> find examples for rule 531 of the English language model
+"""
+
+import sys, os
+
+# do "pip install iknowpy" if iknowpy is not installed
+import iknowpy
+
+# read command line
+in_path_par = sys.argv[1]
+out_path_par = sys.argv[2]
+language_par = sys.argv[3]
+rule_number = sys.argv[4]
+user_dct_par = sys.argv[5]
+
+# functions
+# add a line in the output file
+def write_ln(file_,text_):
+    file_.write((text_+"\r\n").encode('utf8'))
+
+# create a mapping table for rule numbers, based on xx_compiler_report.log
+def create_mapping_table(mapping_file):
+    read_mapping_file = open(mapping_file, encoding='utf-8')
+    for line in read_mapping_file:
+        if line != '\n':
+            mapping = line.split()[0]
+        if ':' in mapping:
+            mapping_table[mapping.split(':')[0]] = mapping.split(':')[1]
+
+# find the matching number in the mapping table
+def extract_rule_id(rule_order):
+    rule_id = mapping_table[rule_order]
+    return rule_id
+
+# read user dictionary
+def read_udct_file(file_,udct_):
+    f_udct = open(file_,"r",True,"utf8")
+    for txt_line in f_udct:
+        # print('txt_line: ' + txt_line)
+        txt_line = txt_line.rstrip()
+
+        if ',' in txt_line and txt_line[0:2] != '/*':
+            txt_list = txt_line.split(',')
+            lexrep, action = txt_list[0], txt_list[1]
+            if (lexrep[0] == '@'):
+                literal = lexrep[1:]
+                if action == "UDCertainty":
+                    level = txt_list[2]
+                    udct_.add_certainty_level(literal,int(level[2]))
+                else:
+                    ret = udct_.add_label(literal,action)
+                    if (ret == -2):
+                        print('label ' + action + ' not valid !')
+            else: # Set end = $SELECT(command = "\end":1,command = "\noend":0,1:..Err())
+                if action == "\\end":
+                    udct_.add_sent_end_condition(lexrep, True)
+                elif action == "\\noend":
+                    udct_.add_sent_end_condition(lexrep, False)
+                else:
+                    print('action ' + action + ' not valid !')
+
+    f_udct.close()
+
+
+
+# initiate variables  
+mapping_file = language_par + "_compiler_report.log" # detect applicable xx_compiler_report.log based on language code
+mapping_table = {}   
+f_rec = []
+engine = iknowpy.iKnowEngine()
+
+# load user dictionary
+user_dictionary = iknowpy.UserDictionary()
+read_udct_file(user_dct_par, user_dictionary)
+ret = engine.load_user_dictionary(user_dictionary)
+
+
+print('Looking for examples for rule ' + rule_number + ' of the ' + language_par + ' language model in ' + in_path_par)
+
+
+
+
+# make a list of input file (recursive list of files, .txt only) - copied from https://stackoverflow.com/questions/18394147/recursive-sub-folder-search-and-return-files-in-a-list-python
+f_rec = [os.path.join(dp, f) for dp, dn, filenames in os.walk(in_path_par) for f in filenames if
+                  os.path.splitext(f)[1].lower() == '.txt']
+
+
+# create mapping table for rule numbers
+create_mapping_table(mapping_file)
+
+
+# open output file and add UTF-8 BOM and information about the content of the file
+if os.path.exists(out_path_par):
+        os.remove(out_path_par)
+f_output = open(out_path_par, "ab")
+f_output.write(b'\xef\xbb\xbf') # Utf8 BOM
+write_ln(f_output, 'Examples for rule ' + rule_number + ' of the ' + language_par + ' language model in ' + in_path_par + '\n')
+
+# read input files one by one
+for text_file in f_rec:
+    print('processing ' + text_file)
+    f_text = open(text_file, "rb")
+    header = f_text.read(3)
+    if (header == b'\xef\xbb\xbf'): # check for Utf8 BOM
+        header = b''    # remove BOM
+    text = header + f_text.read() # read text, must be utf8 encoded
+    text = text.decode('utf8') # decode text to Unicode
+    f_text.close()
+
+    # index input file
+    engine.index(text, language_par, traces=True)
+
+    # read trace output
+    for trace in engine.m_traces:
+#        print(trace)
+        key, value = trace.split(':', 1)[0],trace.split(':', 1)[1]
+        # store the sentence
+        if (key == "SentenceFound"):
+            Sentence = value.split('"')[7]
+            if len(value.split('"')) > 9:  # i.e. if the sentence contains quotes (")
+                for i in range(8, len(value.split('"')) - 1):
+                    Sentence = Sentence + value.split('"')[i]
+
+        # check if the demanded rule is applied to process the sentence    
+        elif (key == "RuleApplication"):
+            # rule_id in trace refers actually to rule order -> retrieve rule order value
+            rule_order = value.split(';')[0].split('=')[1]
+            # extract the number that corresponds to the rule id in rules.csv from compiler_report.log
+            rule_id = extract_rule_id(rule_order)
+            # if the rule id corresponds to the demanded rule number, look for the concerned lexreps 
+            if rule_id == rule_number:
+                lexreps = value.split(';')[3:]
+                lexreps = str(lexreps)
+                lexreps_indexes = ''
+                while 'index=' in lexreps:
+                    lexreps_indexes = lexreps_indexes + ' ' + lexreps[lexreps.find('index=\"')+7:lexreps.find('labels=')-2]
+                    lexreps = lexreps[lexreps.find('labels=')+7:]  # cut off left part of lexreps information in order to julp to the next lexrep
+                # add the concerned lexrep(s) and the sentence to the output
+                #print(lexreps_indexes.lstrip())
+                write_ln(f_output, lexreps_indexes.lstrip() + ';' + Sentence)
+
+
+f_output.close()
diff --git a/language_development/genTrace-with-udct.py b/language_development/genTrace-with-udct.py
@@ -156,6 +156,7 @@ def read_udct_file(file_,udct_):
             write_ln(f_trace, key + ':' + updated_value)
 #            pass
         elif (key == "RuleApplicationResult"):  # use this only when the code for 'RuleApplication' is activated too
+            updated_value = 'rule_id=' + rule_id + value[first_semicolon:]
             write_ln(f_trace, key + ':' + updated_value)
             # pass
         elif (key == "JoinResult"):

diff --git a/language_models/en/labels.csv b/language_models/en/labels.csv
@@ -247,6 +247,10 @@
 ;1,75,$;ENDuration;typeAttribute;attribute for duration indications;0;;Entity(Duration)
 ;1,$;ENTemp;typeAttribute;attribute for vague time indications;0;;Entity(DateTime)
 ;1,55,75,$;ENTime;typeAttribute;attribute for concrete time indications;0;;Entity(DateTime)
+;1,75,$;ENDateTimeBegin;typeAttribute;added for enabling path expansion;0;;Path(Begin,DateTime)
+;1,75,$;ENDateTimeStop;typeAttribute;added for enabling path expansion;0;;Path(End,DateTime)
+;1,75,$;ENMeasurementBegin;typeAttribute;added for enabling path expansion;0;;Path(Begin,Measurement)
+;1,75,$;ENMeasurementStop;typeAttribute;added for enabling path expansion;0;;Path(End,Measurement)
 
 ;1,62,65,70,71,75,$;ENDummy;typeAttribute;dummy attribute to temporarily mark lexreps;0;;
 ;40,55,65,70,71,77,78,79;ENDummy2;typeAttribute;extra dummy attribute;0;;
@@ -295,10 +299,6 @@
 ;75,$;ENNoMeasurement;typeAttribute;mark numbers that are not measurements;0;;
 ;75;ENInMeasspan;typeAttribute;marker for lexreps in a measurement span;0;;
 
-;1,75,$;ENDateTimeBegin;typeAttribute;added for enabling path expansion;0;;Path(Begin,DateTime)
-;1,75,$;ENDateTimeStop;typeAttribute;added for enabling path expansion;0;;Path(End,DateTime)
-;1,75,$;ENMeasurementBegin;typeAttribute;added for enabling path expansion;0;;Path(Begin,Measurement)
-;1,75,$;ENMeasurementStop;typeAttribute;added for enabling path expansion;0;;Path(End,Measurement)
 
 ;75,$;ENNumberPlusUnit2;typeAttribute;copy of NumberPlusUnit, not deleted in join rules;0;;