From 0524c660cade4606c253633a81ad52f7c37056dc Mon Sep 17 00:00:00 2001
From: ericwimsatt
Date: Sun, 5 Jun 2022 16:18:43 -0700
Subject: [PATCH 1/3] adds junk text category

---
 consensus_and_scoring/TriagerScoring.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index b2f1cdc..54fd51f 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -225,6 +225,11 @@ def determinePassingIndices(starts, ends, numUsers, users, length, category):
             'passingFunc': evalThresholdMatrix,
             'scale': 1.8
         },
+        'Junk Text':
+        {
+            'passingFunc': ignoreThis,
+            'scale': 0
+        },
     }
     passFunc = actionDeterminant[category]['passingFunc']
     scale = actionDeterminant[category]['scale']
@@ -245,6 +250,9 @@ def findPassingIndices(starts, ends, numUsers, users, length, passingFunc = eval
             passersArray[i] = 1
     return passersArray
 
+def ignoreThis(percent, TotalNumUsers, scale):
+    return 'X'
+
 def minPercent(percent, totalNumUsers, scale):
     if percent>=scale:
         return 'H'
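Note on how the 'Junk Text' entry above is meant to behave: determinePassingIndices dispatches on category through the actionDeterminant table, and the new entry routes junk-text highlights to ignoreThis, which returns the sentinel 'X' instead of 'H'/'L'. A minimal, self-contained sketch of that table-dispatch idea follows; the category name 'Quoted Source', the function names, and the way the caller skips 'X' are illustrative assumptions, not the repository's exact logic.

    # Sketch only: per-category (passing function, scale) dispatch with a
    # "Junk Text" entry that opts out of scoring via an 'X' sentinel.

    def min_percent(percent, total_num_users, scale):
        # Stand-in for a minPercent-style scorer: 'H' passes, 'L' does not.
        return 'H' if percent >= scale else 'L'

    def ignore_this(percent, total_num_users, scale):
        # Junk text is never scored; return a sentinel the caller skips.
        return 'X'

    ACTION_DETERMINANT = {
        'Quoted Source': {'passingFunc': min_percent, 'scale': 0.6},
        'Junk Text':     {'passingFunc': ignore_this, 'scale': 0},
    }

    def passing_indices(percents, num_users, category):
        func = ACTION_DETERMINANT[category]['passingFunc']
        scale = ACTION_DETERMINANT[category]['scale']
        passers = [0] * len(percents)
        for i, p in enumerate(percents):
            if func(p, num_users, scale) == 'H':  # 'L' and 'X' both fall through
                passers[i] = 1
        return passers

    print(passing_indices([0.2, 0.7, 0.9], 5, 'Quoted Source'))  # [0, 1, 1]
    print(passing_indices([0.2, 0.7, 0.9], 5, 'Junk Text'))      # [0, 0, 0]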
From 8dd6a9bb8e1e09370c1f9c23c2b5f0a43889de31 Mon Sep 17 00:00:00 2001
From: ericwimsatt
Date: Sun, 12 Jun 2022 20:29:10 -0700
Subject: [PATCH 2/3] adds texts to s3 import for highlighter; now reproing the issue

---
 consensus_and_scoring/TriagerScoring.py | 27 +++++++++++++++++++++----
 consensus_and_scoring/app.py            |  3 +++
 consensus_and_scoring/process_dirs.py   |  3 ++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index 54fd51f..3bafa9c 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -15,7 +15,7 @@
 jpath1 = 'FormTriager1.2C2-2018-07-25T23.json'
 jpath2 = 'SemanticsTriager1.3C2-2018-07-25T23.json'
 
-def importData(path, out_path):
+def importData(path, out_path, texts_dir = None):
     '''
 
     :param path: location of the triage data
@@ -47,6 +47,22 @@ def importData(path, out_path, texts_dir = None):
         #flagExclusions = exclusionList(users, flags, cats)
         flagExclusions = []
         #print(flagExclusions)
+
+        #try to handle texts if path is provided:
+        if texts_dir:
+            text_file = os.path.join(texts_dir, a + ".txt")
+            if not(os.path.exists(text_file)):
+                for root, dir, files in os.walk(text_file):
+                    for file in files:
+                        print(file)
+                raise Exception("Couldn't find text_file for article {}".format(text_file))
+
+            if text_file == None:
+                raise Exception("Couldn't find text_file for article", a)
+
+            with open(text_file, 'r', encoding='utf-8') as file:
+                source_text = file.read()
+
         if annotator_count >= STRICT_MINIMUM_CONTRIBUTORS:
             cats = np.unique(art_data['topic_name'])
             for c in cats:
@@ -62,7 +78,8 @@ def importData(path, out_path, texts_dir = None):
                 texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
 
                 print('//Article:', a, 'Category:', c, 'numUsers:', numUsers)
-                source_text = addToSourceText(starts, ends, texts, source_text)
+                if texts_dir is None:
+                    source_text = addToSourceText(starts, ends, texts, source_text)
                 pstarts, pends, pflags = scoreTriager(starts, ends, users, numUsers, flags, length, c, flagExclusions)
                 out = appendData(filename[0], a, task_uuids, namespaces, pstarts, pends, c, pflags, out, source_text)
 
@@ -388,7 +405,9 @@ def load_args():
 
 if __name__ == '__main__':
     args = load_args()
-    input_file = '../data/highlighter/ESTF_HardTriage-2021-05-14T0016-Highlighter.csv'
+    input_file = '../data/highlighter/DK_off.csv'
+    texts_dir = '../data/texts/'
+
     if args.input_file:
         input_file = args.input_file
     dirname = os.path.dirname(input_file)
@@ -398,4 +417,4 @@ def load_args():
         output_file = args.output_file
     print("Input: {}".format(input_file))
     print("Output: {}".format(output_file))
-    importData(input_file, output_file)
+    importData(input_file, output_file, texts_dir)
diff --git a/consensus_and_scoring/app.py b/consensus_and_scoring/app.py
index 08ac9b0..4f7251c 100644
--- a/consensus_and_scoring/app.py
+++ b/consensus_and_scoring/app.py
@@ -110,6 +110,9 @@ def fetch_tags_files(body, dir_dict):
 def fetch_highlighter_files(body, dir_dict):
     highlighters = body.get('Highlighters', [])
     retrieve_file_list(highlighters, dir_dict['highlighters_dir'])
+    texts = body.get('Texts', [])
+    texts = use_article_sha256_filenames(texts)
+    retrieve_file_list(texts, dir_dict['texts_dir'])
     logger.info("---FILES RETRIEVED SUCCESSFULLY in request_highlighter_consensus handler---")
 
 def fetch_datahunt_files(body, dir_dict):
diff --git a/consensus_and_scoring/process_dirs.py b/consensus_and_scoring/process_dirs.py
index 9fb6f65..01a29cc 100644
--- a/consensus_and_scoring/process_dirs.py
+++ b/consensus_and_scoring/process_dirs.py
@@ -33,6 +33,7 @@ def configure_consensus_directories(task_type, parent_dirname):
     if task_type == "HLTR":
         dir_dict['highlighters_dir'] = make_dir(parent_dirname, 'highlighters')
         dir_dict['consensus_dir']= make_dir(parent_dirname, "output_HLTR_consensus")
+        dir_dict['texts_dir'] = make_dir(parent_dirname, 'texts')
         clean_output_csvs(dir_dict['consensus_dir'])
     elif task_type == "QUIZ":
         dir_dict['config_path'] = './config/'
@@ -54,7 +55,7 @@ def generate_highlighter_consensus(dir_dict):
         if filename.endswith(".csv"):
             input_file = os.path.join(highlighters_dir, filename)
             output_file = os.path.join(consensus_dir, "S_IAA_" + filename)
-            importData(input_file, output_file)
+            importData(input_file, output_file, dir_dict['texts_dir'])
 
 def generate_datahunt_consensus(dir_dict):
     uuids_to_filter = read_filter_uuids('./data_patches/')
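The wiring above assumes each article's full text has already been pulled into a per-job 'texts' directory (created in process_dirs.configure_consensus_directories and populated by app.fetch_highlighter_files) and is named by the article sha256, i.e. <texts_dir>/<article_sha256>.txt. Below is a minimal sketch of that lookup under those assumptions; load_source_text and the example hash are hypothetical helpers for illustration, not part of the repository.

    # Sketch only: resolve and read an article's source text from texts_dir,
    # assuming files are named <article_sha256>.txt as in the patch above.
    import os

    def load_source_text(texts_dir, article_sha256):
        text_file = os.path.join(texts_dir, article_sha256 + ".txt")
        if not os.path.exists(text_file):
            # Listing what is present makes a missing upload easier to spot.
            available = sorted(os.listdir(texts_dir)) if os.path.isdir(texts_dir) else []
            raise FileNotFoundError(
                "Couldn't find text file {} (available: {})".format(text_file, available))
        with open(text_file, 'r', encoding='utf-8') as f:
            return f.read()

    if __name__ == '__main__':
        texts_dir = '../data/texts/'                # default used in TriagerScoring's __main__
        article_sha256 = 'example_article_sha256'   # hypothetical article hash
        print(load_source_text(texts_dir, article_sha256)[:200])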
---
 consensus_and_scoring/TriagerScoring.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index 3bafa9c..e72adfc 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -75,10 +75,10 @@ def importData(path, out_path, texts_dir = None):
                 namespaces = cat_data['namespace'].tolist()
                 length = floor(cat_data['article_text_length'].tolist()[0])
 
-                texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
 
                 print('//Article:', a, 'Category:', c, 'numUsers:', numUsers)
                 if texts_dir is None:
+                    texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
                     source_text = addToSourceText(starts, ends, texts, source_text)
                 pstarts, pends, pflags = scoreTriager(starts, ends, users, numUsers, flags, length, c, flagExclusions)
                 out = appendData(filename[0], a, task_uuids, namespaces, pstarts, pends, c, pflags, out, source_text)
@@ -95,7 +95,7 @@ def appendData(article_filename, article_sha256, task_uuids, namespaces,start_po
     case_numbers = np.zeros(len(start_pos_list))
     for i in range(len(start_pos_list)):
         text = getText(start_pos_list[i], end_pos_list[i],source_text)
-        text = text.encode('unicode-escape').decode('utf-8')
+        #text = text.encode('unicode-escape').decode('utf-8')
         #print(len(namespaces), len(start_pos_list), len(end_pos_list), len(case_numbers))
         data.append([article_filename, article_sha256, task_uuids[i], namespaces[i], start_pos_list[i], end_pos_list[i], topic_name, int(case_numbers[i]), text])
     return data
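For context on the unicode-escape changes in this last patch: target_text pulled from the highlighter CSV is stored escape-encoded and is decoded before use, whereas text read straight from a UTF-8 .txt file is already plain, so escape-encoding it again on output would write literal backslash sequences into the consensus rows. A small standalone illustration of that round trip follows; the sample string is made up and the snippet is analogous to, not taken from, the pipeline code.

    # Sketch only: what encode('unicode-escape') / decode('unicode-escape') do,
    # and why the extra encode is skipped for text that is already plain UTF-8.
    raw = '\u201cSmart\u201d quotes and caf\u00e9\nsecond line'

    # Escape-encoded form (what a CSV-style export might carry): non-ASCII
    # characters and newlines become backslash sequences.
    escaped = raw.encode('unicode-escape').decode('utf-8')
    print(escaped)   # \u201cSmart\u201d quotes and caf\xe9\nsecond line

    # Decoding the escape sequences recovers the original text.
    restored = escaped.encode('utf-8').decode('unicode-escape')
    print(restored == raw)   # True

    # Escaping text that is already plain changes it, which is presumably why
    # the extra encode step in appendData is commented out above.
    print(raw.encode('unicode-escape').decode('utf-8') == raw)   # False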