From 0524c660cade4606c253633a81ad52f7c37056dc Mon Sep 17 00:00:00 2001
From: ericwimsatt
Date: Sun, 5 Jun 2022 16:18:43 -0700
Subject: [PATCH 1/3] adds junk text category

---
 consensus_and_scoring/TriagerScoring.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index b2f1cdc..54fd51f 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -225,6 +225,11 @@ def determinePassingIndices(starts, ends, numUsers, users, length, category):
             'passingFunc': evalThresholdMatrix,
             'scale': 1.8
         },
+        'Junk Text':
+        {
+            'passingFunc': ignoreThis,
+            'scale': 0
+        },
     }
     passFunc = actionDeterminant[category]['passingFunc']
     scale = actionDeterminant[category]['scale']
@@ -245,6 +250,9 @@ def findPassingIndices(starts, ends, numUsers, users, length, passingFunc = eval
             passersArray[i] = 1
     return passersArray
 
+def ignoreThis(percent, TotalNumUsers, scale):
+    return 'X'
+
 def minPercent(percent, totalNumUsers, scale):
     if percent>=scale:
         return 'H'
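Note on how the 'Junk Text' entry above is meant to behave: determinePassingIndices dispatches on category through the actionDeterminant table, and the new entry routes junk-text highlights to ignoreThis, which returns the sentinel 'X' instead of 'H'/'L'. A minimal, self-contained sketch of that table-dispatch idea follows; the category name 'Quoted Source', the function names, and the way the caller skips 'X' are illustrative assumptions, not the repository's exact logic.

    # Sketch only: per-category (passing function, scale) dispatch with a
    # "Junk Text" entry that opts out of scoring via an 'X' sentinel.

    def min_percent(percent, total_num_users, scale):
        # Stand-in for a minPercent-style scorer: 'H' passes, 'L' does not.
        return 'H' if percent >= scale else 'L'

    def ignore_this(percent, total_num_users, scale):
        # Junk text is never scored; return a sentinel the caller skips.
        return 'X'

    ACTION_DETERMINANT = {
        'Quoted Source': {'passingFunc': min_percent, 'scale': 0.6},
        'Junk Text':     {'passingFunc': ignore_this, 'scale': 0},
    }

    def passing_indices(percents, num_users, category):
        func = ACTION_DETERMINANT[category]['passingFunc']
        scale = ACTION_DETERMINANT[category]['scale']
        passers = [0] * len(percents)
        for i, p in enumerate(percents):
            if func(p, num_users, scale) == 'H':  # 'L' and 'X' both fall through
                passers[i] = 1
        return passers

    print(passing_indices([0.2, 0.7, 0.9], 5, 'Quoted Source'))  # [0, 1, 1]
    print(passing_indices([0.2, 0.7, 0.9], 5, 'Junk Text'))      # [0, 0, 0]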
From 8dd6a9bb8e1e09370c1f9c23c2b5f0a43889de31 Mon Sep 17 00:00:00 2001
From: ericwimsatt
Date: Sun, 12 Jun 2022 20:29:10 -0700
Subject: [PATCH 2/3] adds texts to s3 import for highlighter; now reproing the issue

---
 consensus_and_scoring/TriagerScoring.py | 27 +++++++++++++++++++++----
 consensus_and_scoring/app.py            |  3 +++
 consensus_and_scoring/process_dirs.py   |  3 ++-
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index 54fd51f..3bafa9c 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -15,7 +15,7 @@
 jpath1 = 'FormTriager1.2C2-2018-07-25T23.json'
 jpath2 = 'SemanticsTriager1.3C2-2018-07-25T23.json'
 
-def importData(path, out_path):
+def importData(path, out_path, texts_dir = None):
     '''
 
     :param path: location of the triage data
@@ -47,6 +47,22 @@ def importData(path, out_path, texts_dir = None):
         #flagExclusions = exclusionList(users, flags, cats)
         flagExclusions = []
         #print(flagExclusions)
+
+        #try to handle texts if path is provided:
+        if texts_dir:
+            text_file = os.path.join(texts_dir, a + ".txt")
+            if not(os.path.exists(text_file)):
+                for root, dir, files in os.walk(text_file):
+                    for file in files:
+                        print(file)
+                raise Exception("Couldn't find text_file for article {}".format(text_file))
+
+            if text_file == None:
+                raise Exception("Couldn't find text_file for article", a)
+
+            with open(text_file, 'r', encoding='utf-8') as file:
+                source_text = file.read()
+
         if annotator_count >= STRICT_MINIMUM_CONTRIBUTORS:
             cats = np.unique(art_data['topic_name'])
             for c in cats:
@@ -62,7 +78,8 @@ def importData(path, out_path, texts_dir = None):
                 texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
 
                 print('//Article:', a, 'Category:', c, 'numUsers:', numUsers)
-                source_text = addToSourceText(starts, ends, texts, source_text)
+                if texts_dir is None:
+                    source_text = addToSourceText(starts, ends, texts, source_text)
                 pstarts, pends, pflags = scoreTriager(starts, ends, users, numUsers, flags, length, c, flagExclusions)
                 out = appendData(filename[0], a, task_uuids, namespaces, pstarts, pends, c, pflags, out, source_text)
 
@@ -388,7 +405,9 @@ def load_args():
 
 if __name__ == '__main__':
     args = load_args()
-    input_file = '../data/highlighter/ESTF_HardTriage-2021-05-14T0016-Highlighter.csv'
+    input_file = '../data/highlighter/DK_off.csv'
+    texts_dir = '../data/texts/'
+
     if args.input_file:
         input_file = args.input_file
     dirname = os.path.dirname(input_file)
@@ -398,4 +417,4 @@ def load_args():
         output_file = args.output_file
     print("Input: {}".format(input_file))
     print("Output: {}".format(output_file))
-    importData(input_file, output_file)
+    importData(input_file, output_file, texts_dir)
diff --git a/consensus_and_scoring/app.py b/consensus_and_scoring/app.py
index 08ac9b0..4f7251c 100644
--- a/consensus_and_scoring/app.py
+++ b/consensus_and_scoring/app.py
@@ -110,6 +110,9 @@ def fetch_tags_files(body, dir_dict):
 def fetch_highlighter_files(body, dir_dict):
     highlighters = body.get('Highlighters', [])
     retrieve_file_list(highlighters, dir_dict['highlighters_dir'])
+    texts = body.get('Texts', [])
+    texts = use_article_sha256_filenames(texts)
+    retrieve_file_list(texts, dir_dict['texts_dir'])
     logger.info("---FILES RETRIEVED SUCCESSFULLY in request_highlighter_consensus handler---")
 
 def fetch_datahunt_files(body, dir_dict):
diff --git a/consensus_and_scoring/process_dirs.py b/consensus_and_scoring/process_dirs.py
index 9fb6f65..01a29cc 100644
--- a/consensus_and_scoring/process_dirs.py
+++ b/consensus_and_scoring/process_dirs.py
@@ -33,6 +33,7 @@ def configure_consensus_directories(task_type, parent_dirname):
     if task_type == "HLTR":
         dir_dict['highlighters_dir'] = make_dir(parent_dirname, 'highlighters')
         dir_dict['consensus_dir']= make_dir(parent_dirname, "output_HLTR_consensus")
+        dir_dict['texts_dir'] = make_dir(parent_dirname, 'texts')
         clean_output_csvs(dir_dict['consensus_dir'])
     elif task_type == "QUIZ":
         dir_dict['config_path'] = './config/'
@@ -54,7 +55,7 @@ def generate_highlighter_consensus(dir_dict):
         if filename.endswith(".csv"):
             input_file = os.path.join(highlighters_dir, filename)
             output_file = os.path.join(consensus_dir, "S_IAA_" + filename)
-            importData(input_file, output_file)
+            importData(input_file, output_file, dir_dict['texts_dir'])
 
 def generate_datahunt_consensus(dir_dict):
     uuids_to_filter = read_filter_uuids('./data_patches/')
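The wiring above assumes each article's full text has already been pulled into a per-job 'texts' directory (created in process_dirs.configure_consensus_directories and populated by app.fetch_highlighter_files) and is named by the article sha256, i.e. <texts_dir>/<article_sha256>.txt. Below is a minimal sketch of that lookup under those assumptions; load_source_text and the example hash are hypothetical helpers for illustration, not part of the repository.

    # Sketch only: resolve and read an article's source text from texts_dir,
    # assuming files are named <article_sha256>.txt as in the patch above.
    import os

    def load_source_text(texts_dir, article_sha256):
        text_file = os.path.join(texts_dir, article_sha256 + ".txt")
        if not os.path.exists(text_file):
            # Listing what is present makes a missing upload easier to spot.
            available = sorted(os.listdir(texts_dir)) if os.path.isdir(texts_dir) else []
            raise FileNotFoundError(
                "Couldn't find text file {} (available: {})".format(text_file, available))
        with open(text_file, 'r', encoding='utf-8') as f:
            return f.read()

    if __name__ == '__main__':
        texts_dir = '../data/texts/'                # default used in TriagerScoring's __main__
        article_sha256 = 'example_article_sha256'   # hypothetical article hash
        print(load_source_text(texts_dir, article_sha256)[:200])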
---
 consensus_and_scoring/TriagerScoring.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/consensus_and_scoring/TriagerScoring.py b/consensus_and_scoring/TriagerScoring.py
index 3bafa9c..e72adfc 100644
--- a/consensus_and_scoring/TriagerScoring.py
+++ b/consensus_and_scoring/TriagerScoring.py
@@ -75,10 +75,10 @@ def importData(path, out_path, texts_dir = None):
                 namespaces = cat_data['namespace'].tolist()
                 length = floor(cat_data['article_text_length'].tolist()[0])
 
-                texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
 
                 print('//Article:', a, 'Category:', c, 'numUsers:', numUsers)
                 if texts_dir is None:
+                    texts = cat_data['target_text'].str.decode('unicode-escape').tolist()
                     source_text = addToSourceText(starts, ends, texts, source_text)
                 pstarts, pends, pflags = scoreTriager(starts, ends, users, numUsers, flags, length, c, flagExclusions)
                 out = appendData(filename[0], a, task_uuids, namespaces, pstarts, pends, c, pflags, out, source_text)
@@ -95,7 +95,7 @@ def appendData(article_filename, article_sha256, task_uuids, namespaces,start_po
     case_numbers = np.zeros(len(start_pos_list))
     for i in range(len(start_pos_list)):
         text = getText(start_pos_list[i], end_pos_list[i],source_text)
-        text = text.encode('unicode-escape').decode('utf-8')
+        #text = text.encode('unicode-escape').decode('utf-8')
         #print(len(namespaces), len(start_pos_list), len(end_pos_list), len(case_numbers))
         data.append([article_filename, article_sha256, task_uuids[i], namespaces[i], start_pos_list[i], end_pos_list[i], topic_name, int(case_numbers[i]), text])
     return data
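For context on the unicode-escape changes in this last patch: target_text pulled from the highlighter CSV is stored escape-encoded and is decoded before use, whereas text read straight from a UTF-8 .txt file is already plain, so escape-encoding it again on output would write literal backslash sequences into the consensus rows. A small standalone illustration of that round trip follows; the sample string is made up and the snippet is analogous to, not taken from, the pipeline code.

    # Sketch only: what encode('unicode-escape') / decode('unicode-escape') do,
    # and why the extra encode is skipped for text that is already plain UTF-8.
    raw = '\u201cSmart\u201d quotes and caf\u00e9\nsecond line'

    # Escape-encoded form (what a CSV-style export might carry): non-ASCII
    # characters and newlines become backslash sequences.
    escaped = raw.encode('unicode-escape').decode('utf-8')
    print(escaped)   # \u201cSmart\u201d quotes and caf\xe9\nsecond line

    # Decoding the escape sequences recovers the original text.
    restored = escaped.encode('utf-8').decode('unicode-escape')
    print(restored == raw)   # True

    # Escaping text that is already plain changes it, which is presumably why
    # the extra encode step in appendData is commented out above.
    print(raw.encode('unicode-escape').decode('utf-8') == raw)   # False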