-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_cleaner.py
More file actions
40 lines (32 loc) · 1.83 KB
/
data_cleaner.py
File metadata and controls
40 lines (32 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Removes common words and field-specific (biased) words from the word_freq.json file produced by the analyzer.
import json
# Everyday words that the 1000-common-words list does not cover.
extended_common_words = [
    "also", "time", "score", "view", "animal", "reason", "upper",
    "plus", "yellow", "adult", "purple", "row", "closed", "video",
    "rabbit", "thousand", "international", "mirror", "wall", "online",
    "plan", "southern", "go", "bootstrap", "northern",
]

# Specific field-related words (e.g. medical terms) that would bias the result.
# NOTE(review): "polymarase" looks like a misspelling of "polymerase", and
# "antigen" appears twice -- confirm before editing the list; it is kept as-is.
biased_words = [
    "phenotype", "substrate", "serum", "amino", "temporal",
    "lysis", "enzyme", "protease", "peptide", "nucleotide",
    "bioscience", "canonical", "chromatography", "spectre",
    "allele", "genotype", "python", "glycerol", "antigen",
    "polymarase", "chromatin", "immunofluorescence", "ligation",
    "mycoplasma", "ligand", "transduce", "antigen", "methanol",
    "atlas", "lipid", "spectroscopy", "mutagenesis", "monoclonal",
    "epithelial", "embryonic", "hallmark", "chimera", "kinase",
    "perfusion", "dendrite", "phenotypic", "cortical", "formic",
    "synaptic", "enzymatic", "trypsin", "fibroblast",
]


def _load_json(path):
    """Return the parsed content of the JSON file at *path*."""
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _drop_excluded(word_freq, excluded):
    """Return a new dict holding the entries of *word_freq* not in *excluded*.

    The relative order of the surviving keys is preserved.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # re-concatenating and scanning the exclusion lists for every key.
    excluded = frozenset(excluded)
    return {
        word: count
        for word, count in word_freq.items()
        if word not in excluded
    }


def main(
    input_file="word_freq.json",
    common_words_file="1000_common_words.json",
    output_file="word_freq(cleaned).json",
):
    """Strip common and biased words from a word-frequency JSON file.

    Args:
        input_file: Path of the JSON object mapping words to frequencies.
        common_words_file: Path of the JSON array of common words to exclude.
        output_file: Path the cleaned mapping is written to (4-space indent).

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    word_freq = _load_json(input_file)
    common_words = _load_json(common_words_file)

    excluded = set(common_words)
    excluded.update(extended_common_words, biased_words)
    cleaned = _drop_excluded(word_freq, excluded)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, indent=4)


if __name__ == "__main__":
    main()