-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPythonLab10.py
29 lines (22 loc) · 856 Bytes
/
PythonLab10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import argparse
import numpy as np
def _count_matrix(documents):
    """Return the per-document word-count matrix for *documents*.

    Each document is lowercased and split on whitespace. Columns follow
    the alphabetically sorted vocabulary of all documents; rows follow
    the order of *documents*. A document with no words yields an
    all-zero row.
    """
    # Tokenize once so the vocabulary pass and the counting pass agree.
    tokenized = [doc.lower().split() for doc in documents]
    vocabulary = sorted({word for tokens in tokenized for word in tokens})
    word_to_index = {word: i for i, word in enumerate(vocabulary)}

    # Build each row independently; `[[0] * n] * m` would alias one row.
    matrix = [[0] * len(vocabulary) for _ in tokenized]
    for row, tokens in zip(matrix, tokenized):
        for word in tokens:
            row[word_to_index[word]] += 1
    return matrix


def main(documentsTxt):
    """Print the word-count feature matrix for newline-separated documents.

    Args:
        documentsTxt: Text holding one document per line. Leading and
            trailing whitespace is stripped before splitting, so a
            trailing newline does not create an extra document. Blank
            lines between documents are kept as empty documents.

    Prints the header ``# Features:`` followed by one list per document.
    Empty or whitespace-only input prints the header only.
    """
    stripped = documentsTxt.strip()
    # ''.split('\n') is [''], which would report one phantom empty
    # document; treat empty input as zero documents instead.
    documents = stripped.split('\n') if stripped else []

    print('# Features:')
    for row in _count_matrix(documents):
        print(row)
if __name__ == "__main__":
    # Script entry point: read the file named by --fpath and print its
    # feature matrix via main().
    # Pass the display name as `description`; the first positional
    # argument of ArgumentParser is `prog`, which would replace the
    # program name in the usage line.
    parser = argparse.ArgumentParser(description="One Hot Encoder")
    # Required: without a path, open(None) would fail with a TypeError
    # instead of a readable usage message.
    parser.add_argument(
        "--fpath",
        type=str,
        required=True,
        help="Name of the txt file to be read in",
    )
    args = parser.parse_args()
    # Context manager closes the file handle even if read() raises.
    with open(args.fpath) as documents_file:
        documents_text = documents_file.read()
    main(documents_text)