1
+ import gdown
2
+ import zipfile
3
+
4
+ from os import listdir
5
+ from os .path import isfile , join
6
+ import xml .etree .ElementTree as ET
7
+
8
+ from typing import Tuple , List
9
+
10
def main():
    """Build a DataGetter for the hard-coded archive and load its documents.

    The archive URL, the local zip name and the train/test directories are
    fixed here; the loaded documents are bound to local names only and are
    not returned.
    """
    data_getter = DataGetter(
        url='https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
        output_path='Twitter.zip',
        path_train='Data/train/en',
        path_test='Data/test/en',
    )

    # One joined string per file, for the train and test directories.
    tweet_train, tweet_test = data_getter.get_train_test_docs()
20
+
21
class DataGetter:
    """Download a zipped dataset and load its train/test XML documents.

    Constructing an instance immediately downloads the archive from ``url``
    to ``output_path`` and extracts it into the current working directory.
    ``path_train`` and ``path_test`` are the directories read afterwards by
    :meth:`get_train_test_docs`.
    """

    def __init__(self, url: str, output_path: str, path_train: str, path_test: str):
        """Store the locations, then download and extract the archive.

        Args:
            url: Address passed to ``gdown.download``.
            output_path: Local file name the archive is saved under.
            path_train: Directory holding the training files.
            path_test: Directory holding the test files.
        """
        self.url = url
        self.output_path = output_path
        self.path_train = path_train
        self.path_test = path_test
        self.download_zip_data_from_google_drive()
        self.unzip_data()

    def download_zip_data_from_google_drive(self):
        """Download ``self.url`` to ``self.output_path`` with progress output."""
        gdown.download(self.url, self.output_path, quiet=False)

    def unzip_data(self):
        """Extract the downloaded archive into the current working directory."""
        with zipfile.ZipFile(self.output_path, 'r') as zip_ref:
            zip_ref.extractall('.')

    def get_train_test_docs(self) -> Tuple[List[str], List[str]]:
        """Load every document of the train and test directories.

        Returns:
            A ``(train, test)`` pair of lists; each list holds one joined
            text string per file, in sorted file-name order.
        """
        tweets_train_files = self.get_files(self.path_train)
        tweets_test_files = self.get_files(self.path_test)

        t_train = self.extract_texts_from_multiple_files(self.path_train, tweets_train_files)
        t_test = self.extract_texts_from_multiple_files(self.path_test, tweets_test_files)
        return t_train, t_test

    @staticmethod
    def get_files(path: str) -> List[str]:
        """Return the names of the regular files in ``path``, sorted.

        Sub-directories and the file named ``truth.txt`` are skipped.
        """
        # listdir() yields entries in arbitrary order; sorting keeps the
        # resulting document order reproducible across runs and machines.
        return sorted(
            name
            for name in listdir(path)
            if isfile(join(path, name)) and name != "truth.txt"
        )

    @classmethod
    def extract_texts_from_multiple_files(cls, path_to_file: str, files: list) -> List[str]:
        """Return the joined text of each file in ``files``, in the same order.

        Args:
            path_to_file: Directory that contains the files.
            files: File names relative to ``path_to_file``.
        """
        return [cls.extract_texts_from_each_file(path_to_file, name) for name in files]

    @staticmethod
    def extract_texts_from_each_file(path_to_file: str, file_name: str) -> str:
        """Join the texts of one XML file into a single space-separated string.

        The texts are read from the children of the root's first child
        element.

        Args:
            path_to_file: Directory that contains the file.
            file_name: Name of the XML file inside ``path_to_file``.

        Raises:
            IndexError: If the root element has no child element.
        """
        root = ET.parse(join(path_to_file, file_name)).getroot()
        # ElementTree reports an element without character data as
        # ``text is None``; str.join() rejects None, so map it to ''.
        texts = [element.text or '' for element in root[0]]
        return ' '.join(texts)
71
+
72
# Run the download-and-load pipeline only when executed as a script.
if __name__ == '__main__':
    main()
0 commit comments