good functions example

khuyentran1401 · khuyentran1401 · commit 9cb267cff810 · 2021-01-20T07:18:30.000-06:00
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+ignore = E203, E266, E501, W503, F403, F401, E402
+max-line-length = 89
+max-complexity = 18
+select = B,C,E,F,W,T4,B9
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,9 @@ dp*
 mallet-2.0.8
 .benchmarks
 wandb
+*.pkl
+*.zip
+
 
 # VSCode workspace
 *-workspace
diff --git a/.isort.cfg b/.isort.cfg
@@ -0,0 +1,5 @@
+[settings]
+line_length = 88
+multi_line_output = 3
+include_trailing_comma = True
+known_third_party = celery,django,environ,pyquery,pytz,redis,requests,rest_framework
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+  # See https://pre-commit.com for more information
+  # See https://pre-commit.com/hooks.html for more hooks
+  repos:
+    - repo: https://github.com/ambv/black
+      rev: stable
+      hooks:
+        - id: black
+          language_version: python3.7
+    -   repo: https://github.com/pre-commit/pre-commit-hooks
+        rev: v2.0.0
+        hooks:
+        - id: flake8
+    -   repo: https://github.com/timothycrosley/isort
+        rev: 4.3.21
+        hooks:
+        -   id: isort
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,19 @@
+[tool.black]
+    py36 = true
+    include = '\.pyi?$'
+    exclude = '''
+    /(
+        \.git
+      | \.hg
+      | \.mypy_cache
+      | \.tox
+      | \.venv
+      | _build
+      | buck-out
+      | build
+      | dist
+
+      # The following are specific to Black, you probably don't want those.
+      | blib2to3
+      | tests/data
+    )/'''
diff --git a/python/good_functions/.gitignore b/python/good_functions/.gitignore
@@ -0,0 +1,2 @@
+Data
+Twitter.zip
diff --git a/python/good_functions/bad_code.py b/python/good_functions/bad_code.py
@@ -0,0 +1,46 @@
+import gdown
+import zipfile
+
+from os import listdir
+from os.path import isfile, join
+import xml.etree.ElementTree as ET
+
+def main():
+    """Call ``load_data`` with the Google Drive URL, zip name and data directories."""
+    settings = {'url': 'https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
+                'output': 'Twitter.zip',
+                'path_train': 'Data/train/en', 'path_test': 'Data/test/en'}
+    load_data(**settings)
+
+
+def load_data(url: str, output: str, path_train: str, path_test: str):
+    """Download a zip archive, extract it, and read the train/test XML documents.
+
+    Args:
+        url: Address handed to ``gdown.download``.
+        output: Path the downloaded archive is written to and then unzipped from.
+        path_train: Directory containing the training files.
+        path_test: Directory containing the test files.
+
+    Returns:
+        ``(t_train, t_test)``: two lists holding one space-joined string per file.
+    """
+    # Use the caller's ``output`` as given rather than a hard-coded file name.
+    gdown.download(url, output, quiet=False)
+
+    # Extract the archive into the current working directory.
+    with zipfile.ZipFile(output, 'r') as zip_ref:
+        zip_ref.extractall('.')
+
+    def read_docs(path: str) -> list:
+        # One document per regular file in ``path``; 'truth.txt' is skipped.
+        names = [f for f in listdir(path) if isfile(join(path, f)) and f != "truth.txt"]
+        # The text of the elements under the root's first child is joined with spaces.
+        return [' '.join(r.text for r in ET.parse(join(path, n)).getroot()[0])
+                for n in names]
+
+    # Both directories are read with the same logic.
+    return read_docs(path_train), read_docs(path_test)
+
+if __name__=='__main__':  # call main() only when this file is run as a script
+    main()
diff --git a/python/good_functions/good_code.py b/python/good_functions/good_code.py
@@ -0,0 +1,73 @@
+import gdown
+import zipfile
+
+from os import listdir
+from os.path import isfile, join
+import xml.etree.ElementTree as ET
+
+from typing import Tuple, List
+
+def main():
+    """Build a ``DataGetter`` for the Twitter archive and read its train/test docs."""
+    data_getter = DataGetter(
+        url='https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
+        output_path='Twitter.zip',
+        path_train='Data/train/en',
+        path_test='Data/test/en',
+    )
+    # The returned documents are not used any further in this function.
+    tweet_train, tweet_test = data_getter.get_train_test_docs()
+
+class DataGetter:
+    """Download a zipped dataset on construction and read its train/test documents."""
+
+    def __init__(self, url: str, output_path: str, path_train: str, path_test: str):
+        """Store the settings, then download the archive and extract it."""
+        self.url = url
+        self.output_path = output_path
+        self.path_train = path_train
+        self.path_test = path_test
+        # Constructing an instance performs the download and the extraction.
+        self.download_zip_data_from_google_drive()
+        self.unzip_data()
+
+    def download_zip_data_from_google_drive(self):
+        """Download ``self.url`` to ``self.output_path`` using ``gdown``."""
+        gdown.download(self.url, self.output_path, quiet=False)
+
+    def unzip_data(self):
+        """Extract the archive at ``self.output_path`` into the current directory."""
+        with zipfile.ZipFile(self.output_path, 'r') as zip_ref:
+            zip_ref.extractall('.')
+
+    def get_train_test_docs(self) -> Tuple[List[str], List[str]]:
+        """Return ``(train_docs, test_docs)``, one string per file in each directory."""
+        tweets_train_files = self.get_files(self.path_train)
+        tweets_test_files = self.get_files(self.path_test)
+
+        t_train = self.extract_texts_from_multiple_files(self.path_train, tweets_train_files)
+        t_test = self.extract_texts_from_multiple_files(self.path_test, tweets_test_files)
+        return t_train, t_test
+
+    @staticmethod
+    def get_files(path: str) -> List[str]:
+        """List the regular files directly under ``path``, skipping ``'truth.txt'``."""
+        return [file for file in listdir(path) if isfile(join(path, file)) and file != "truth.txt"]
+
+    @classmethod
+    def extract_texts_from_multiple_files(cls, path_to_file: str, files: List[str]) -> List[str]:
+        """Return the joined text of each file in ``files``, in the given order."""
+        return [cls.extract_texts_from_each_file(path_to_file, file) for file in files]
+
+    @staticmethod
+    def extract_texts_from_each_file(path_to_file: str, file_name: str) -> str:
+        """Parse one XML file and return its element texts joined by single spaces.
+
+        ``path_to_file`` is the directory that contains ``file_name``.
+        """
+        # getroot()[0] is the root's first child; the text of its children is joined.
+        document = ET.parse(join(path_to_file, file_name)).getroot()[0]
+        return ' '.join(element.text for element in document)
+
+if __name__=='__main__':  # call main() only when this file is run as a script
+    main()
diff --git a/visualization/github/test_df.zip b/visualization/github/test_df.zip
diff --git a/visualization/scattertext/visualization.ipynb b/visualization/scattertext/visualization.ipynb