Skip to content

Commit 9cb267c

Browse files
good functions example
1 parent e6276f9 commit 9cb267c

File tree

10 files changed

+242
-35
lines changed

10 files changed

+242
-35
lines changed

.flake8

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[flake8]
2+
ignore = E203, E266, E501, W503, F403, F401, E402
3+
max-line-length = 89
4+
max-complexity = 18
5+
select = B,C,E,F,W,T4,B9

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ dp*
77
mallet-2.0.8
88
.benchmarks
99
wandb
10+
*.pkl
11+
*.zip
12+
1013

1114
# VSCode workspace
1215
*-workspace

.isort.cfg

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
[settings]
2+
line_length = 88
3+
multi_line_output = 3
4+
include_trailing_comma = True
5+
known_third_party = celery,django,environ,pyquery,pytz,redis,requests,rest_framework

.pre-commit-config.yaml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# See https://pre-commit.com for more information
2+
# See https://pre-commit.com/hooks.html for more hooks
3+
repos:
4+
- repo: https://github.com/ambv/black
5+
rev: stable
6+
hooks:
7+
- id: black
8+
language_version: python3.7
9+
- repo: https://github.com/pre-commit/pre-commit-hooks
10+
rev: v2.0.0
11+
hooks:
12+
- id: flake8
13+
- repo: https://github.com/timothycrosley/isort
14+
rev: 4.3.21
15+
hooks:
16+
- id: isort

pyproject.toml

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Black formatter configuration.
[tool.black]
py36 = true
include = '\.pyi?$'
# Verbose regex of directories Black must skip. The multi-line literal string
# has to be terminated with ''' or the whole file fails to parse as TOML.
exclude = '''
/(
    \.git
  | \.hg
  | \.mypy_cache
  | \.tox
  | \.venv
  | _build
  | buck-out
  | build
  | dist

  # The following are specific to Black, you probably don't want those.
  | blib2to3
  | tests/data
)/
'''

python/good_functions/.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Data
2+
Twitter.zip

python/good_functions/bad_code.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import gdown
2+
import zipfile
3+
4+
from os import listdir
5+
from os.path import isfile, join
6+
import xml.etree.ElementTree as ET
7+
8+
def main():
    """Run ``load_data`` with the example's fixed URL and dataset paths."""
    settings = {
        'url': 'https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
        'output': 'Twitter.zip',
        'path_train': 'Data/train/en',
        'path_test': 'Data/test/en',
    }
    # The returned documents are not used here; the call is made for its
    # download/extract side effects, exactly as in the original script.
    load_data(**settings)
14+
15+
16+
def _read_docs_from_dir(path: str) -> list:
    """Return one space-joined text string per XML file found directly in ``path``."""
    # "truth.txt" lives next to the XML files but is not a document, so skip it.
    file_names = [
        name for name in listdir(path)
        if isfile(join(path, name)) and name != "truth.txt"
    ]

    docs = []
    for name in file_names:
        # The texts are the children of the root element's first child.
        entries = ET.parse(join(path, name)).getroot()[0]
        # ``Element.text`` is None for an empty element; substitute "" so the
        # join does not raise TypeError.
        docs.append(' '.join(entry.text or '' for entry in entries))
    return docs


def load_data(url: str, output: str, path_train: str, path_test: str):
    """Download a zip archive, extract it, and read the train/test documents.

    Args:
        url: Address passed to ``gdown.download``.
        output: Local path the archive is written to and then unzipped from.
        path_train: Directory containing the training XML files.
        path_test: Directory containing the test XML files.

    Returns:
        A ``(t_train, t_test)`` tuple; each item is a list holding one string
        per XML file (all element texts of that file joined with spaces).
    """
    # Honour the caller-supplied ``output`` path. It was previously overwritten
    # with a hard-coded 'Twitter.zip', silently ignoring the argument.
    gdown.download(url, output, quiet=False)

    # The archive is extracted into the current working directory.
    with zipfile.ZipFile(output, 'r') as zip_ref:
        zip_ref.extractall('.')

    t_train = _read_docs_from_dir(path_train)
    t_test = _read_docs_from_dir(path_test)

    return t_train, t_test
44+
45+
# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    main()

python/good_functions/good_code.py

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import gdown
2+
import zipfile
3+
4+
from os import listdir
5+
from os.path import isfile, join
6+
import xml.etree.ElementTree as ET
7+
8+
from typing import Tuple, List
9+
10+
def main():
    """Build a ``DataGetter`` for the example dataset and read its documents."""
    getter = DataGetter(
        url='https://drive.google.com/uc?id=1jI1cmxqnwsmC-vbl8dNY6b4aNBtBbKy3',
        output_path='Twitter.zip',
        path_train='Data/train/en',
        path_test='Data/test/en',
    )

    # The documents are unpacked but not used further, as in the original.
    train_docs, test_docs = getter.get_train_test_docs()
20+
21+
class DataGetter:
    """Download a zipped dataset and expose its train/test documents.

    Creating an instance immediately downloads the archive from ``url`` to
    ``output_path`` and extracts it into the current working directory.
    """

    def __init__(self, url: str, output_path: str, path_train: str, path_test: str):
        """Store the locations, then download and unzip the archive.

        Args:
            url: Address passed to ``gdown.download``.
            output_path: Local path the archive is saved to.
            path_train: Directory containing the training XML files.
            path_test: Directory containing the test XML files.
        """
        self.url = url
        self.output_path = output_path
        self.path_train = path_train
        self.path_test = path_test
        self.download_zip_data_from_google_drive()
        self.unzip_data()

    def download_zip_data_from_google_drive(self) -> None:
        """Download ``self.url`` to ``self.output_path`` using ``gdown``."""
        gdown.download(self.url, self.output_path, quiet=False)

    def unzip_data(self) -> None:
        """Extract the downloaded archive into the current working directory."""
        with zipfile.ZipFile(self.output_path, 'r') as zip_ref:
            zip_ref.extractall('.')

    def get_train_test_docs(self) -> Tuple[List[str], List[str]]:
        """Return ``(train_docs, test_docs)``, one string per XML file."""
        tweets_train_files = self.get_files(self.path_train)
        tweets_test_files = self.get_files(self.path_test)

        t_train = self.extract_texts_from_multiple_files(self.path_train, tweets_train_files)
        t_test = self.extract_texts_from_multiple_files(self.path_test, tweets_test_files)
        return t_train, t_test

    @staticmethod
    def get_files(path: str) -> List[str]:
        """List the regular files directly inside ``path``, excluding "truth.txt"."""
        return [file for file in listdir(path) if isfile(join(path, file)) and file != "truth.txt"]

    @classmethod
    def extract_texts_from_multiple_files(cls, path_to_file: str, files: List[str]) -> List[str]:
        """Return the joined text of every file in ``files`` (same order).

        Args:
            path_to_file: Directory that contains the files.
            files: File names relative to ``path_to_file``.
        """
        return [cls.extract_texts_from_each_file(path_to_file, file) for file in files]

    @staticmethod
    def extract_texts_from_each_file(path_to_file: str, file_name: str) -> str:
        """Join the texts found in one XML file into a single string.

        The texts are read from the children of the root element's first
        child and joined with single spaces.

        Args:
            path_to_file: Directory that contains the file.
            file_name: Name of the XML file inside ``path_to_file``.

        Raises:
            IndexError: If the root element has no children.
        """
        entries = ET.parse(join(path_to_file, file_name)).getroot()[0]
        # ``Element.text`` is None for an empty element; substitute "" so the
        # join does not raise TypeError.
        return ' '.join(entry.text or '' for entry in entries)
71+
72+
# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    main()

visualization/github/test_df.zip

-23.3 MB
Binary file not shown.

0 commit comments

Comments
 (0)