-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfunctions.py
More file actions
154 lines (127 loc) · 5.96 KB
/
functions.py
File metadata and controls
154 lines (127 loc) · 5.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def values_to_col(myDataFrame,myColumnList,bool_with_old_col_name):
    """
    Expand categorical text columns of a pandas DataFrame into one count
    column per token.

    Each column listed in 'myColumnList' is vectorized with scikit-learn's
    CountVectorizer: missing values are first replaced by the string 'none',
    then one new column is created per vocabulary term found in the column.
    For every row, the new column holds the number of times the term occurs
    in that row's value (0 when absent, 1 for a simple single-word category).
    NOTE(review): CountVectorizer's default tokenizer lower-cases the text and
    splits it into words, so a multi-word category yields several columns -
    confirm this is the intended behaviour for your data.

    RESULT : a new DataFrame made of the input columns followed by the
    generated columns, rows kept aligned on the input index.
    2 extra columns are also created :
    - 'created_columns' : every row holds the list of all generated columns
    - 'dict_mapping' : every row holds the dictionary mapping each source
      column to the columns it generated
    (all rows reference the same list / dictionary object)

    PARAMS :
    - 'myDataFrame' : the DataFrame to process. The missing-value fill of the
      first processed column is applied to this object itself; use the
      returned DataFrame for everything else.
    - 'myColumnList' : list of the columns to vectorize (a single column must
      still be passed as a list)
    - 'bool_with_old_col_name' : when true, generated columns are named
      '<column>_<term>' (CC_att1, CC_att2, ...); otherwise just '<term>'
      (att1, att2, ...)

    RAISES : whatever CountVectorizer raises, e.g. ValueError when a column
    contains no usable token.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    import pandas as pd

    created_columns = []
    dict_mapping = {}
    for column in myColumnList:
        # Missing values filling. The result is assigned back instead of using
        # a chained `fillna(inplace=True)`, which is not guaranteed to write
        # through to the DataFrame (and never does under copy-on-write).
        myDataFrame[column] = myDataFrame[column].fillna('none')
        corpus = myDataFrame[column]
        vectorizer = CountVectorizer(min_df=1,max_df=1.0)
        # Construction of the row/words matrix
        X = vectorizer.fit_transform(corpus).toarray()
        # get_feature_names() was removed from scikit-learn; fall back to it
        # only for old releases that lack get_feature_names_out().
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = [str(name) for name in vectorizer.get_feature_names_out()]
        else:
            feature_names = list(vectorizer.get_feature_names())
        if bool_with_old_col_name:
            newFeatures = ['%s_%s'%(column,feature) for feature in feature_names]
        else:
            newFeatures = feature_names
        created_columns.extend(newFeatures)
        # A column listed twice keeps accumulating into the same entry.
        dict_mapping.setdefault(column, []).extend(newFeatures)
        # Construction of the row/words DataFrame. It reuses the input index so
        # the concatenation stays row-aligned even for a non-default index
        # (the removed `join_axes` argument is no longer needed).
        myfeaturedf = pd.DataFrame(X,columns=newFeatures,index=myDataFrame.index)
        myDataFrame = pd.concat([myDataFrame, myfeaturedf], axis=1)
    myDataFrame['created_columns']=[created_columns]*len(myDataFrame)
    myDataFrame['dict_mapping']=[dict_mapping]*len(myDataFrame)
    return myDataFrame
def percent_of_total(myDataFrame,myColumnList):
    """
    Convert each column of a given list into its percentage of the row-wise
    sum of all the columns of that list.

    RESULT : the entry DataFrame (modified in place and returned) where every
    column of 'myColumnList' holds 100 * value / (sum of the listed columns on
    the same row). Rows whose sum is 0 follow the usual pandas division rules
    (NaN or inf).

    PARAMS :
    - 'myDataFrame' : the entry DataFrame.
    - 'myColumnList' : the list of columns to convert.
    """
    # The row totals live in a local Series rather than in a temporary 'total'
    # column: writing and dropping such a column would destroy a 'total'
    # column that already belongs to the caller's data.
    row_totals = myDataFrame[myColumnList].sum(axis=1)
    for column in myColumnList:
        myDataFrame[column] = 100*(myDataFrame[column]/row_totals)
    return myDataFrame
def group_by_frequency(myDataFrame,myColumn):
    """
    Build an aggregated DataFrame holding the number of occurences of each
    categorical value found in 'myColumn'.

    RESULT : a new DataFrame indexed by the distinct values of 'myColumn',
    sorted by descending occurences (ties keep the group order), with columns :
    - 'occurences' : number of rows carrying the value
    - 'rank' : 1-based position of the value in the sorted result
    - 'cumsum' : running total of 'occurences' from the first row down
    - 'percent_of_total' : 'cumsum' divided by the total number of counted
      rows, i.e. a fraction between 0 and 1 (not multiplied by 100)
    The entry DataFrame is left untouched.

    PARAMS :
    - 'myDataFrame' : the entry DataFrame
    - 'myColumn' : the column concerned by the frequencies count
    """
    # Only the counted column is copied; duplicating the whole DataFrame just
    # to attach a constant column is unnecessary.
    counted = myDataFrame[[myColumn]].copy()
    counted['occurences'] = 1
    grouped = counted.groupby(myColumn).sum()
    # A stable sort keeps equal counts in a deterministic order.
    grouped = grouped.sort_values(by='occurences', ascending=False, kind='mergesort')
    grouped['rank'] = range(1,len(grouped)+1)
    grouped['cumsum'] = grouped['occurences'].cumsum()
    grouped['percent_of_total'] = grouped['cumsum']/grouped['occurences'].sum()
    return grouped
def class_my_files(myPath):
    """
    Build a dictionary of the entries available in a given directory, grouped
    by file extension.

    RESULT : a dictionary whose keys are the extensions 'csv', 'xls', 'xlsx',
    'json', 'txt', 'p' and 'jpg' (without the dot) and whose values are the
    lists of entry names ending with '.<extension>', in the order returned by
    os.listdir. Every key is present, with an empty list when nothing matches.
    Matching is case-sensitive.

    PARAMS :
    - 'myPath' : the path of the directory whose files must be mapped.

    RAISES : the OSError raised by os.listdir when 'myPath' cannot be listed.
    """
    from os import listdir

    L_files = listdir(myPath)
    extensions = ['csv','xls','xlsx','json','txt','p','jpg']
    dict_extensions = {}
    for ext in extensions:
        # Exact suffix comparison: a regex with an unescaped, unanchored '.'
        # would also select names such as 'map' or 'x.zip' for 'p', or
        # 'data.csv.bak' for 'csv'.
        suffix = '.' + ext
        dict_extensions[ext] = [name for name in L_files if name.endswith(suffix)]
    return dict_extensions
def convert_in_list(myDataFrame,myColumn):
    """
    Convert a pandas column of textual Python literals into a column of real
    Python objects (typically lists).

    IMPORTANT : string values must be valid Python literals (e.g. "[1, 2]")
    to be read and converted correctly. Parsing uses ast.literal_eval, which
    only evaluates literals and never executes code.

    RESULT : the entry DataFrame (modified in place and returned) where each
    string of 'myColumn' is replaced by the object it describes, so the values
    can be looped over. Values that are not strings (already converted
    objects, missing values) are left unchanged, which makes repeated calls
    on the same column safe.

    PARAMS :
    - myDataFrame : the entry DataFrame
    - myColumn : String, the column to convert

    RAISES : ValueError or SyntaxError when a string is not a valid literal.
    """
    from ast import literal_eval

    def _parse(value):
        # Only text needs parsing; literal_eval rejects any other object.
        return literal_eval(value) if isinstance(value, str) else value

    myDataFrame[myColumn] = myDataFrame[myColumn].apply(_parse)
    return myDataFrame