-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCapitalOneScript.py
175 lines (126 loc) · 5.12 KB
/
CapitalOneScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Matt Grant
# 9/28/2105
# Data Analysis: Capital One, Use transaction data to categorize clients
import pandas as pd
import numpy as np
from datetime import datetime
from pylab import *
#get_ipython().magic(u'matplotlib inline')
style.use('ggplot')
#Using the transactions data attached below, write a script in
#Python,that outputs a list of subscription IDs,their subscription
#type (daily, monthly, yearly, one-off), and the duration of their
#subscription.
# Subscription ID, Type , Duration
# 3159 , Monthly, 900 days
#Bounus
#1. Give annual revenue numbers for all years between 1966 and 2014.
#Which years had the highest revenue growth, and highest revenue loss?
#2. Predict annual revenue for year 2015 (based on historical retention and new subscribers)
#Import data into pandas dataframe
data = pd.read_csv('../Data Science/CapitalOneScript/subscription_report.csv')
# Explore data
#data.head()
#data.shape
# Set Transaction Date as time data type
# YYYY - MM - DD
data['Transaction Date'] = pd.to_datetime(data['Transaction Date'])
# Verify correct data type
data['Transaction Date'].dtype
# Look for any null values
data.isnull().sum()
#Distribution of Subscriber for # of transactions(min:1 (one-off), max 91)
data['Subscription ID'].value_counts().plot(kind='hist', bins=20, )
ylabel('Subscribers')
xlabel('Number of Transactions')
title('Distribution of Subscriber per # of transactions')
# Sort data by 'Subscription ID' and date & re-index
data.sort(['Subscription ID','Transaction Date'], inplace=True)
data.index = range(1,len(data) + 1)
#Create a column for the year in which the transaction occured
data['Year'] = [datetime.year for datetime in data['Transaction Date']]
#Create a list of Subscription IDs 27,609 Unique IDs
IDs = [data['Subscription ID'].unique()]
#print IDs, len(IDs[0])
#Create dataframe for results
results = pd.DataFrame(columns =['Subscription ID','Duration','Type',])
# Initailize list to store data
idCnt = len(IDs)
IDCol = [0]*idCnt
nrowCol = [0]*idCnt
deltaTcol=[0]*idCnt
durationCol=[0]*idCnt
TypeCol=[0]*idCnt
# loop through all subscription IDs and pull out ID,#transactions,Duration(max-min),Type(monthly..)
for item in range(idCnt): #27609 elements
IDdata = data[data['Subscription ID'] == IDs[0][item]]
ID = IDdata['Subscription ID'].iloc[0]#IDs[0][item]
nrow = IDdata.shape[0]
if nrow > 1:
deltaT = IDdata['Transaction Date'].iloc[nrow-1] - IDdata['Transaction Date'].iloc[0]
duration = deltaT/nrow
else:
deltaT = pd.to_timedelta('0 days')
duration = pd.to_timedelta('0 days')
tType = 'One Off'
if duration > pd.to_timedelta('250 days'):
tType = 'Yearly'
elif duration >= pd.to_timedelta('27 days') and duration <= pd.to_timedelta('32 days'):
tType = 'Monthly'
elif duration > pd.to_timedelta('0 days') and duration < pd.to_timedelta('2 days'):
tType = 'Daily'
# Save data to lists
IDCol[item] = ID
nrowCol[item] = nrow
deltaTcol[item]= deltaT
durationCol[item]= duration
TypeCol[item]= tType
#print ID, nrow, deltaT, duration, tType
print 'complete'
#Save data to dataframe
results['Subscription ID'] = IDCol
results['Duration'] = deltaTcol
results['Type'] = TypeCol
#Output to csv file
results.to_csv("CapitalOneTransactions.csv", index=False, cols=('ID','Type','Duration(Days)'))
# Total revenue by year in $
revenue_by_year = data['Amount (USD)'].groupby(data['Year']).sum()#
print 'Revenue by year',revenue_by_year
#plot revenue over time
revenue_by_year.plot(title='Revenue Over Time ($100 Million)')
rcParams['figure.figsize'] = (10, 6)
# Bar chart of revenue change per year
revenue_by_year.pct_change().plot(kind='bar',title='Percent Revenue Change Per Year')
rcParams['figure.figsize'] = (10, 6)
# generates percent change and orders series from highest revenue loss to highest revenue growth
# 2014 - greatest revenue loss (-58.0%)
# 1967 - greatest revenue growth (51.5%)
print 'Greatest Revenue Growth:',revenue_by_year.pct_change().idxmax()
print 'Greatest Revenue Loss:', revenue_by_year.pct_change().idxmin()
revenue_by_year.pct_change().order()
# Total Number of Subscribers per year
subscriberCnt = data['Subscription ID'].groupby(data['Year']).nunique()
# Total number of transactions per year
transactionCnt = data['Subscription ID'].groupby(data['Year']).count()
df = pd.concat([subscriberCnt, transactionCnt,revenue_by_year], axis=1).reset_index()
df.columns = ['Year',"subscriberCnt","transactionCnt","revenue(USD)"]
df.head()
#Split data into train and test sets
y = df.pop('revenue(USD)')
X = df
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
#create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training set
regr.fit(X_train,y_train)
# Coefficients
regr.coef_
# mean squared error
np.mean((regr.predict(X_test) - y_test) ** 2)
# variance score: 1 is perfect prediction
regr.score(X_test, y_test)
plt.scatter(X_test,y_test,color='black')
plt.plot(X_test,regr.predict(X_test),color='blue')
plt.xticks(())
plt.yticks(())
plt.show()