jseabold
diff --git a/Diff for: ‎contrasts.ipynb
+79-387 b/Diff for: ‎contrasts.ipynb
+79-387
diff --git a/Diff for: ‎contrasts.py
+28-14 b/Diff for: ‎contrasts.py
+28-14
@@ -5,6 +5,10 @@
 
 # Contrasts Overview
 
+# <codecell>
+
+import statsmodels.api as sm
+
 # <markdowncell>
 
 # This document is based heavily on this excellent resource from UCLA http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm
@@ -24,9 +28,13 @@
 # <codecell>
 
 import pandas
-url = 'http://www.ats.ucla.edu/stat/R/notes/hsb2_nolabel.csv'
+url = 'http://www.ats.ucla.edu/stat/data/hsb2.csv'
 hsb2 = pandas.read_table(url, delimiter=",")
 
+# <codecell>
+
+hsb2.head(10)
+
 # <rawcell>
 
 # It will be instructive to look at the mean of the dependent variable, write, for each level of race (1 = Hispanic, 2 = Asian, 3 = African American, and 4 = Caucasian).
@@ -56,16 +64,24 @@
 
 # <codecell>
 
+hsb2.race.head(10)
+
+# <codecell>
+
 print contrast.matrix[hsb2.race-1, :][:20]
 
+# <codecell>
+
+sm.categorical(hsb2.race.values)
+
 # <rawcell>
 
 # This is a bit of a trick, as the `race` category conveniently maps to zero-based indices. When it does not, the conversion happens under the hood, so this trick won't work in general, but it is nonetheless a useful exercise to fix ideas. The code below illustrates the output using the three contrasts above.
 
 # <codecell>
 
 from statsmodels.formula.api import ols
-mod = ols("write ~ C(race, Treatment)", df=hsb2)
+mod = ols("write ~ C(race, Treatment)", data=hsb2)
 res = mod.fit()
 print res.summary()
 
@@ -106,12 +122,16 @@ def code_without_intercept(self, levels):
 
 # <codecell>
 
+hsb2.groupby('race')['write'].mean().mean()
+
+# <codecell>
+
 contrast = Simple().code_without_intercept(levels)
 print contrast.matrix
 
 # <codecell>
 
-mod = ols("write ~ C(race, Simple)", df=hsb2)
+mod = ols("write ~ C(race, Simple)", data=hsb2)
 res = mod.fit()
 print res.summary()
 
@@ -131,7 +151,7 @@ def code_without_intercept(self, levels):
 
 # <codecell>
 
-mod = ols("write ~ C(race, Sum)", df=hsb2)
+mod = ols("write ~ C(race, Sum)", data=hsb2)
 res = mod.fit()
 print res.summary()
 
@@ -159,7 +179,7 @@ def code_without_intercept(self, levels):
 
 # <codecell>
 
-mod = ols("write ~ C(race, Diff)", df=hsb2)
+mod = ols("write ~ C(race, Diff)", data=hsb2)
 res = mod.fit()
 print res.summary()
 
@@ -187,10 +207,9 @@ def code_without_intercept(self, levels):
 contrast = Helmert().code_without_intercept(levels)
 print contrast.matrix
 
-
 # <codecell>
 
-mod = ols("write ~ C(race, Helmert)", df=hsb2)
+mod = ols("write ~ C(race, Helmert)", data=hsb2)
 res = mod.fit()
 print res.summary()
 
@@ -224,12 +243,7 @@ def code_without_intercept(self, levels):
 
 # <codecell>
 
-_, bins = np.histogram(hsb2.read, 3)
-try: # requires numpy master
-   readcat = np.digitize(hsb2.read, bins, True)
-except:
-   readcat = np.digitize(hsb2.read, bins)
-hsb2['readcat'] = readcat
+hsb2['readcat'] = pandas.cut(hsb2.read, bins=3)
 hsb2.groupby('readcat').mean()['write']
 
 # <codecell>
@@ -241,7 +255,7 @@ def code_without_intercept(self, levels):
 
 # <codecell>
 
-mod = ols("write ~ C(readcat, Poly)", df=hsb2)
+mod = ols("write ~ C(readcat, Poly)", data=hsb2)
 res = mod.fit()
 print res.summary()