reload

DragonflyStats · Oct 25, 2023 · c6d21bd · c6d21bd
1 parent a01e0b7
commit c6d21bd
Show file tree

Hide file tree

Showing 66 changed files with 4,686 additions and 0 deletions.
diff --git a/Coding/janitor/adorn_functions.Rmd b/Coding/janitor/adorn_functions.Rmd
@@ -0,0 +1,10 @@
+#### adorn_ functions
+
+```{r}
+library(janitor)
+```
+
+
+```{r}
+# Call adorn_totals() on iris with its default arguments.
+adorn_totals(iris)
+```
diff --git a/Coding/janitor/tabyl.Rmd b/Coding/janitor/tabyl.Rmd
@@ -0,0 +1,18 @@
+# NOT RUN {
+# Example calls to tabyl() on the built-in mtcars data frame, passing
+# one, two and then three column names.
+tabyl(mtcars, cyl)
+tabyl(mtcars, cyl, gear)
+tabyl(mtcars, cyl, gear, am)
+
+# The same two-column call, with the data frame supplied through the %>% pipe.
+mtcars %>%
+  tabyl(cyl, gear)
+
+# Illustrating the show_na argument: append a row of 11 NA values to mtcars,
+# then call tabyl() with the default and with show_na = FALSE.
+my_cars <- rbind(mtcars, rep(NA, 11))
+my_cars %>% tabyl(cyl)
+my_cars %>% tabyl(cyl, show_na = FALSE)
+
+# Calling tabyl() on a single character vector that is not in a data.frame.
+val <- c("hi", "med", "med", "lo")
+tabyl(val)
+# }
diff --git a/Correlation.Rmd b/Correlation.Rmd
@@ -0,0 +1,54 @@
+---
+title: "Correlation"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+## Correlation
+
+A correlation coefficient is a number between -1 and 1 which measures the degree to which two variables are linearly related. 
+If there is a perfect linear relationship with positive slope between the two variables, we have a correlation coefficient of 1.
+
+If there is positive correlation, whenever one variable has a high (low) value, so does the other.
+
+If there is a perfect linear relationship with negative slope between the two variables, we have a correlation coefficient of -1; if there is negative correlation, whenever one variable has a high (low) value, the other has a low (high) value.
+
+A correlation coefficient of 0 means that there is no linear relationship between the variables.
+
+We can determine the Pearson Correlation coefficient in R using the <tt>cor()</tt> command.
+
+### Testing Correlation
+
+To get a more complete statistical analysis, with formal tests, we can use the command <tt>cor.test()</tt>
+
+The interpretation of the output from the <tt>cor.test()</tt> procedure is very similar to procedures we have already encountered. 
+The null hypothesis is that the correlation coefficient is equal to zero. This is equivalent to saying that there is no linear relationship between variables.
+
+```{r}
+# Two paired numeric samples of length 7, assigned with `<-` (the idiomatic
+# R assignment operator).
+# NOTE: the name `F` masks R's built-in shorthand `F` (FALSE) for the rest of
+# the session; it is kept because later chunks refer to `C` and `F`.
+C <- c(0, 2, 4, 6, 8, 10, 12)
+F <- c(2.1, 5.0, 9.0, 12.6, 17.3, 21.0, 24.7)
+# Pearson correlation test; the null hypothesis is a correlation of zero.
+cor.test(C, F)
+```
+
+### Spearman and Kendall Correlation
+
+Spearman and Kendall correlations are both ***rank correlations***. 
+To implement Spearman and Kendall correlation, simply specify the type in the <tt>method=" "</tt> argument.
+
+```{r}
+# Pearson is the default method of cor(); spelled out here for comparison.
+cor(C, F, method = "pearson")
+
+```
+
+```{r}
+# Spearman rank correlation
+cor(C, F, method = "spearman")
+```
+
+```{r}
+# Kendall rank correlation
+cor(C, F, method = "kendall")
+
+```
+
+The interpretation is very similar, but there are no confidence intervals for the estimates.
diff --git a/Creating_Tables_Janitor.Rmd b/Creating_Tables_Janitor.Rmd
@@ -0,0 +1,32 @@
+library(tidyverse)
+library(janitor)
+library(magrittr)
+library(scales)
+
+
+
+# Example 1: one row per NAME with a single numeric VALUE.
+# (The original assigned DF twice: `DF <- DF <- data.frame(...)`.)
+DF <- data.frame(
+  NAME = c("A", "B", "C", "D", "E", "F"),
+  VALUE = c(54, 39, 51, 38, 44, 29)
+)
+
+
+
+# Add each NAME's share of the total as a formatted percentage, spread the
+# table into a single wide row, then order the columns A..F so that each
+# NAME's VALUE_ and PCT_ columns sit next to each other.
+DF %>%
+  dplyr::mutate(PCT = scales::percent(VALUE / sum(VALUE))) %>%
+  tidyr::pivot_wider(names_from = NAME, values_from = c("VALUE", "PCT")) %>%
+  dplyr::select(
+    contains("_A"), contains("_B"), contains("_C"),
+    contains("_D"), contains("_E"), contains("_F")
+  )
+
+
+
+# Example 2: every GRP x NAME combination (3 x 6 = 18 rows) with a random
+# VALUE. sample() is unseeded, so the numbers differ from run to run.
+DF2 <- data.frame(
+  expand.grid(
+    GRP = c("GRP1", "GRP2", "GRP3"),
+    NAME = c("A", "B", "C", "D", "E", "F")
+  ),
+  VALUE = sample(3000:8000, 18)
+)
+
+
+# Percentages are computed within each GRP; the grouping is dropped
+# explicitly once they exist so later steps work on a plain table.
+# across(where(is.numeric), ...) replaces the superseded mutate_if().
+DF2 %>%
+  group_by(GRP) %>%
+  mutate(PCT = scales::percent(VALUE / sum(VALUE), accuracy = 0.01)) %>%
+  ungroup() %>%
+  pivot_wider(id_cols = "GRP", names_from = NAME, values_from = c("VALUE", "PCT")) %>%
+  select(
+    GRP,
+    contains("_A"), contains("_B"), contains("_C"),
+    contains("_D"), contains("_E"), contains("_F")
+  ) %>%
+  janitor::adorn_totals("both") %>%
+  mutate(across(where(is.numeric), scales::comma))
diff --git a/DataVisualization/Diamonds-Data-Visualizations.Rmd b/DataVisualization/Diamonds-Data-Visualizations.Rmd
@@ -0,0 +1,63 @@
+---
+title: "Diamonds Data set"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Draft
+
+### ggplot2 visualizations with the Diamonds Data set
+
+```{r warning=FALSE,echo=FALSE,message=FALSE}
+library(dplyr)
+library(magrittr)
+library(ggplot2)
+
+data(diamonds)
+
+```
+
+
+### Make a Subset of the Data
+
+```{r}
+# Keep only the rows whose color is D, E or F.
+diamonds2 <- diamonds %>% filter(color %in% c("D", "E", "F"))
+
+# Mean depth for every cut/color combination. `.groups = "drop"` returns an
+# ungrouped tibble and silences summarize()'s regrouping message; without it
+# the result stays grouped by `cut`.
+diamonds.report <- diamonds2 %>%
+  group_by(cut, color) %>%
+  summarize(mean.depth = mean(depth), .groups = "drop")
+```
+
+```{r}
+diamonds.report 
+```
+
+```{r}
+# Base plot object shared by the bar charts below: cut on the x axis,
+# mean depth on the y axis, fill mapped to color.
+p <- ggplot(
+  data = diamonds.report,
+  mapping = aes(x = cut, y = mean.depth, fill = color)
+)
+```
+
+```{r}
+# Side-by-side (dodged) bars, one per color within each cut.
+p +
+  geom_bar(stat = "identity", position = "dodge") +
+  scale_fill_brewer(palette = "Greens", direction = -1) +
+  labs(title = "Diamonds", y = "Mean Depth") +
+  theme_bw() +
+  theme(
+    axis.title.x = element_text(color = "black", size = 14, face = "bold"),
+    axis.title.y = element_text(color = "black", size = 14, face = "bold")
+  )
+```
+
+```{r}              
+# Same chart with the color bars stacked on top of each other within each cut.
+p +
+  geom_bar(stat = "identity", position = "stack") +
+  scale_fill_brewer(palette = "Greens", direction = -1) +
+  labs(title = "Diamonds", y = "Mean Depth") +
+  theme_bw() +
+  theme(
+    axis.title.x = element_text(color = "black", size = 14, face = "bold"),
+    axis.title.y = element_text(color = "black", size = 14, face = "bold")
+  )
+```
diff --git a/DataVisualization/Diamonds-Data-Visualizations.html b/DataVisualization/Diamonds-Data-Visualizations.html
diff --git a/ExploratoryDataAnalysis.Rmd → ...Visualization/ExploratoryDataAnalysis.Rmd b/ExploratoryDataAnalysis.Rmd → ...Visualization/ExploratoryDataAnalysis.Rmd
diff --git a/ExploratoryDataAnalysis.html → ...isualization/ExploratoryDataAnalysis.html b/ExploratoryDataAnalysis.html → ...isualization/ExploratoryDataAnalysis.html
diff --git a/...s_files/figure-html/unnamed-chunk-6-1.png → ...s_files/figure-html/unnamed-chunk-6-1.png b/...s_files/figure-html/unnamed-chunk-6-1.png → ...s_files/figure-html/unnamed-chunk-6-1.png
diff --git a/DataVisualization/ggplot2_Smoothing.Rmd b/DataVisualization/ggplot2_Smoothing.Rmd
@@ -0,0 +1,38 @@
+---
+title: "Smoothed Regression Fits with ggplot2"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(ggplot2)
+```
+
+### How to smooth R data in ggplot2
+
+The ggplot2 package also makes it very easy to create regression lines through your data. You use the <tt>stat_smooth()</tt> function to create this type of line.
+
+The interesting thing about <tt>stat_smooth()</tt> is that it makes use of local regression by default. R has several functions that can do this, but ggplot2 uses the loess() function for local regression. 
+
+This means that if you want to create a linear regression model you have to tell <tt>stat_smooth()</tt> to use a different smoother function. You do this with the method argument.
+
+### Unemployment Dataset 
+
+To illustrate the use of a smoother, start by creating a scatterplot of employment (the `Employed` variable) against `Year` in the longley dataset:
+
+```{r} 
+# Scatterplot of Employed against Year.
+ggplot(data = longley, mapping = aes(x = Year, y = Employed)) +
+  geom_point()
+```
+Next, add a smoother. This is as simple as adding stat_smooth() to your line of code.
+
+```{r}
+# Same scatterplot with stat_smooth() added, using its default settings.
+ggplot(data = longley, mapping = aes(x = Year, y = Employed)) +
+  geom_point() +
+  stat_smooth()
+```
+
+Finally, tell stat_smooth to use a linear regression model. You do this by adding the argument method="lm".
+
+```{r}
+# Same scatterplot with the smoother switched to a linear model fit.
+ggplot(data = longley, mapping = aes(x = Year, y = Employed)) +
+  geom_point() +
+  stat_smooth(method = "lm")
+```
diff --git a/DataVisualization/ggplot2_Smoothing.html b/DataVisualization/ggplot2_Smoothing.html
diff --git a/...ization/rsconnect/documents/Diamonds-Data-Visualizations.Rmd/rpubs.com/rpubs/Document.dcf b/...ization/rsconnect/documents/Diamonds-Data-Visualizations.Rmd/rpubs.com/rpubs/Document.dcf
@@ -0,0 +1,11 @@
+name: Document
+title:
+username:
+account: rpubs
+server: rpubs.com
+hostUrl: rpubs.com
+appId: https://api.rpubs.com/api/v1/document/1014354/b327ec44354a4db2aa97d7c8609c5719
+bundleId: https://api.rpubs.com/api/v1/document/1014354/b327ec44354a4db2aa97d7c8609c5719
+url: http://rpubs.com/publish/claim/1014354/6b8aada335754a4cacd9cd8e98bfc583
+when: 1678620509.74095
+lastSyncTime: 1678620509.74095
diff --git a/...etteplot.Rmd/rpubs.com/rpubs/Document.dcf → ...etteplot.Rmd/rpubs.com/rpubs/Document.dcf b/...etteplot.Rmd/rpubs.com/rpubs/Document.dcf → ...etteplot.Rmd/rpubs.com/rpubs/Document.dcf
diff --git a/DataVisualization/rsconnect/documents/ggplot2_Smoothing.Rmd/rpubs.com/rpubs/Document.dcf b/DataVisualization/rsconnect/documents/ggplot2_Smoothing.Rmd/rpubs.com/rpubs/Document.dcf
@@ -0,0 +1,11 @@
+name: Document
+title:
+username:
+account: rpubs
+server: rpubs.com
+hostUrl: rpubs.com
+appId: https://api.rpubs.com/api/v1/document/1016200/ed480e95c6e449bba88b9d9c68aef3ac
+bundleId: https://api.rpubs.com/api/v1/document/1016200/ed480e95c6e449bba88b9d9c68aef3ac
+url: http://rpubs.com/publish/claim/1016200/7bf270c4c6a94c5e8a8984bfdb26aab8
+when: 1678989461.68381
+lastSyncTime: 1678989461.68381
diff --git a/GitHub - Shortcut.lnk b/GitHub - Shortcut.lnk
diff --git a/Homicides.R b/Homicides.R
@@ -0,0 +1,42 @@
+## Step 1: load the raw data set.
+## readLines() returns a character vector with one element per line of the file.
+homicides <- readLines("homicides.txt")
+
+## Number of cases (lines read)
+length(homicides)
+
+## Look at the first case and its length in characters
+homicides[1]
+nchar(homicides[1])
+
+## Print the first case in 70-character slices so it is readable
+
+substr(homicides[1], 1, 70)
+substr(homicides[1], 71, 140)
+substr(homicides[1], 141, 210)
+substr(homicides[1], 211, nchar(homicides[1]))
+
+
+##########################################
+
+# Key piece of information comes at the end.
+# "Cause: ......."
+# > substr(homicides[1],241,nchar(homicides[1]))
+# [1] "</dd><dd>Cause: shooting</dd></dl>'"
+
+##########################################
+
+# Some cases don't list a cause of death.
+# Which ones?
+
+# seq_along() is safe even if `homicides` is empty (1:length(x) would give
+# c(1, 0) in that case).
+SetA <- seq_along(homicides)       # Set of all case numbers
+SetB <- grep("Cause", homicides)   # Set of cases with "Cause" listed
+
+# Case numbers that never mention "Cause"
+setdiff(SetA, SetB)
+
+#  No Cause of death: 212 213 236 238 515
+#  Advise reading these cases to make sure.
+
+# > homicides[515]
+# [1] "\t\t\t\t\t\t"
+
+# grep() takes the pattern first and the text to search second. The original
+# call had them swapped, which used the first case as the regular expression
+# and searched the single string "Cause".
+grep("Cause", homicides)
diff --git a/Logistic-Regression-South-African-Heart-Disease.Rmd b/Logistic-Regression-South-African-Heart-Disease.Rmd
@@ -0,0 +1,89 @@
+---
+title: "Logistic Regression: South African Heart Disease Data"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+### South Africa Heart Disease Data Example
+
+A retrospective sample of males in a heart-disease high-risk region
+	of the Western Cape, South Africa. There are roughly two controls per
+	case of CHD. Many of the CHD positive men have undergone blood
+	pressure reduction treatment and other programs to reduce their risk
+	factors after their CHD event. In some cases the measurements were
+	made after these treatments.
+**These data are taken from a larger
+	dataset, described in Rousseauw et al, 1983, South African Medical
+	Journal.**
+
+---------------------------------
+
+Load the South Africa Heart Disease Data and create training and test sets with
+the following code:
+
+```{r}
+# install.packages("ElemStatLearn")
+library(ElemStatLearn)
+data(SAheart)
+```
+
+```{r}
+head(SAheart)
+```
+
+```{r}
+set.seed(8484)
+train <- sample(seq_len(nrow(SAheart)),
+                size = nrow(SAheart) / 2, replace = FALSE)
+trainSA <- SAheart[train, ]
+testSA <- SAheart[-train, ]
+```
+
+**Exercise**
+
+Fit a logistic regression model with
+
+- *Coronary Heart Disease* (`chd`) as the
+  dependent variable
+
+- *age at onset, current alcohol consumption, obesity levels,
+  cumulative tobacco, type-A behavior*, and *low density lipoprotein cholesterol* as predictor variables.
+
+
+```
+> head(SAheart)
+sbp tobacco  ldl adiposity famhist typea obesity alcohol age chd
+1 160   12.00 5.73     23.11 Present    49   25.30   97.20  52   1
+2 144    0.01 4.41     28.61  Absent    55   28.87    2.06  63   1
+3 118    0.08 3.48     32.28 Present    52   29.14    3.81  46   0
+4 170    7.50 6.41     38.03 Present    51   31.99   24.26  58   1
+5 134   13.60 3.50     27.78 Present    60   25.99   57.34  49   1
+6 132    6.20 6.47     36.21 Present    62   30.77   14.14  45   0
+...
+...
+```
+
+
+
+Calculate the misclassification rate for your model using this model function and a prediction on the "response" scale:
+
+What is the misclassification rate on the training set? What is the misclassification rate on the test set?
+\begin{framed}
+	\begin{verbatim}
+	head(SAheart)
+
+	lr1 <- glm(chd ~ age + alcohol + obesity + 
+	tobacco + typea + ldl, data=trainSA, 
+	family="binomial")
+
+	lr1.train.predict <- predict(lr1, type="response")
+
+	missclass.lr1.train <- missClass(trainSA$chd, 
+	lr1.train.predict)
+
+	lr1.test.predict <- predict(lr1, newdata=testSA, 
+	type="response")
+
+	missclass.lr1.test <- missClass(testSA$chd, 
+	lr1.test.predict)