diff --git a/blogContent/posts/data-science/html/IntroToR.html b/blogContent/posts/data-science/html/IntroToR.html new file mode 100644 index 0000000..8509946 --- /dev/null +++ b/blogContent/posts/data-science/html/IntroToR.html @@ -0,0 +1,1293 @@ + + + + + + Introduction to R + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+

Introduction to R

Jeffery Russell
9-30-19

+ +
+

RIT Scientific Computing Group

+ +

R

+ +
+ +
+
+

History

+
+
    +
  • Back in the day (1976) Bell Laboratories created the S statistical programming language
  • +
  • People were sad because it was exclusively licensed by AT&T
  • +
  • During the 1990s, a group of people developed an S replacement called R, which was released under the GNU General Public License (GPL)
  • +
+ +

+ +
+ +
+
+

Why use R

+
+
    +
  • Statistics and data analysis
  • +
  • Machine learning
  • +
  • Fast prototyping
  • +
  • Creating graphs
  • +
  • Writing research papers and reports
  • +
+ +
+ +
+
+

R Markdown

+
+

Using R Markdown, you can easily create reports and presentations by embedding your code in the report. +This has major advantages.

+ +
    +
  • Easier to reproduce other people's research
  • +
  • Easy to re-run your analysis if the data changes – no more re-generating 10 charts and re-computing statistical tests
  • +
  • Easy to export to other formats
  • +
+ +
+ +
+
+

Embedding Code Output in a Document

+
+
summary(cars)
+
+ +
     speed           dist       
+ Min.   : 4.0   Min.   :  2.00  
+ 1st Qu.:12.0   1st Qu.: 26.00  
+ Median :15.0   Median : 36.00  
+ Mean   :15.4   Mean   : 42.98  
+ 3rd Qu.:19.0   3rd Qu.: 56.00  
+ Max.   :25.0   Max.   :120.00  
+
+ +
+ +
+
+

Embedding Graphs in a Document

+
+
plot(mtcars$wt, mtcars$mpg, main="Weight vs MPG", xlab = "weight", ylab="MPG")
+
+ +

plot of chunk unnamed-chunk-2

+ +
+ +
+
+

Syntax

+
+
    +
  • The syntax of R is C-esque, with its use of curly braces
  • +
  • Variables are similar to Python's: you do not declare a type, because R infers it from the value assigned
  • +
  • The type system is rather unusual: the base unit for everything is a vector, so even a single integer is a vector of length one
  • +
+ +
x <- 0
+if (x < 0) 
+{
+  print("Negative number")
+} else if (x > 0) 
+{
+  print("Positive number")
+} else
+{
+  print("Zero")
+}
+
+ +
[1] "Zero"
+
+ +
+ +
+
+

Syntax: For Loops

+
+
for(i in 1:5)
+{
+  print(i)
+}
+
+ +
[1] 1
+[1] 2
+[1] 3
+[1] 4
+[1] 5
+
+ +
for(i in (1:5)*2)
+{
+  print(i)
+}
+
+ +
[1] 2
+[1] 4
+[1] 6
+[1] 8
+[1] 10
+
+ +
+ +
+
+

Syntax: While Loops

+
+
x <- 2
+
+while(x == 2)
+{
+  print("Stonks")
+  x = x + 1
+}
+
+ +
[1] "Stonks"
+
+ +
+ +
+
+

Syntax cont

+
+

+ +
+ +
+
+

Syntax: Arrays

+
+

Arrays are 1-indexed.

+ +
for(i in c(1,4,5))
+{
+  print(i)
+}
+
+ +
[1] 1
+[1] 4
+[1] 5
+
+ +
ar <- c(1,3,9)
+
+print(ar[2])
+
+ +
[1] 3
+
+ +
+ +
+
+

Getting Help

+
+

Using the built-in help function, you can view the documentation for any function.

+ +
help(plot)
+
+ +
+ +
+
+

Making Graphs

+
+
plot(x=1:10, y=(1:10)^2, xlab = "x", ylab = "y", main="Ez")
+
+ +

plot of chunk unnamed-chunk-10

+ +
+ +
+
+

ML Example pt: 1

+
+
ggplot(data = iris) + 
+  theme(plot.title = element_text(hjust = 0.5)) + 
+  ggtitle("Iris Flower Set") + 
+  geom_point(mapping = aes(x=Sepal.Length, y=Petal.Length, color = Species)) + 
+  labs(x = "Sepal Length", y = "Petal Length", color="Phase") +
+  theme_bw()
+
+ +

plot of chunk unnamed-chunk-11

+ +
+ +
+
+

ML Example pt: 2

+
+
ggplot(data = iris) + theme(plot.title = element_text(hjust = 0.5)) + 
+  ggtitle("Iris Flow Length") + 
+  geom_boxplot(mapping = aes(y=Petal.Length, x = Species), outlier.colour = "red", outlier.shape = 1) + 
+  labs(x = "Flower Type", y = "Petal Length") +
+  coord_flip() +
+  theme_bw()
+
+ +

plot of chunk unnamed-chunk-12

+ +
+ +
+
+

Super Cool ML Example pt: 3

+
+
sc <- spark_connect(master = "local")
+
+iris_tbl <- sdf_copy_to(sc, iris, name = "iris_tbl", overwrite = TRUE)
+
+partitions <- iris_tbl %>%
+  sdf_partition(training = 0.7, test = 0.3, seed = 1111)
+
+iris_training <- partitions$training
+iris_test <- partitions$test
+
+dt_model <- iris_training %>%
+  ml_decision_tree(Species ~ .)
+
+pred <- ml_predict(dt_model, iris_test)
+
+ml_multiclass_classification_evaluator(pred)
+
+ +
[1] 0.9451737
+
+ +
+ +
+
+

Resources

+ + +
+
+

Questions?

+
+

+ +
+ +
+
+

Workshop

+
+

Visualize the built-in “mpg” dataset from the tidyverse library.

+ +

Step 1: Install the tidyverse package in R and load it in your R script

+ +
# Install tidyverse
+install.packages("tidyverse")
+
+# Include tidyverse in project
+
+library(tidyverse)
+
+ +

+ +
+ +
+
+

Engine Size Vs MPG Normal Plot

+
+
plot(x = mpg$displ, y=mpg$hwy, main="Engine Size(Liters) vs MPG")
+
+ +

plot of chunk unnamed-chunk-14

+ +
+ +
+
+

Engine Size Vs MPG

+
+
ggplot(data = mpg) + geom_point(mapping=aes(x=displ, y = hwy))
+
+ +

plot of chunk unnamed-chunk-15

+ +
+ +
+
+

Engine Size Vs MPG with color Mappings

+
+
ggplot(data = mpg) + geom_point(mapping=aes(x=displ, y = hwy, color=class))
+
+ +

plot of chunk unnamed-chunk-16

+ +
+ +
+
+

Engine Size Vs MPG with shape Mappings

+
+
ggplot(data = mpg) + geom_point(mapping=aes(x=displ, y = hwy, shape=class))
+
+ +

plot of chunk unnamed-chunk-17

+ +
+ +
+
+

Engine Size Vs MPG For each Class

+
+
ggplot(data = mpg) + geom_point(mapping=aes(x=displ, y = hwy)) + facet_wrap(~ class, nrow=3)
+
+ +

plot of chunk unnamed-chunk-18

+ +
+ +
+
+

Engine Size Vs MPG With Line of Best Fit

+
+
ggplot(data = mpg) + 
+  geom_point(aes(x=displ, y = hwy, color=class))+
+  geom_smooth(aes(x=displ, y = hwy))
+
+ +

plot of chunk unnamed-chunk-19

+ +
+ +
+
+

Engine Size vs MPG With Line of Best Fit

+
+
ggplot(data = mpg, aes(x=displ, y = hwy, color=class)) + 
+  geom_point()+
+  geom_smooth()
+
+ +

plot of chunk unnamed-chunk-20

+ +
+ +
+ + +
+
+ + + + + + + + + +