This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Template example chunk: plot the built-in `cars` data set.
plot(x = cars)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
RStudio is able to simulate the final formatting live, by switching from “Source” to “Visual” in the task bar above.
The file module1-video_reading.csv contains the following data:
participant: unique id for each participant
score_reading: number of points the participant scored in a reading test
hours_video: average number of hours per day the participant spends watching video streams (TV, movies, ...).
First, we have to install all packages that we need for the following tasks.
# Only run (by uncommenting) if the packages are not installed yet (comment out again after installation):
#install.packages("tidyverse", dependencies = TRUE)
#install.packages("mlr3verse", dependencies = TRUE)
# Read the video/reading data set and show its first rows.
dat <- read.csv(file = "module1-video_reading.csv", header = TRUE)
head(dat, n = 6L)
Fit a linear regression of score_reading on hours_video using the base R lm() function.
# Simple linear regression of reading score on video hours (base R).
# The fit has to be stored as `mdl`, the name summary() is called with on the
# next line; a stray "function." prefix had turned the target into a different
# object (`function.mdl`), leaving `mdl` undefined.
mdl <- lm(score_reading ~ hours_video, data = dat)
summary(mdl)
##
## Call:
## lm(formula = score_reading ~ hours_video, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.853 -6.666 0.039 6.046 52.424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.6583 1.6863 28.262 <2e-16 ***
## hours_video -0.1258 0.5089 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.387 on 98 degrees of freedom
## Multiple R-squared: 0.0006233, Adjusted R-squared: -0.009574
## F-statistic: 0.06112 on 1 and 98 DF, p-value: 0.8053
Now fit the same model using the mlr3verse package. Does your
final model output (applying the summary() function on the
fitted model object) differ from the one in task 3, which was estimated
using base R functionality?
library(mlr3verse)
## Loading required package: mlr3
# Same regression via mlr3: define the regression task, create the linear-model
# learner, and train the learner on the task.
tsk <- as_task_regr(score_reading ~ hours_video, data = dat)
mdl <- lrn("regr.lm")
mdl$train(task = tsk)
# Summarise the fitted model held in `$model` (same result as with lm()).
summary(mdl$model)
##
## Call:
## stats::lm(formula = task$formula(), data = task$data())
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.853 -6.666 0.039 6.046 52.424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.6583 1.6863 28.262 <2e-16 ***
## hours_video -0.1258 0.5089 -0.247 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.387 on 98 degrees of freedom
## Multiple R-squared: 0.0006233, Adjusted R-squared: -0.009574
## F-statistic: 0.06112 on 1 and 98 DF, p-value: 0.8053
library(ggplot2)
# Scatter plot with hours_video on the x axis and score_reading on the y axis.
scatter_plot <- ggplot(
  data = dat,
  mapping = aes(x = hours_video, y = score_reading)
) +
  geom_point()
scatter_plot
# Add the regression line of the first model (`mdl`) to the scatter plot in blue.
# Intercept and slope are passed as fixed layer parameters instead of inside
# aes(): an aes() mapping inherits the plot data (one identical line drawn per
# row) and is only evaluated when the plot is rendered, so it would pick up
# whatever `mdl` refers to at that later point.
scatter_plot <- scatter_plot +
  geom_abline(
    intercept = coef(mdl$model)[["(Intercept)"]],
    slope = coef(mdl$model)[["hours_video"]],
    colour = "blue"
  )
scatter_plot
# Alternative solution:
#scatter_plot <- ggplot(dat, aes(x=hours_video, y=score_reading)) +
# geom_point() +
# geom_smooth(method='lm')
#scatter_plot
# Exclude participant 70 and refit the regression on the remaining rows.
dat2 <- dat[dat[["participant"]] != 70, ]
# mlr3verse:
task2 <- as_task_regr(score_reading ~ hours_video, data = dat2)
mdl2 <- lrn("regr.lm")
mdl2$train(task = task2)
summary(mdl2$model)
##
## Call:
## stats::lm(formula = task$formula(), data = task$data())
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.7277 -6.3708 0.6861 5.4776 13.3032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.8564 1.3909 35.844 < 2e-16 ***
## hours_video -1.1383 0.4324 -2.632 0.00987 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.562 on 97 degrees of freedom
## Multiple R-squared: 0.06667, Adjusted R-squared: 0.05705
## F-statistic: 6.929 on 1 and 97 DF, p-value: 0.009867
# base R:
#mdl2 <- lm(score_reading ~ hours_video, data = dat2)
#summary(mdl2)
# plotting:
# Add the regression line of the refitted model (`mdl2`) in red. Intercept and
# slope are passed as fixed layer parameters instead of inside aes(): an aes()
# mapping inherits the plot data (one identical line drawn per row) and is only
# evaluated when the plot is rendered.
scatter_plot <- scatter_plot +
  geom_abline(
    intercept = coef(mdl2$model)[["(Intercept)"]],
    slope = coef(mdl2$model)[["hours_video"]],
    colour = "red"
  )
scatter_plot
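Predict the reading score for a participant whose hours_video lies at the 95% quantile of the data
(hint: use the predict_newdata() method on the fitted model
object, which behaves similarly to the predict() function in
base R).
dat3 <- data.frame('hours_video' = quantile(dat2$hours_video, probs = 0.95))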
# mlr3verse: predict score_reading for the rows of dat3 with the refitted model
# and store the predicted response as a new column of dat3.
dat3[["score_reading"]] <- mdl2$predict_newdata(newdata = dat3)$response
dat3[["score_reading"]]
## [1] 43.09493
# base R:
#dat3$score_reading <- predict(mdl2, newdata = dat3)
#dat3$score_reading
# Mark the prediction in the plot: a dashed vertical line at the hours_video
# value in dat3 and a large red cross at the predicted point.
scatter_plot <- scatter_plot +
  geom_vline(
    data = dat3,
    mapping = aes(xintercept = hours_video),
    linetype = 2
  ) +
  geom_point(data = dat3, colour = "red", size = 5, shape = "cross")
scatter_plot
# z-standardize data to get standardized regression coefficients.
# scale() returns a one-column matrix with centering/scaling attributes;
# as.numeric() stores plain numeric columns in dat2 instead of matrix columns.
dat2$score_reading_z <- as.numeric(scale(dat2$score_reading))
dat2$hours_video_z <- as.numeric(scale(dat2$hours_video))
# estimate simple (bivariate) regression model on the standardized variables
## mlr3verse:
task_z <- as_task_regr(score_reading_z ~ hours_video_z, data = dat2)
mdl_z <- lrn("regr.lm")
mdl_z$train(task_z)
summary(mdl_z$model)
##
## Call:
## stats::lm(formula = task$formula(), data = task$data())
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01967 -0.81810 0.08811 0.70341 1.70832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.566e-16 9.759e-02 0.000 1.00000
## hours_video_z -2.582e-01 9.809e-02 -2.632 0.00987 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9711 on 97 degrees of freedom
## Multiple R-squared: 0.06667, Adjusted R-squared: 0.05705
## F-statistic: 6.929 on 1 and 97 DF, p-value: 0.009867
## base R:
#mdl_z <- lm(score_reading_z ~ hours_video_z, data = dat2)
#summary(mdl_z)
# Cross-check against a direct Pearson correlation test on the unstandardized
# variables of dat2.
cor.test(x = dat2$hours_video, y = dat2$score_reading, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: dat2$hours_video and dat2$score_reading
## t = -2.6323, df = 97, p-value = 0.009867
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4335232 -0.0640633
## sample estimates:
## cor
## -0.2582096