## ----setup, include=FALSE----------------------------------------------------- library(visStatistics) knitr::opts_chunk$set( fig.width = 7.5, # Smaller device = smaller text fig.height = 5, out.width = '100%') ## ----install-github, eval = FALSE--------------------------------------------- # install_github("shhschilling/visStatistics") ## ----load, eval = FALSE------------------------------------------------------- # library(visStatistics) ## ----npk, fig.show='hide'----------------------------------------------------- #Standardised form visstat(npk$block,npk$yield) # Using formula interface visstat(yield~block,data=npk) #Backward-compatible form visstat(npk,"yield","block") ## ----fig-overview, echo=FALSE, results='asis'--------------------------------- if (knitr::opts_knit$get("rmarkdown.pandoc.to") == "html") { cat('
Overview of implemented tests .

Overview of the implemented statistical tests based on the class of the variables.

') } else { cat(' \\begin{center} \\fbox{% \\begin{minipage}{0.95\\linewidth} \\centering \\includegraphics[width=\\linewidth]{figures/overview.png}\\\\ \\vspace{0.5em} \\textit{Decision tree used to select the appropriate statistical test for a categorical predictor and numeric response, based on the number of factor levels, normality, and homoscedasticity.} \\end{minipage} } \\end{center} ') } ## ----fig-decision-switch, echo=FALSE, results='asis'-------------------------- if (knitr::opts_knit$get("rmarkdown.pandoc.to") == "html") { cat('
Decision tree used to select the appropriate statistical test.

Decision tree used to select the appropriate statistical test for a categorical predictor and numeric response, based on the sample sizes, number of factor levels, normality, and homoscedasticity.

') } else { cat(' \\begin{center} \\fbox{% \\begin{minipage}{0.95\\linewidth} \\centering \\includegraphics[width=\\linewidth]{figures/decision_tree.png}\\\\ \\vspace{0.5em} \\textit{Decision tree used to select the appropriate statistical test for a categorical predictor and numeric response, based on the sample sizes, number of factor levels, normality, and homoscedasticity.} \\end{minipage} } \\end{center} ') } ## ----------------------------------------------------------------------------- mtcars$am <- as.factor(mtcars$am) t_test_statistics <- visstat(mtcars$am, mtcars$mpg) ## ----------------------------------------------------------------------------- grades_gender <- data.frame( sex = as.factor(c(rep("girl", 21), rep("boy", 23))), grade = c( 19.3, 18.1, 15.2, 18.3, 7.9, 6.2, 19.4, 20.3, 9.3, 11.3, 18.2, 17.5, 10.2, 20.1, 13.3, 17.2, 15.1, 16.2, 17.0, 16.5, 5.1, 15.3, 17.1, 14.8, 15.4, 14.4, 7.5, 15.5, 6.0, 17.4, 7.3, 14.3, 13.5, 8.0, 19.5, 13.4, 17.9, 17.7, 16.4, 15.6, 17.3, 19.9, 4.4, 2.1 ) ) wilcoxon_statistics <- visstat(grades_gender$sex, grades_gender$grade) ## ----ordinal, fig.show='hide'------------------------------------------------- set.seed(123) # Create predictor: Customer segment (2 groups) segment <- factor(rep(c("Budget", "Premium"), each = 50)) # Create response: Likert scale ratings (1-5) satisfaction_numeric <- c( sample(1:5, 50, replace = TRUE, prob = c(0.15, 0.25, 0.30, 0.20, 0.10)), # Budget sample(1:5, 50, replace = TRUE, prob = c(0.05, 0.10, 0.20, 0.35, 0.30)) # Premium ) # Create dataframe with ORDERED response survey_data <- data.frame( segment = segment, satisfaction = ordered(satisfaction_numeric) # Declare as ordered ) # triggers warnings and use Wilcoxon test wilcox_ordered <- visstat(survey_data, "satisfaction", "segment") ## ----------------------------------------------------------------------------- anova_npk <- visstat(npk$block,npk$yield,conf.level=0.95) ## ----results='hide'----------------------------------------------------------- visstat(iris$Species, iris$Petal.Width) ## ----ordinal-kruskal, fig.show='hide'----------------------------------------- set.seed(123) # Predictor: customer segment (3 groups) segment <- factor(rep(c("Budget", "Standard", "Premium"), each = 50)) # Response: Likert scale ratings (1-5), with a deliberate trend across segments comfort_numeric <- c( sample(1:5, 50, replace = TRUE, prob = c(0.30, 0.30, 0.20, 0.15, 0.05)), # Budget sample(1:5, 50, replace = TRUE, prob = c(0.10, 0.20, 0.40, 0.20, 0.10)), # Standard sample(1:5, 50, replace = TRUE, prob = c(0.05, 0.10, 0.20, 0.35, 0.30)) # Premium ) # Dataframe with ORDERED response survey_data_3 <- data.frame( segment = segment, comfort = ordered(comfort_numeric) # Declare as ordered ) # triggers warning and uses Kruskal-Wallis test kruskal_ordered <- visstat(survey_data_3, "comfort", "segment") ## ----------------------------------------------------------------------------- linreg_swiss <- visstat(swiss$Examination, swiss$Fertility, conf.level = 0.99) ## ----fig.show='hide'---------------------------------------------------------- result_ozone0 <- visstat(airquality$Wind, airquality$Ozone) ## ----------------------------------------------------------------------------- result_ozone1 <- visstat(airquality$Wind, airquality$Ozone, correlation = TRUE) ## ----------------------------------------------------------------------------- # Remove zeros to satisfy Gamma requirements airquality_clean <- subset(airquality, Ozone > 0) # Gamma model with log mapping model_gamma <- glm(Ozone ~ Wind, data = airquality_clean, family = Gamma(link = "log")) summary(model_gamma) ## ----echo=FALSE--------------------------------------------------------------- # Plotting the data with Gamma model overlay plot(airquality_clean$Wind, airquality_clean$Ozone, log = "y", pch = 19, col = "darkgrey", xlab = "Wind (mph)", ylab = "Ozone (ppb) [log scale]", main = "Gamma GLM Fit (Log Link)") # Generate predictions for the overlay wind_seq <- seq(min(airquality_clean$Wind), max(airquality_clean$Wind), length.out = 100) preds <- predict(model_gamma, newdata = data.frame(Wind = wind_seq), type = "response") lines(wind_seq, preds, col = "red", lwd = 2) legend("topright", legend = c("Data", "Gamma GLM (log link)"), col = c("darkgrey", "red"), pch = c(19, NA), lty = c(NA, 1), lwd = c(NA, 2)) ## ----------------------------------------------------------------------------- # Extract standardised residuals std_dev_res <- rstandard(model_gamma, type = "deviance") # Validate using the Shapiro-Wilk normality test shapiro.test(std_dev_res) # Validate using the Anderson-Darling normality test nortest::ad.test(std_dev_res) ## ----------------------------------------------------------------------------- HairEyeColourDataFrame <- counts_to_cases(as.data.frame(HairEyeColor)) ## ----------------------------------------------------------------------------- hair_eye_colour_df <- counts_to_cases(as.data.frame(HairEyeColor)) visstat(hair_eye_colour_df$Eye, hair_eye_colour_df$Hair) ## ----------------------------------------------------------------------------- hair_black_brown_eyes_brown_blue <- HairEyeColor[1:2, 1:2, ] # Transform to data frame hair_black_brown_eyes_brown_blue_df <- counts_to_cases(as.data.frame(hair_black_brown_eyes_brown_blue)) # Chi-squared test with Yates' continuity correction visstat(hair_black_brown_eyes_brown_blue_df$Eye, hair_black_brown_eyes_brown_blue_df$Hair) ## ----fisher-data-prep--------------------------------------------------------- hair_eye_colour_male <- HairEyeColor[, , 1] # Slice out a 2 by 2 contingency table black_brown_hazel_green_male <- hair_eye_colour_male[1:2, 3:4] # Transform to data frame black_brown_hazel_green_male <- counts_to_cases(as.data.frame(black_brown_hazel_green_male)) # Fisher test fisher_stats <- visstat(black_brown_hazel_green_male$Eye, black_brown_hazel_green_male$Hair) ## ----fisher-or---------------------------------------------------------------- fisher_stats$estimate # odds ratio fisher_stats$conf.int # 95% CI ## ----kendall-example---------------------------------------------------------- set.seed(42) n <- 150 # Latent scores with deliberate negative monotone association: # higher alcohol consumption (xs) -> lower academic performance (ys) xs <- sample(1:5, n, replace = TRUE) ys <- pmin(5, pmax(1, (6 - xs) + sample(-1:1, n, replace = TRUE))) likert_levels <- c("never", "rarely", "sometimes", "often", "always") likert_levels2 <- c("poor", "fair", "ok", "good", "great") alcohol <- ordered(likert_levels[xs], levels = likert_levels) performance <- ordered(likert_levels2[ys], levels = likert_levels2) kendall_result <- visstat(performance, alcohol, correlation = TRUE) ## ----fisher-save-graphics----------------------------------------------------- # Graphical output written to plotDirectory: In this example # a single bar chart showing absolute counts. # Output file: chi_squared_or_fisher_Hair_Eye.png save_fisher = visstat(black_brown_hazel_green_male$Eye, black_brown_hazel_green_male$Hair, graphicsoutput = "png", plotDirectory = tempdir()) ## ----show-path2--------------------------------------------------------------- paths <- attr(save_fisher, "plot_paths") print(paths) ## ----eval=TRUE---------------------------------------------------------------- file.remove(paths) ## ----visstat-methods, fig.show='hide', results='hide'------------------------- iris_kruskal <- visstat(iris$Species, iris$Petal.Width) ## ----visstat-methods-print-summary-------------------------------------------- print(iris_kruskal) summary(iris_kruskal) ## ----visstat-methods-plot-paths----------------------------------------------- iris_kruskal_stored <- visstat(iris$Species, iris$Petal.Width, graphicsoutput = "pdf", plotName = "iris_kruskal", plotDirectory = tempdir()) plot(iris_kruskal_stored) ## ----visstat-methods-plot-replay---------------------------------------------- plot(iris_kruskal) ## ----visstat-methods-cleanup, echo = FALSE------------------------------------ file.remove(attr(iris_kruskal_stored, "plot_paths"))