## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message=FALSE, warning=FALSE-------------------------------------- # Packages required for this vignette pkgs <- c( "rpart", "e1071", "dplyr", "tidyr", "ggplot2", "rsample", "gridExtra", "kableExtra", "palmerpenguins" ) # Load each package quietly if available invisible(lapply(pkgs, function(pkg) { if (!requireNamespace(pkg, quietly = TRUE)) { stop(sprintf("Package '%s' is required to run this vignette.", pkg)) } })) library(svmodt) ## ----------------------------------------------------------------------------- # Adelie vs Chinstrap penguins_data <- palmerpenguins::penguins |> dplyr::filter(species %in% c("Adelie", "Chinstrap")) |> dplyr::select( species, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g ) |> na.omit() |> dplyr::mutate(species = droplevels(species)) set.seed(234) split_data <- rsample::initial_split(penguins_data, prop = 0.8, strata = species) train_penguins <- rsample::training(split_data) test_penguins <- rsample::testing(split_data) ## ----penguins-train----------------------------------------------------------- # Train basic SVMODT tree_penguins <- svm_split( data = train_penguins, response = "species", max_depth = 3, max_features = 2, feature_method = "mutual", verbose = FALSE ) ## ----penguins-structure------------------------------------------------------- # Print tree structure print(tree_penguins, show_probabilities = TRUE, show_feature_info = TRUE ) ## ----penguins-predict--------------------------------------------------------- # Predict classes only predictions <- predict(tree_penguins, test_penguins) # Predict with probabilities predictions_prob <- predict(tree_penguins, test_penguins, return_probs = TRUE ) # View first few predictions head(data.frame( Actual = test_penguins$species, Predicted = predictions_prob$predictions, Prob_Adelie = round(predictions_prob$probabilities[, "Adelie"], 3), Prob_Chinstrap = round(predictions_prob$probabilities[, "Chinstrap"], 3) ), 10) |> kableExtra::kable(align = "lccc", format = "html", caption = "SVMODT Class Predictions with Associated Probabilites on Palmerpenguins dataset") |> kableExtra::kable_styling(position = "center", full_width = FALSE) ## ----penguins-viz, fig.height=6, fig.width=8---------------------------------- # Visualize tree decision boundaries viz_penguins <- plot(tree_penguins, data = train_penguins, response = "species", plot.type = "boundary", max_depth = 3) viz_penguins$plots$depth_1_Root ## ----penguins-viz-2, fig.height=6, fig.width=8-------------------------------- gridExtra::grid.arrange(viz_penguins$plots$depth_2_Root_L, viz_penguins$plots$depth_2_Root_R, ncol = 2) ## ----penguins-trace----------------------------------------------------------- trace_path(tree_penguins, test_penguins, sample_idx = 1) ## ----wdbc-prep---------------------------------------------------------------- set.seed(234) split_data <- rsample::initial_split(wdbc, prop = 0.8, strata = diagnosis) train_wdbc <- rsample::training(split_data) test_wdbc <- rsample::testing(split_data) ## ----wdbc-train--------------------------------------------------------------- tree_wdbc <- svm_split( data = train_wdbc, response = "diagnosis", max_depth = 4, min_samples = 10, max_features = 2, feature_method = "mutual", class_weights = "balanced", # For Class Imbalance verbose = FALSE ) ## ----wdbc-eval---------------------------------------------------------------- preds_wdbc <- predict(tree_wdbc, test_wdbc) cat("Accuracy:", round(mean(preds_wdbc == test_wdbc$diagnosis), 4), "\n") print(table(Predicted = preds_wdbc, Actual = test_wdbc$diagnosis)) ## ----feature-penalty---------------------------------------------------------- # Train with feature penalty tree_penalty <- svm_split( data = train_penguins, response = "species", max_depth = 4, max_features = 2, feature_method = "cor", penalize_used_features = TRUE, feature_penalty_weight = 0.6, verbose = FALSE ) ## ----dynamic-features--------------------------------------------------------- # Decrease features with depth tree_decrease <- svm_split( data = train_wdbc, response = "diagnosis", max_depth = 5, max_features = 10, max_features_strategy = "decrease", max_features_decrease_rate = 0.7, verbose = FALSE ) # Random feature selection tree_random <- svm_split( data = train_wdbc, response = "diagnosis", max_depth = 4, max_features_strategy = "random", max_features_random_range = c(0.3, 0.8), verbose = FALSE ) ## ----custom-weights----------------------------------------------------------- # Give malignant cases higher weight custom_weights <- c("B" = 1, "M" = 3) tree_custom <- svm_split( data = train_wdbc, response = "diagnosis", max_depth = 4, max_features = 8, class_weights = "custom", custom_class_weights = custom_weights, verbose = FALSE ) ## ----wine-prep---------------------------------------------------------------- set.seed(234) wine$class <- as.factor(wine$class) split_wine <- rsample::initial_split(wine, prop = 0.8, strata = class) train_wine <- rsample::training(split_wine) test_wine <- rsample::testing(split_wine) ## ----wine-train--------------------------------------------------------------- tree_wine <- svm_split( data = train_wine, response = "class", max_depth = 5, max_features = 5, feature_method = "mutual", impurity_measure = "entropy", min_impurity_decrease = 0.01, class_weights = "balanced", penalize_used_features = TRUE, feature_penalty_weight = 0.5, verbose = FALSE ) ## ----wine-multiclass-viz, fig.height=6, fig.width=8--------------------------- plot(tree_wine, data = train_wine, response = "class", plot.type = "surface") ## ----wine-structure----------------------------------------------------------- print(tree_wine, show_probabilities = FALSE, show_feature_info = TRUE, show_penalties = TRUE ) ## ----wine-eval---------------------------------------------------------------- preds_wine <- predict(tree_wine, newdata = test_wine) acc_wine <- mean(preds_wine == test_wine$class) cat("Test accuracy:", round(acc_wine, 4), "\n") conf_mat <- table(Predicted = preds_wine, Actual = test_wine$class) print(conf_mat) ## ----wine-trace--------------------------------------------------------------- # Show how the first test observation is routed through the tree trace_path(tree_wine, test_wine, sample_idx = 1) ## ----comparison--------------------------------------------------------------- # RPART decision tree tree_rpart <- rpart::rpart(diagnosis ~ ., data = train_wdbc, control = rpart::rpart.control(cp = 0.01) ) pred_rpart <- predict(tree_rpart, test_wdbc, type = "class") tree_wdbc <- svm_split( data = train_wdbc, response = "diagnosis", max_depth = 2, feature_method = "mutual", penalize_used_features = TRUE ) # Standard SVM model_svm <- e1071::svm(diagnosis ~ ., data = train_wdbc, probability = TRUE) pred_svm <- predict(model_svm, test_wdbc) # Get SVMODT predictions pred_svmodt <- predict(tree_wdbc, test_wdbc) # Compare accuracies results <- data.frame( Model = c("SVMODT", "RPART", "Linear SVM"), Accuracy = c( mean(pred_svmodt == test_wdbc$diagnosis), mean(pred_rpart == test_wdbc$diagnosis), mean(pred_svm == test_wdbc$diagnosis) ) ) results |> kableExtra::kable( align = "lc", format = "html", digits = 4, caption = "Comparing Test set Accuracy of SVMODT model with a Linear SVM and a Decision Tree" ) |> kableExtra::kable_styling(position = "center", full_width = FALSE)