RoxygenNote: 7.3.2
RoxygenNote: 7.3.2
Encoding: UTF-8
#' Predictive modeling of flight delays using ridgereg()
#' Ridge Regression Reference Class
#' This class implements ridge regression with methods for initialization, coefficient extraction,
#' prediction, and printing the model summary.
#' @field formula_string A character string representing the regression formula.
#' @field data_string A character string representing the data frame used.
#' @field formula A formula specifying the relationship between dependent and independent variables.
#' @field data A data frame containing the variables specified in the formula.
#' @field lambda A numeric value for the ridge penalty parameter.
#' @field coefficients A matrix containing the estimated ridge regression coefficients.
#' @field fitted_values A matrix of the fitted (predicted) values.
#' @field residuals A matrix of the residuals (difference between actual and fitted values).
#' @method initialize Initializes
#' @param formula A formula specifying the model.
#' @param data A data frame containing the variables in the formula.
#' @param lambda The ridge penalty parameter.
#' @method print Prints
#' @method predict Predicts
#' @param newdata A data frame for which to predict new values.
#' @return A vector of predicted values.
#' @method coef Extracts
#' @return A named vector of the regression coefficients.
#' @importFrom methods new
#' @import dplyr
#' @import ggplot2
#' @import nycflights13
#' Prepare Flight Data
#' @return A preprocessed data frame for modeling.
#' @examples
#' flight_data <- prepare_flight_data()
#' Split Flight Data
#' @return A list containing `train_data`, `validation_data`, and `test_data` data frames.
#' @examples
#' split_data <- split_flight_data(flight_data)
#' Train and Evaluate Ridge Regression Models
#' @return The best lambda value and the test RMSE for the final model.
#' @examples
#' best_lambda <- train_and_evaluate_ridge_regression(train_data, validation_data, test_data)
#' Train the Final Model and Evaluate on Test Set
#' @examples
#' final_rmse <- final_model_evaluation(best_lambda, train_data, validation_data, test_data)
#' @export
#' Ridge Regression Reference Class
#'
#' @field formula A formula object specifying the regression model.
#' @field data A data frame containing the variables in the model.
#' @field lambda A numeric value for the ridge penalty (lambda).
#' @field coefficients A named numeric vector of estimated coefficients for the
#'   predictors, expressed on the original (unscaled) predictor scale.
#' @field intercept A numeric value for the intercept term.
#' @description
#' This class performs ridge regression using a provided formula, data, and lambda parameter.
#' It normalizes covariates, calculates coefficients using linear algebra, and provides methods
#' to display the model, make predictions, and retrieve coefficients.
#' @examples
#' # Example usage with mtcars dataset
#' mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
#' mod$show()
#' mod$coef()
#' mod <- ridgereg$new(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1)
#' mod$show()
#' mod$coef()
ridgereg <- setRefClass(
  "ridgereg",
  fields = list(
    formula = "formula",
    data = "data.frame",
    lambda = "numeric",
    coefficients = "numeric",
    intercept = "numeric"
  ),
  methods = list(
    #' Initialize the Ridge Regression Model
    #' @param formula A formula specifying the regression model. The model is
    #'   expected to contain an intercept (the first design-matrix column is
    #'   treated as the intercept column).
    #' @param data A data frame containing the variables in the model.
    #' @param lambda A single non-negative numeric value for the ridge penalty.
    initialize = function(formula, data, lambda) {
      stopifnot(is.numeric(lambda), length(lambda) == 1L, lambda >= 0)

      formula <<- formula
      data <<- data
      lambda <<- lambda

      # Build the response and the design matrix from one model frame so that
      # rows dropped for missing values are dropped from both, and so that
      # transformed responses such as log(y) are evaluated correctly.
      mf <- stats::model.frame(formula, data)
      X <- stats::model.matrix(attr(mf, "terms"), mf)
      y <- stats::model.response(mf)

      # drop = FALSE keeps the matrix shape and the column names even when the
      # model has a single predictor.
      X_predictors <- X[, -1, drop = FALSE]
      X_scaled <- scale(X_predictors, center = TRUE, scale = TRUE)
      centers <- attr(X_scaled, "scaled:center")
      scales <- attr(X_scaled, "scaled:scale")
      y_mean <- mean(y)

      X_standardized <- cbind(Intercept = X[, 1], X_scaled)

      # Identity penalty with a zero in the first position: the intercept
      # column is not shrunk.
      penalty <- diag(ncol(X_standardized))
      penalty[1, 1] <- 0

      beta_ridge <- solve(
        crossprod(X_standardized) + lambda * penalty,
        crossprod(X_standardized, y - y_mean)
      )

      # Map the slopes back to the scale of the raw predictors and recover the
      # matching intercept.
      beta_unscaled <- as.numeric(beta_ridge[-1] / scales)
      coefficients <<- stats::setNames(beta_unscaled, colnames(X_predictors))
      intercept <<- y_mean - sum(beta_unscaled * centers)
    },

    #' Show the model coefficients
    #' Prints the intercept and coefficients for the ridge regression model.
    show = function() {
      cat("Ridge Regression Coefficients:\n")
      cat("Intercept:", intercept, "\n")
      labels <- names(coefficients)
      if (is.null(labels)) {
        labels <- paste0("beta", seq_along(coefficients))
      }
      for (k in seq_along(coefficients)) {
        cat(paste0(labels[k], ":"), coefficients[[k]], "\n")
      }
    },

    #' Predict using the Ridge Regression Model
    #' @param newdata Optional data frame for making predictions. If NULL,
    #'   predictions are made on the training data. The response column does
    #'   not have to be present in `newdata`.
    #' @return Predicted values as a one-column numeric matrix (the result of
    #'   the design matrix multiplied by the coefficient vector).
    predict = function(newdata = NULL) {
      if (is.null(newdata)) {
        X <- stats::model.matrix(formula, data)
      } else {
        # Expand the formula against the training data (resolves `.`) and
        # strip the response so that unlabeled data can be scored.
        rhs_terms <- stats::delete.response(stats::terms(formula, data = data))
        X <- stats::model.matrix(rhs_terms, newdata)
      }
      X %*% c(intercept, coefficients)
    },

    #' Get the Coefficients
    #' @return A named numeric vector of the intercept and coefficients.
    coef = function() {
      c(Intercept = intercept, coefficients)
    }
  )
)
#' Prepare Flight Data
#' @description
#' Preprocesses the `flights` and `weather` data from the `nycflights13` package.
#' The function merges, filters, and transforms the data, creating interaction terms.
#' It removes any rows with NA values.
#' @return A preprocessed data frame for modeling.
#' @examples
#' flight_data <- prepare_flight_data()
prepare_flight_data <- function() {
  # Merge flights and weather data on common columns (year, month, day, hour, origin)
  nycflights13::flights %>%
    dplyr::inner_join(
      nycflights13::weather,
      by = c("year", "month", "day", "hour", "origin")
    ) %>%
    # NOTE(review): the filter predicate was truncated in the source; dropping
    # rows without a recorded departure delay is assumed here -- confirm.
    dplyr::filter(!is.na(dep_delay)) %>%
    dplyr::select(
      dep_delay, origin, month, day, hour,
      temp, dewp, humid, wind_speed, precip, visib
    ) %>%
    dplyr::mutate(
      temp_wind_interaction = temp * wind_speed, # Example of an interaction term
      humid_precip_interaction = humid * precip
    ) %>%
    dplyr::select(-origin) %>% # Remove categorical column to keep numeric features
    stats::na.omit() # Remove any rows with NA values
}
# Build the modelling data set once at the top level; the partitioning code
# below reads `flight_data` directly.
flight_data <- prepare_flight_data()
#' Split Flight Data
#' @description
#' Splits the flight data into training, validation, and test sets. The training set contains
#' 80% of the data, while the validation and test sets contain 15% and 5%, respectively.
#' @return A list containing `train_data`, `validation_data`, and `test_data` data frames.
#' @examples
#' split_data <- split_flight_data(flight_data)
# Draw the row indices of the 80% training partition; the remaining 20% is
# held out and partitioned again below.
trainIndex <- caret::createDataPartition(
  y = flight_data[["dep_delay"]],
  p = 0.8,
  list = FALSE
)
temp_data <- flight_data[-trainIndex, ]
train_data <- flight_data[trainIndex, ]
# Three quarters of the held-out rows become the validation set (15% of the
# full data); the last quarter is the test set (5%).
validationIndex <- caret::createDataPartition(
  y = temp_data[["dep_delay"]],
  p = 0.75,
  list = FALSE
)
test_data <- temp_data[-validationIndex, ]
validation_data <- temp_data[validationIndex, ]
#' Train and Evaluate Ridge Regression Models
#' @description
#' Trains ridge regression models with different lambda values and evaluates them on a validation set.
#' The best lambda is chosen based on the lowest RMSE on the validation set.
#' @return The best lambda value and the test RMSE for the final model.
#' @examples
#' best_lambda <- train_and_evaluate_ridge_regression(train_data, validation_data, test_data)
# Candidate ridge penalties to compare on the validation set.
lambdas <- c(0.1, 1, 10, 100)
# One validation RMSE per candidate, preallocated so the loop fills it in place.
validation_rmse <- numeric(length(lambdas))
for (i in seq_along(lambdas)) {
  # Train a ridgereg model on the training data
  model <- ridgereg$new(dep_delay ~ ., data = train_data, lambda = lambdas[i])
  # Predict on validation data
  predictions <- model$predict(validation_data)
  # Calculate RMSE on validation set
  validation_rmse[i] <- sqrt(mean((validation_data$dep_delay - predictions)^2))
}
# Choose the best lambda based on lowest RMSE; this must run after the loop
# has filled every entry of validation_rmse.
best_lambda <- lambdas[which.min(validation_rmse)]
#' Train the Final Model and Evaluate on Test Set
#' @description
#' Trains the final ridge regression model using the best lambda value on the combined
#' training and validation data. Evaluates the model on the test set and calculates the RMSE.
#' @examples
#' final_rmse <- final_model_evaluation(best_lambda, train_data, validation_data, test_data)
# Refit with the selected penalty on the training and validation rows stacked
# together, then score the rows that were never used for fitting or tuning.
final_model <- ridgereg$new(
  formula = dep_delay ~ .,
  data = rbind(train_data, validation_data),
  lambda = best_lambda
)
test_predictions <- final_model$predict(newdata = test_data)
test_rmse <- sqrt(mean((test_data[["dep_delay"]] - test_predictions)^2))
# Report the chosen penalty and the resulting test error
cat("Best lambda:", best_lambda, "\n")
cat("Test RMSE:", test_rmse, "\n")

