Skip to content
Snippets Groups Projects
Commit efc9f0fc authored by Dhanush Kumar Reddy Narayana Reddy's avatar Dhanush Kumar Reddy Narayana Reddy
Browse files

edit in vignettes

parent 34fd5d9b
No related branches found
No related tags found
No related merge requests found
......@@ -11,7 +11,7 @@ Imports:
ggplot2,
dplyr,
nycflights13,
methods
methods,
RoxygenNote: 7.3.2
Encoding: UTF-8
Suggests:
......
#' Ridge Regression Reference Class
#'
#' @description
#' Reference class that fits a ridge regression from a formula, a data frame
#' and a penalty `lambda`. The predictors are centred and scaled, the
#' penalised least-squares system is solved with linear algebra, and the
#' resulting coefficients and intercept are transformed back to the original
#' scale of the data. The class is used further down in this file for the
#' predictive modeling of flight delays.
#'
#' @details
#' An intercept is always estimated and is never penalised. Rows with missing
#' values in a model variable are handled by `model.frame()` (dropped under
#' the default `na.action`), so the response and the predictors always stay
#' aligned. `predict()` only needs the predictor columns in `newdata`.
#'
#' @field formula A formula object specifying the regression model.
#' @field data A data frame containing the variables in the model.
#' @field lambda A single non-negative numeric value for the ridge penalty.
#' @field coefficients A named numeric vector of estimated coefficients for
#'   the predictors, on the original scale of the data.
#' @field intercept A numeric value for the intercept term.
#'
#' @examples
#' # Example usage with mtcars dataset
#' mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
#' mod$show()
#' mod$coef()
#'
#' mod <- ridgereg$new(
#'   formula = Sepal.Length ~ Sepal.Width + Petal.Length,
#'   data = iris,
#'   lambda = 0.1
#' )
#' mod$show()
#' mod$coef()
#'
#' @import dplyr
#' @import ggplot2
#' @import nycflights13
#' @importFrom methods new setRefClass
#' @export
ridgereg <- setRefClass(
  "ridgereg",
  fields = list(
    formula = "formula",
    data = "data.frame",
    lambda = "numeric",
    coefficients = "numeric",
    intercept = "numeric"
  ),
  methods = list(
    initialize = function(formula, data, lambda) {
      "Fit the ridge regression and store the coefficients and intercept."
      if (!is.numeric(lambda) || length(lambda) != 1L || is.na(lambda) ||
            lambda < 0) {
        stop("`lambda` must be a single non-negative number.", call. = FALSE)
      }
      formula <<- formula
      data <<- data
      lambda <<- lambda

      # Build response and design matrix from the same model frame so that
      # rows removed for missing values are removed from both.
      frame <- model.frame(formula, data)
      design <- model.matrix(attr(frame, "terms"), frame)
      y <- model.response(frame)

      # Drop the intercept column by name; `drop = FALSE` keeps a one-column
      # matrix (and its column name) when there is a single predictor.
      predictors <- design[, colnames(design) != "(Intercept)", drop = FALSE]
      if (ncol(predictors) == 0L) {
        stop("`formula` must contain at least one predictor.", call. = FALSE)
      }

      scaled <- scale(predictors, center = TRUE, scale = TRUE)
      centers <- attr(scaled, "scaled:center")
      scales <- attr(scaled, "scaled:scale")
      # A constant column has standard deviation exactly 0 and cannot be
      # standardised; fail with a clear message instead of producing NaN.
      if (any(is.na(scales) | scales == 0)) {
        stop("Every predictor must take at least two distinct values.",
             call. = FALSE)
      }

      # With centred predictors and a centred response the unpenalised
      # intercept is zero on the standardised scale, so only the slopes are
      # solved for here.
      y_mean <- mean(y)
      penalty <- diag(lambda, nrow = ncol(scaled))
      beta_scaled <- solve(
        crossprod(scaled) + penalty,
        crossprod(scaled, y - y_mean)
      )

      # Back-transform to the original predictor scale and keep the names.
      beta <- setNames(
        as.numeric(beta_scaled) / unname(scales),
        colnames(predictors)
      )
      coefficients <<- beta
      intercept <<- y_mean - sum(beta * centers)
    },
    show = function() {
      "Print the intercept and the coefficients of the fitted model."
      cat("Ridge Regression Coefficients:\n")
      cat("Intercept:", intercept, "\n")
      cat("Coefficients:\n")
      print(coefficients)
    },
    predict = function(newdata = NULL) {
      "Predict for newdata (or the training data when newdata is NULL).
      Returns a one-column matrix of predicted values."
      target <- if (is.null(newdata)) data else newdata
      # Expand the formula against the training data and remove the response
      # so that `newdata` only has to provide the predictor columns.
      model_terms <- delete.response(terms(formula, data = data))
      design <- model.matrix(model_terms, target)
      predictors <- design[, colnames(design) != "(Intercept)", drop = FALSE]
      intercept + predictors %*% coefficients
    },
    coef = function() {
      "Return a named numeric vector holding the intercept and coefficients."
      c(Intercept = intercept, coefficients)
    }
  )
)
#' Prepare Flight Data
#'
#' @description
#' Builds the modeling data set from the `flights` and `weather` tables of the
#' `nycflights13` package. The tables are inner-joined on `year`, `month`,
#' `day`, `hour` and `origin`, flights without a recorded `dep_delay` are
#' discarded, and only the delay, calendar and weather columns are kept.
#' Two interaction columns are then added
#' (`temp_wind_interaction = temp * wind_speed` and
#' `humid_precip_interaction = humid * precip`) and every row that still
#' contains an `NA` is removed.
#'
#' @return A preprocessed data frame for modeling with the columns
#'   `dep_delay`, `month`, `day`, `hour`, `temp`, `dewp`, `humid`,
#'   `wind_speed`, `precip`, `visib`, `temp_wind_interaction` and
#'   `humid_precip_interaction`.
#'
#' @examples
#' flight_data <- prepare_flight_data()
prepare_flight_data <- function() {
  # Columns shared by the flights and weather tables.
  join_keys <- c("year", "month", "day", "hour", "origin")
  joined <- dplyr::inner_join(
    nycflights13::flights,
    nycflights13::weather,
    by = join_keys
  )
  with_delay <- dplyr::filter(joined, !is.na(dep_delay))

  # The categorical `origin` column is left out so only numeric features
  # remain in the result.
  features <- dplyr::select(
    with_delay,
    dep_delay, month, day, hour, temp, dewp, humid, wind_speed, precip, visib
  )
  features <- dplyr::mutate(
    features,
    temp_wind_interaction = temp * wind_speed,
    humid_precip_interaction = humid * precip
  )

  # Remove any rows with NA values.
  na.omit(features)
}
# ---- Prepare the data ----
# Build the modeling data frame once; the steps below all read `flight_data`.
flight_data <- prepare_flight_data()
# ---- Split the flight data ----
# Splits `flight_data` into a training set (80%), a validation set (15%) and
# a test set (5%), stored in the top-level objects `train_data`,
# `validation_data` and `test_data`.
# NOTE(review): the roxygen header that used to sit here documented a function
# `split_flight_data(flight_data)` returning a list of the three data frames.
# No such function is defined in the visible source, so this is written as a
# plain script comment instead.
# Fix the RNG seed before the index sampling below.
# NOTE(review): presumably for reproducible partitions -- confirm intent.
set.seed(123)
# Row indices for the training share (p = 0.8); `list = FALSE` is requested so
# the result can be used directly as a row index.
trainIndex <- caret::createDataPartition(flight_data$dep_delay, p = 0.8, list = FALSE)
train_data <- flight_data[trainIndex, ]
# Everything not selected for training (the remaining 20%).
temp_data <- flight_data[-trainIndex, ]
# Further split temp_data into validation (15%) and test (5%):
# p = 0.75 of the remaining 20% is 15% of the full data.
validationIndex <- caret::createDataPartition(temp_data$dep_delay, p = 0.75, list = FALSE)
validation_data <- temp_data[validationIndex, ]
test_data <- temp_data[-validationIndex, ]
# ---- Train and evaluate ridge regression models ----
# Fits one ridgereg model per candidate lambda on `train_data`, scores each
# model by its RMSE on `validation_data`, and stores the lambda with the
# lowest validation RMSE in `best_lambda`.
# NOTE(review): the roxygen header that used to sit here documented a function
# `train_and_evaluate_ridge_regression(train_data, validation_data, test_data)`.
# No such function is defined in the visible source, so this is written as a
# plain script comment instead.
lambdas <- c(0.1, 1, 10, 100) # Set of lambda values to try
# Preallocate one RMSE slot per candidate lambda.
validation_rmse <- numeric(length(lambdas))
for (i in seq_along(lambdas)) {
# Train a ridgereg model on the training data, using every other column of
# `train_data` as a predictor of `dep_delay`
model <- ridgereg$new(dep_delay ~ ., data = train_data, lambda = lambdas[i])
# Predict on validation data
predictions <- model$predict(validation_data)
# Calculate RMSE on validation set
validation_rmse[i] <- sqrt(mean((validation_data$dep_delay - predictions)^2))
}
# Choose the best lambda based on lowest RMSE (which.min() returns the first
# minimum, so ties go to the earlier value in `lambdas`)
best_lambda <- lambdas[which.min(validation_rmse)]
# ---- Train the final model and evaluate on the test set ----
# Refits the ridge regression with `best_lambda` on the combined training and
# validation data, predicts `dep_delay` for `test_data`, and reports the RMSE
# of those predictions.
# NOTE(review): the roxygen header that used to sit here documented a function
# `final_model_evaluation(best_lambda, train_data, validation_data, test_data)`.
# No such function is defined in the visible source, so this is written as a
# plain script comment instead.
# Stack training and validation rows so the final fit uses both.
final_model <- ridgereg$new(dep_delay ~ ., data = rbind(train_data, validation_data), lambda = best_lambda)
test_predictions <- final_model$predict(test_data)
# Root mean squared error between observed and predicted departure delays.
test_rmse <- sqrt(mean((test_data$dep_delay - test_predictions)^2))
# Print the selected lambda and the test RMSE
cat("Best lambda:", best_lambda, "\n")
cat("Test RMSE:", test_rmse, "\n")
......@@ -95,4 +95,3 @@ This function visualizes the mean flight delay at each origin airport in the nyc
```{r}
visualize_airport_delays()
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment