changes in .r file

34fd5d9b · Dhanush Kumar Reddy Narayana Reddy · 9373a56e · 34fd5d9b
Commit 34fd5d9b authored 4 months ago by Dhanush Kumar Reddy Narayana Reddy
--- a/R/predictive_modelling_flight_delay.R
+++ b/R/predictive_modelling_flight_delay.R
-# Load necessary libraries
-library(dplyr)
-library(nycflights13)
-library(caret)  # For data splitting and model evaluation
-
-# Assuming ridgereg is already defined and available
-# Load necessary package
-library(methods)
-
-# Define the ridgereg Reference Class for Ridge Regression
-ridgereg <- setRefClass("ridgereg",
-                        fields = list(
-                          formula = "formula",
-                          data = "data.frame",
-                          lambda = "numeric",
-                          coefficients = "numeric",
-                          intercept = "numeric"  # Intercept term
-                        ),
-
-                        methods = list(
-                          # Initialize method to set up the object
-                          initialize = function(formula, data, lambda) {
-                            .self$formula <- formula
-                            .self$data <- data
-                            .self$lambda <- lambda
-                            # Perform ridge regression computations upon initialization
-                            .self$compute_coefficients()
-                          },
-
-                          # Method to compute ridge regression coefficients with standardization
-                          compute_coefficients = function() {
-                            # Create model matrix and response vector
-                            X <- model.matrix(.self$formula, .self$data)
-                            y <- .self$data[[as.character(.self$formula[[2]])]]
-
-                            # Separate intercept column from X and center/scale only the predictors
-                            intercept_column <- X[, 1]  # The intercept column (all 1's)
-                            X_predictors <- X[, -1]      # Predictor columns only
-                            X_scaled <- scale(X_predictors, center = TRUE, scale = TRUE)
-                            y_centered <- y - mean(y)
-
-                            # Reconstruct X with intercept column and scaled predictors
-                            X_standardized <- cbind(Intercept = intercept_column, X_scaled)
-
-                            # Identity matrix for regularization (exclude intercept)
-                            I <- diag(ncol(X_standardized))
-                            I[1, 1] <- 0  # Do not regularize the intercept
-
-                            # Ridge regression calculation on standardized data
-                            beta_ridge <- solve(t(X_standardized) %*% X_standardized + .self$lambda * I) %*% t(X_standardized) %*% y_centered
-
-                            # Adjust coefficients back to original scale
-                            beta_unscaled <- beta_ridge[-1] / attr(X_scaled, "scaled:scale")  # Coefficients (excluding intercept)
-                            intercept_adjusted <- mean(y) - sum(beta_unscaled * attr(X_scaled, "scaled:center"))
-
-                            # Store coefficients
-                            .self$coefficients <- as.numeric(beta_unscaled)  # Convert matrix to numeric vector
-                            .self$intercept <- intercept_adjusted
-                          },
-
-                          # Method to print the model coefficients
-                          show = function() {
-                            cat("Ridge Regression Coefficients:\n")
-                            cat("Intercept:", .self$intercept, "\n")
-                            cat("Coefficients:\n")
-                            print(.self$coefficients)
-                          },
-
-                          # Method to predict new values
-                          predict = function(newdata = NULL) {
-                            if (is.null(newdata)) {
-                              X <- model.matrix(.self$formula, .self$data)
-                            } else {
-                              X <- model.matrix(.self$formula, newdata)
-                            }
-                            X %*% c(.self$intercept, .self$coefficients)
-                          },
-
-                          # Method to return the coefficients
-                          coef = function() {
-                            c(Intercept = .self$intercept, .self$coefficients)
-                          }
-                        ))
-
-# Example usage with mtcars dataset
-mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
-mod$show()
-mod$coef()
-
-mod <- ridgereg$new(formula = Sepal.Length ~ Sepal.Width+Petal.Length, data = iris, lambda = 0.1)
-mod$show()  # Use 'show' instead of 'print'
-#mod$predict(iris)
-mod$coef()
-
-
-
-# Updated Step 1: Preprocess and filter data to remove NA rows
+#' Ridge Regression Reference Class
+#'
+#' @description
+#' Fits a ridge regression from a formula, a data frame and a penalty `lambda`.
+#' Predictors are centred and scaled before the penalised least-squares solve;
+#' the intercept is not penalised, and the stored coefficients are converted
+#' back to the original scale of the predictors.
+#'
+#' @field formula A formula object specifying the regression model.
+#' @field data A data frame containing the variables in the model.
+#' @field lambda A single non-negative numeric ridge penalty.
+#' @field coefficients A named numeric vector of slope estimates on the
+#'   original predictor scale (intercept excluded).
+#' @field intercept A numeric value for the intercept term.
+#'
+#' @importFrom methods new setRefClass
+#' @import dplyr
+#' @import ggplot2
+#' @import nycflights13
+#'
+#' @examples
+#' # Example usage with mtcars dataset
+#' mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
+#' mod$show()
+#' mod$coef()
+#'
+#' mod <- ridgereg$new(
+#'   formula = Sepal.Length ~ Sepal.Width + Petal.Length,
+#'   data = iris,
+#'   lambda = 0.1
+#' )
+#' mod$show()
+#' mod$coef()
+#'
+#' @export
+ridgereg <- setRefClass(
+  "ridgereg",
+  fields = list(
+    formula = "formula",
+    data = "data.frame",
+    lambda = "numeric",
+    coefficients = "numeric",
+    intercept = "numeric"
+  ),
+  methods = list(
+    initialize = function(formula, data, lambda) {
+      "Store formula, data and lambda, then estimate the intercept and coefficients."
+      stopifnot(
+        is.numeric(lambda), length(lambda) == 1L,
+        !is.na(lambda), lambda >= 0
+      )
+      .self$formula <- formula
+      .self$data <- data
+      .self$lambda <- lambda
+
+      # Build the response and the design matrix from one model frame so rows
+      # dropped for missing values stay aligned, and so a transformed response
+      # such as log(y) is evaluated rather than looked up by column name.
+      mf <- stats::model.frame(formula, data)
+      y <- stats::model.response(mf)
+      X <- stats::model.matrix(attr(mf, "terms"), mf)
+
+      # Column 1 is the intercept (the formula is assumed to keep it);
+      # drop = FALSE keeps a one-predictor design as a named matrix.
+      X_scaled <- scale(X[, -1, drop = FALSE], center = TRUE, scale = TRUE)
+      X_standardized <- cbind(Intercept = X[, 1], X_scaled)
+
+      # Identity penalty with a zero for the intercept, which is not shrunk.
+      penalty <- diag(ncol(X_standardized))
+      penalty[1, 1] <- 0
+
+      # Solve the penalised normal equations directly instead of forming an
+      # explicit inverse.
+      beta_ridge <- solve(
+        crossprod(X_standardized) + lambda * penalty,
+        crossprod(X_standardized, y - mean(y))
+      )
+
+      # Map the slopes back to the original predictor scale.
+      beta_unscaled <- as.numeric(beta_ridge[-1]) /
+        as.numeric(attr(X_scaled, "scaled:scale"))
+      .self$coefficients <- stats::setNames(beta_unscaled, colnames(X_scaled))
+      .self$intercept <- mean(y) -
+        sum(beta_unscaled * attr(X_scaled, "scaled:center"))
+      invisible(.self)
+    },
+
+    show = function() {
+      "Print the intercept and the coefficients of the fitted model."
+      cat("Ridge Regression Coefficients:\n")
+      cat("Intercept:", intercept, "\n")
+      cat("Coefficients:\n")
+      print(coefficients)
+    },
+
+    predict = function(newdata = NULL) {
+      "Return predictions as a one-column matrix; uses the training data when newdata is NULL."
+      if (is.null(newdata)) {
+        newdata <- data
+      }
+      # Only the predictor side of the formula is needed, so newdata does not
+      # have to contain the response column. The terms are expanded against
+      # the training data so a `.` on the right-hand side keeps its meaning.
+      predictor_terms <- stats::delete.response(
+        stats::terms(formula, data = data)
+      )
+      X <- stats::model.matrix(predictor_terms, newdata)
+      X %*% c(intercept, coefficients)
+    },
+
+    coef = function() {
+      "Return the intercept followed by the coefficients as a named numeric vector."
+      c(Intercept = intercept, coefficients)
+    }
+  )
+)
+
+
+#' Prepare Flight Data
+#'
+#' @description
+#' Inner-joins `nycflights13::flights` with `nycflights13::weather` on year,
+#' month, day, hour and origin, keeps flights with a known `dep_delay`, adds two
+#' interaction terms, drops `origin` and removes rows containing any `NA`.
+#'
+#' @return A data frame with `dep_delay`, the selected time and weather
+#'   columns, `temp_wind_interaction` and `humid_precip_interaction`.
+#' @examples
+#' flight_data <- prepare_flight_data()
 prepare_flight_data <- function() {
  # Merge flights and weather data on common columns (year, month, day, hour, origin)
-  flight_data <- flights %>%
-    inner_join(weather, by = c("year", "month", "day", "hour", "origin")) %>%
-    filter(!is.na(dep_delay)) %>%
-    select(dep_delay, origin, month, day, hour, temp, dewp, humid, wind_speed, precip, visib) %>%
-    mutate(
+  flight_data <- nycflights13::flights %>%
+    dplyr::inner_join(nycflights13::weather, by = c("year", "month", "day", "hour", "origin")) %>%
+    dplyr::filter(!is.na(dep_delay)) %>%
+    dplyr::select(dep_delay, origin, month, day, hour, temp, dewp, humid, wind_speed, precip, visib) %>%
+    dplyr::mutate(
      temp_wind_interaction = temp * wind_speed,  # Example of an interaction term
      humid_precip_interaction = humid * precip
    ) %>%
-    select(-origin) %>%  # Remove categorical column to keep numeric features
+    dplyr::select(-origin) %>%  # Remove categorical column to keep numeric features
    na.omit()  # Remove any rows with NA values

  return(flight_data)
 }

+
 # Prepare the data
 flight_data <- prepare_flight_data()


-# Step 2: Split data into train, validation, and test sets
+# Split Flight Data ----
+#
+# Script section (not a function): partitions `flight_data` into training,
+# validation and test sets with caret::createDataPartition(). Training is
+# drawn with p = 0.8; the remaining rows are split with p = 0.75 into
+# validation (about 15% overall) and test (about 5% overall).
+#
+# Results are left in the variables `train_data`, `validation_data` and
+# `test_data`. set.seed(123) seeds the RNG before the partitions are drawn.
+#
 set.seed(123)
-trainIndex <- createDataPartition(flight_data$dep_delay, p = 0.8, list = FALSE)
+trainIndex <- caret::createDataPartition(flight_data$dep_delay, p = 0.8, list = FALSE)
 train_data <- flight_data[trainIndex, ]
 temp_data <- flight_data[-trainIndex, ]

 # Further split temp_data into validation (15%) and test (5%)
-validationIndex <- createDataPartition(temp_data$dep_delay, p = 0.75, list = FALSE)
+validationIndex <- caret::createDataPartition(temp_data$dep_delay, p = 0.75, list = FALSE)
 validation_data <- temp_data[validationIndex, ]
 test_data <- temp_data[-validationIndex, ]

-# Step 3: Train ridge regression models with different lambdas and evaluate RMSE
+
+# Train and Evaluate Ridge Regression Models ----
+#
+# Script section (not a function): sets up the candidate penalties in
+# `lambdas` and a same-length numeric vector `validation_rmse` that holds one
+# score per candidate. The best lambda is later chosen as the candidate with
+# the lowest value in `validation_rmse`.
+#
+# NOTE(review): the loop that fills `validation_rmse` is not visible here;
+# presumably it fits on `train_data` and scores on `validation_data` - confirm.
+#
 lambdas <- c(0.1, 1, 10, 100)  # Set of lambda values to try
 validation_rmse <- numeric(length(lambdas))

@@ -144,7 +232,14 @@ for (i in seq_along(lambdas)) {
 # Choose the best lambda based on lowest RMSE
 best_lambda <- lambdas[which.min(validation_rmse)]

-# Step 4: Train the final model using the best lambda and evaluate on test set
+# Train the Final Model and Evaluate on Test Set ----
+#
+# Script section (not a function): refits ridgereg on the row-bound training
+# and validation data using `best_lambda`, predicts `test_data`, and stores
+# the root mean squared error of those predictions in `test_rmse`.
+#
+# All columns other than `dep_delay` act as predictors (`dep_delay ~ .`).
+#
 final_model <- ridgereg$new(dep_delay ~ ., data = rbind(train_data, validation_data), lambda = best_lambda)
 test_predictions <- final_model$predict(test_data)
 test_rmse <- sqrt(mean((test_data$dep_delay - test_predictions)^2))