Skip to content
Snippets Groups Projects
Commit 68cdb62c authored by Dhanush Kumar Reddy Narayana Reddy's avatar Dhanush Kumar Reddy Narayana Reddy
Browse files

change in files

parent 1c244f0f
No related branches found
No related tags found
No related merge requests found
Package: group5labbonus Package: group5labbonus
Type: Package Type: Package
Title: Ridge Regression Analysis in R Title: Linear Regression and Ridge Regression Analysis in R
Version: 1.0 Version: 1.0
Date: 2024-11-03 Date: 2024-11-03
Author: Manu Jain [aut], Dhanush Kumar Reddy Narayana Reddy [cre] Author: Manu Jain [aut], Dhanush Kumar Reddy Narayana Reddy [cre]
Maintainer: Dhanush Kumar Reddy Narayana Reddy to <dhana004@student.liu.se> Maintainer: Dhanush Kumar Reddy Narayana Reddy to <dhana004@student.liu.se>
Description: A package for fitting ridge regression models with methods for model initialization, coefficient extraction, prediction, and model summary printing. The package is designed for educational and data analysis purposes, providing straightforward access to ridge regression for regression modeling with a regularization parameter. Description: This package provides the linear and ridge regression models using reference class. The package enables users to establish relationships between variables by employing linear algebra while incorporating a regularisation parameter to enhance regression modelling. Users can effectively perform linear regression on datasets, making it a valuable tool for learning and practical data analysis applications.
License: GPL-3 + file LICENSE License: GPL-3 + file LICENSE
Imports: Imports:
ggplot2,
dplyr,
nycflights13,
methods methods
RoxygenNote: 7.3.2 RoxygenNote: 7.3.2
Encoding: UTF-8 Encoding: UTF-8
......
# Generated by roxygen2: do not edit by hand # Generated by roxygen2: do not edit by hand
export(linreg)
export(ridgereg) export(ridgereg)
export(visualize_airport_delays)
import(dplyr)
import(ggplot2)
import(nycflights13)
importFrom(methods,new) importFrom(methods,new)
#' linreg Reference Class
#'
#' A reference class for performing linear regression using the least squares method.
#' Initialize the linreg class
#'
#' @param formula A formula specifying the model.
#' @param data A data frame containing the variables in the formula.
#' @field formula_string A character string representing the regression formula.
#' @field data_string A character string representing the data frame used.
#' @field formula A formula specifying the relationship between dependent and independent variables.
#' @field data A data frame containing the variables specified in the formula.
#' @field coefficients A matrix containing the estimated regression coefficients.
#' @field fitted_values A matrix of the fitted values (predicted values).
#' @field residuals A matrix of the residuals (difference between actual and fitted values).
#' @field df The degrees of freedom, calculated as the number of observations minus the number of predictors.
#' @field residual_variance The estimated residual variance.
#' @field var_coefficients A matrix representing the variance of the coefficients.
#' @field t_values A matrix of t-values for the estimated coefficients.
#' @field p_values A matrix of p-values for the t-tests of the coefficients.
#'
#' @importFrom methods new
#' @export linreg
#' @import ggplot2
linreg <- setRefClass(
"linreg",
fields = list(
formula_string = "character",
data_string = "character",
formula = "formula",
data = "data.frame",
coefficients = "matrix",
fitted_values = "matrix",
residuals = "matrix",
df = "numeric",
residual_variance = "numeric",
var_coefficients = "matrix",
t_values = "matrix",
p_values = "matrix"
),
methods = list(
initialize = function(formula, data) {
formula_string <<- deparse(formula)
data_string <<- deparse(substitute(data))
X <- model.matrix(formula, data)
y <- data[[all.vars(formula)[1]]]
n <- length(y)
p <- ncol(X)
df <<- n - p
beta_cap <- solve(crossprod(X)) %*% t(X) %*% y
coefficients <<- beta_cap
fitted_values <<- X %*% beta_cap
residuals <<- y - fitted_values
residual_variance <<- as.numeric(crossprod(residuals)) / df
var_coefficients <<- residual_variance * solve(crossprod(X))
t_values <<- beta_cap / sqrt(diag(var_coefficients))
p_values <<- 2 * pt(-abs(t_values), df)
},
print = function() {
class_name <- class(.self)[1]
cat("Call:\n")
cat(class_name,"(formula = ",formula_string ,", data = ",data_string,")","\n", sep = "")
cat("\nCoefficients:\n")
coef_names <- rownames(coefficients)
coef_values <- as.vector(coefficients)
names(coef_values) <- coef_names
print.default(coef_values)
},
plot = function() {
# Residuals vs Fitted plot (red points)
p1 <- ggplot2::ggplot(data.frame(Fitted = fitted_values, Residuals = residuals),
ggplot2::aes(x = Fitted, y = Residuals)) +
ggplot2::geom_point() +
ggplot2::stat_summary(fun = "median", colour = "red", size = 1, geom = "path")+
ggplot2::geom_hline(yintercept = 0, linetype = "dashed") +
ggplot2::labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals")
# Scale-Location Plot (blue points)
p2 <- ggplot2::ggplot(data.frame(Fitted = fitted_values,
Scaled_Residuals = sqrt(residuals/(sqrt(abs(residual_variance))))),
ggplot2::aes(x = Fitted, y = Scaled_Residuals)) +
ggplot2::geom_point() + # Set points to blue
ggplot2::stat_summary(fun = "median", colour = "red", size = 1, geom = "path")+
#geom_hline(yintercept = 0, linetype = "dashed") +
ggplot2::labs(title = "Scale-Location", x = "Fitted Values", y = "\u221A|Standardized Residuals|")
# Return both plots in a list
two_graphs <- list(p1, p2)
return(two_graphs)
},
resid = function() {
return(as.vector(residuals))
},
pred = function() {
return(as.vector(fitted_values))
},
coef = function() {
named_coefs <- setNames(as.vector(coefficients), rownames(coefficients))
return(named_coefs)
},
summary = function() {
summary_df <- data.frame(
Estimate_C = round(as.vector(coefficients), 3),
`Std.Error` = round(sqrt(diag(var_coefficients)), 3),
`t value` = round(as.vector(t_values), 3),
`Pr(>|t|)` = round(as.vector(p_values), 3),
check.names = FALSE
)
sig <- character(length(p_values))
for (i in 1:length(p_values)) {
p_value <- p_values[i]
if (p_value < 0.001) {
sig[i] <- "***"
} else if (p_value < 0.01) {
sig[i] <- "**"
} else if (p_value < 0.05) {
sig[i] <- "*"
} else {
sig[i] <- ""
}
}
summary_df$sign <- sig
cat("Coefficients:\n")
names(summary_df)[5]<-""
print.data.frame(summary_df)
cat("\nResidual standard error:", sqrt(residual_variance), "on", df, "degrees of freedom\n")
}
)
)
#model <- linreg$new(Petal.Length~Species, data = iris)
#model <- linreg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
#model$print()
#model$plot()
#model$resid()
#model$pred()
#model$coef()
#model$summary()
...@@ -26,90 +26,76 @@ ...@@ -26,90 +26,76 @@
#' @return A named vector of the regression coefficients. #' @return A named vector of the regression coefficients.
#' @importFrom methods new #' @importFrom methods new
#' @export ridgereg #' @export ridgereg
# Load necessary package
library(methods)
# Define the ridgereg Reference Class for Ridge Regression # Define the ridgereg Reference Class for Ridge Regression
ridgereg <- setRefClass("ridgereg", ridgereg <- setRefClass(
fields = list( "ridgereg",
formula = "formula", fields = list(
data = "data.frame", formula = "formula",
lambda = "numeric", data = "data.frame",
coefficients = "numeric", lambda = "numeric",
intercept = "numeric" # Intercept term coefficients = "numeric",
), intercept = "numeric" # Intercept term
),
methods = list(
initialize = function(formula, data, lambda) {
formula <<- formula
data <<- data
lambda <<- lambda
methods = list( X <- model.matrix(formula, data)
# Initialize method to set up the object y <- data[[as.character(formula[[2]])]]
initialize = function(formula, data, lambda) { intercept_column <- X[, 1] # The intercept column (all 1's)
.self$formula <- formula X_predictors <- X[, -1] # Predictor columns only
.self$data <- data X_scaled <- scale(X_predictors, center = TRUE, scale = TRUE)
.self$lambda <- lambda y_centered <- y - mean(y)
# Perform ridge regression computations upon initialization
.self$compute_coefficients()
},
# Method to compute ridge regression coefficients with standardization # Reconstruct X with intercept column and scaled predictors
compute_coefficients = function() { X_standardized <- cbind(Intercept = intercept_column, X_scaled)
# Create model matrix and response vector # Identity matrix for regularization (exclude intercept)
X <- model.matrix(.self$formula, .self$data) I <- diag(ncol(X_standardized))
y <- .self$data[[as.character(.self$formula[[2]])]] I[1, 1] <- 0 # Do not regularize the intercept
# Separate intercept column from X and center/scale only the predictors # Ridge regression calculation on standardized data
intercept_column <- X[, 1] # The intercept column (all 1's) beta_ridge <- solve(t(X_standardized) %*% X_standardized + lambda * I) %*% t(X_standardized) %*% y_centered
X_predictors <- X[, -1] # Predictor columns only
X_scaled <- scale(X_predictors, center = TRUE, scale = TRUE)
y_centered <- y - mean(y)
# Reconstruct X with intercept column and scaled predictors # Adjust coefficients back to original scale
X_standardized <- cbind(Intercept = intercept_column, X_scaled) beta_unscaled <- beta_ridge[-1] / attr(X_scaled, "scaled:scale") # Coefficients (excluding intercept)
intercept_adjusted <- mean(y) - sum(beta_unscaled * attr(X_scaled, "scaled:center"))
# Identity matrix for regularization (exclude intercept) # Store coefficients
I <- diag(ncol(X_standardized)) coefficients <<- as.numeric(beta_unscaled) # Convert matrix to numeric vector
I[1, 1] <- 0 # Do not regularize the intercept intercept <<- intercept_adjusted
},
# Ridge regression calculation on standardized data # Method to print the model coefficients
beta_ridge <- solve(t(X_standardized) %*% X_standardized + .self$lambda * I) %*% t(X_standardized) %*% y_centered show = function() {
cat("Ridge Regression Coefficients:\n")
# Adjust coefficients back to original scale cat("Intercept:", intercept, "\n")
beta_unscaled <- beta_ridge[-1] / attr(X_scaled, "scaled:scale") # Coefficients (excluding intercept) cat("Coefficients:\n")
intercept_adjusted <- mean(y) - sum(beta_unscaled * attr(X_scaled, "scaled:center")) print(coefficients)
},
# Store coefficients # Method to predict new values
.self$coefficients <- as.numeric(beta_unscaled) # Convert matrix to numeric vector predict = function(newdata = NULL) {
.self$intercept <- intercept_adjusted if (is.null(newdata)) {
}, X <- model.matrix(formula, data)
} else {
# Method to print the model coefficients X <- model.matrix(formula, newdata)
show = function() { }
cat("Ridge Regression Coefficients:\n") X %*% c(intercept, coefficients)
cat("Intercept:", .self$intercept, "\n") },
cat("Coefficients:\n") # Method to return the coefficients
print(.self$coefficients) coef = function() {
}, c(Intercept = intercept, coefficients)
}
# Method to predict new values ))
predict = function(newdata = NULL) {
if (is.null(newdata)) {
X <- model.matrix(.self$formula, .self$data)
} else {
X <- model.matrix(.self$formula, newdata)
}
X %*% c(.self$intercept, .self$coefficients)
},
# Method to return the coefficients
coef = function() {
c(Intercept = .self$intercept, .self$coefficients)
}
))
# Example usage with mtcars dataset # Example usage with mtcars dataset
#mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1) #mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
#mod$show() #mod$show()
#mod$coef() #mod$coef()
#mod <- ridgereg$new(formula = Sepal.Length ~ Sepal.Width+Petal.Length, data = iris, lambda = 0.1) # mod <- ridgereg$new(formula = Sepal.Length ~ Sepal.Width+Petal.Length, data = iris, lambda = 0.1)
#mod$show() # Use 'show' instead of 'print' # mod$show() # Use 'show' instead of 'print'
#mod$predict(iris) # mod$predict(iris)
#mod$coef() # mod$coef()
install.packages(c("nycflights13", "dplyr", "ggplot2")) #' Visualize Mean Flight Delay by Airport
library(nycflights13) #'
#' This function visualizes the mean flight delay at each origin airport in the nycflights13 dataset.
#' It calculates the mean departure delay for each airport, joins this data with airport locations,
#' and creates a scatter plot showing mean delay as point size and color on a map of airport locations.
#'
#' @return A ggplot2 object representing the visualization of mean flight delays by airport.
#' @import dplyr
#' @import ggplot2
#' @import nycflights13
#' @param dep_delay' dep_delay
#' @param origin origin
#' @param lon lon
#' @param lat lat
#' @param mean_delay mean_dealy
#' @examples
#' # Run the function to visualize mean delays by airport:
#' visualize_airport_delays()
#' @name visualize_airport_delays
#' @export
library(dplyr) library(dplyr)
library(ggplot2)
visualize_airport_delays <- function() { visualize_airport_delays <- function() {
# Join flights and airports datasets to include latitude and longitude # Join flights and airports datasets to include latitude and longitude
airport_delays <- flights %>% airport_delays <- nycflights13::flights %>%
filter(!is.na(dep_delay)) %>% # Exclude rows where departure delay is NA dplyr::filter(!is.na(dep_delay)) %>% # Exclude rows where departure delay is NA
group_by(origin) %>% # Group by airport dplyr::group_by(origin) %>% # Group by airport
summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) %>% dplyr::summarize(mean_delay = mean(dep_delay, na.rm = TRUE)) %>%
left_join(airports, by = c("origin" = "faa")) # Join with airports for lat/lon data dplyr::left_join(nycflights13::airports, by = c("origin" = "faa")) # Join with airports for lat/lon data
# Create the ggplot2 plot # Create the ggplot2 plot
ggplot(airport_delays, aes(x = lon, y = lat)) + ggplot2::ggplot(airport_delays, ggplot2::aes(x = lon, y = lat)) +
geom_point(aes(size = mean_delay, color = mean_delay), alpha = 0.9) + ggplot2::geom_point(ggplot2::aes(size = mean_delay, color = mean_delay), alpha = 0.9) +
scale_color_gradient(low = "blue", high = "red") + ggplot2::scale_color_gradient(low = "blue", high = "red") +
labs(title = "Mean Flight Delay by Airport", ggplot2::labs(
x = "Longitude", title = "Mean Flight Delay by Airport",
y = "Latitude", x = "Longitude",
size = "Mean Delay (min)", y = "Latitude",
color = "Mean Delay (min)") + size = "Mean Delay (min)",
theme_minimal() color = "Mean Delay (min)"
) +
ggplot2::theme_minimal()
} }
# Call the function
visualize_airport_delays() visualize_airport_delays()
...@@ -34,7 +34,9 @@ For more information, see \url{https://en.wikipedia.org/wiki/Ridge_regression} ...@@ -34,7 +34,9 @@ For more information, see \url{https://en.wikipedia.org/wiki/Ridge_regression}
%% ~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~ %% ~~ \code{\link[<pkg>:<pkg>-package]{<pkg>}} ~~
} }
\examples{ \examples{
data(iris) data(iris)
model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1) model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris,lambda = 0.1)
%% ~~ Optional simple examples of the most important functions ~~ %% ~~ Optional simple examples of the most important functions ~~
} }
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/linreg.R
\docType{class}
\name{linreg-class}
\alias{linreg-class}
\alias{linreg}
\title{linreg Reference Class}
\arguments{
\item{formula}{A formula specifying the model.}
\item{data}{A data frame containing the variables in the formula.}
}
\description{
A reference class for performing linear regression using the least squares method.
Initialize the linreg class
}
\section{Fields}{
\describe{
\item{\code{formula_string}}{A character string representing the regression formula.}
\item{\code{data_string}}{A character string representing the data frame used.}
\item{\code{formula}}{A formula specifying the relationship between dependent and independent variables.}
\item{\code{data}}{A data frame containing the variables specified in the formula.}
\item{\code{coefficients}}{A matrix containing the estimated regression coefficients.}
\item{\code{fitted_values}}{A matrix of the fitted values (predicted values).}
\item{\code{residuals}}{A matrix of the residuals (difference between actual and fitted values).}
\item{\code{df}}{The degrees of freedom, calculated as the number of observations minus the number of predictors.}
\item{\code{residual_variance}}{The estimated residual variance.}
\item{\code{var_coefficients}}{A matrix representing the variance of the coefficients.}
\item{\code{t_values}}{A matrix of t-values for the estimated coefficients.}
\item{\code{p_values}}{A matrix of p-values for the t-tests of the coefficients.}
}}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/visualize_airport_delay_dplyr.R
\name{visualize_airport_delays}
\alias{visualize_airport_delays}
\title{Visualize Mean Flight Delay by Airport}
\arguments{
\item{dep_delay'}{dep_delay}
\item{origin}{origin}
\item{lon}{lon}
\item{lat}{lat}
\item{mean_delay}{mean_dealy}
}
\value{
A ggplot2 object representing the visualization of mean flight delays by airport.
}
\description{
This function visualizes the mean flight delay at each origin airport in the nycflights13 dataset.
It calculates the mean departure delay for each airport, joins this data with airport locations,
and creates a scatter plot showing mean delay as point size and color on a map of airport locations.
}
\examples{
# Run the function to visualize mean delays by airport:
visualize_airport_delays()
}
data("iris")
Polygon <- setRefClass("Polygon", fields = c("sides"))
square <- Polygon$new(sides = 4)
test_that("lenreg rejects errounous input", {
expect_error(linreg_mod <- linreg$new(formula = Petal.Length~Sepdsal.Width+Sepal.Length, data=iris))
expect_error(linreg_mod <- linreg$new(formula = Petal.Length~Sepdsal.Width+Sepal.Length, data=irfsfdis))
})
test_that("class is correct", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_true(class(linreg_mod)[1] == "linreg")
})
test_that("print() method works", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_output(linreg_mod$print(),"linreg\\(formula = Petal\\.Length ~ Sepal\\.Width \\+ Sepal\\.Length, data = iris\\)")
expect_output(linreg_mod$print(),"( )*\\(Intercept\\)( )*Sepal\\.Width( )*Sepal\\.Length")
})
test_that("pred() method works", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_equal(round(unname(linreg_mod$pred()[c(1,5,7)]),2), c(1.85, 1.53, 1.09))
})
test_that("resid() method works", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_equal(round(unname(linreg_mod$resid()[c(7,13,27)]),2), c(0.31, -0.58, -0.20))
})
test_that("coef() method works", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_true(all(round(unname(linreg_mod$coef()),2) %in% c(-2.52, -1.34, 1.78)))
})
test_that("summary() method works", {
linreg_mod <- linreg$new(Petal.Length~Sepal.Width+Sepal.Length, data=iris)
expect_output(linreg_mod$summary(), "\\(Intercept\\)( )*-2.5[0-9]*( )*0.5[0-9]*( )*-4.4[0-9]*( )*.*( )*\\*\\*\\*")
expect_output(linreg_mod$summary(), "Sepal.Width( )*-1.3[0-9]*( )*0.1[0-9]*( )*-10.9[0-9]*( )*.*( )*\\*\\*\\*")
expect_output(linreg_mod$summary(), "Sepal.Length( )*1.7[0-9]*( )*0.0[0-9]*( )*27.5[0-9]*( )*.*( )*\\*\\*\\*")
expect_output(linreg_mod$summary(), "Residual standard error: 0.6[0-9]* on 147 degrees of freedom")
})
library(testthat) library(testthat)
test_that("ridgereg initializes correctly", {
model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1)
# Check if coefficients are initialized correctly test_that("ridgereg class tests for initialize and coefficient calculation", {
expect_true(is.matrix(model$coefficients)) mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
expect_equal(ncol(model$coefficients), 1) expect_true(!is.null(mod$coefficients))
expect_true(!is.null(mod$intercept))
# Check if fitted values and residuals are matrices })
expect_true(is.matrix(model$fitted_values))
expect_true(is.matrix(model$residuals)) test_that("ridgereg class tests for coef method returns correct output", {
}) mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
coefs <- mod$coef()
test_that("ridgereg predicts correctly", { expect_type(coefs, "double")
model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1) expect_true("Intercept" %in% names(coefs))
})
# Test if predictions return a numeric vector
preds <- model$pred() test_that("ridgereg class tests for predict method works correctly", {
expect_true(is.numeric(preds)) mod <- ridgereg$new(formula = mpg ~ cyl + disp, data = mtcars, lambda = 0.1)
expect_equal(length(preds), nrow(iris)) preds <- mod$predict()
}) expect_equal(length(preds), nrow(mtcars))
expect_type(preds, "double")
test_that("ridgereg coefficients extraction works", { })
model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1)
# Test if coefficients are returned as a named numeric vector
coefs <- model$coef()
expect_true(is.numeric(coefs))
expect_true(!is.null(names(coefs)))
})
test_that("ridgereg print method displays correctly", {
model <- ridgereg$new(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris, lambda = 0.1)
# Capture the print output
output <- capture.output(model$print())
# Check if the output contains relevant information
expect_true(any(grepl("Call:", output)))
expect_true(any(grepl("Coefficients:", output)))
expect_true(any(grepl("Sepal.Width", output)))
})
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment