-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregression.Rmd
175 lines (135 loc) · 4.08 KB
/
regression.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
---
title: "DATA 448 23S2_Assignment1"
output: pdf_document
date: "2023-07-27"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## 1.c Calculate the coefficients
```{r}
Y <- c(6, 12, 18)
X <- c(4, 5, 6)
# Create the design matrix X with an additional column of ones for the intercept
X_matrix <- cbind(1, X)
# Calculate X^T * X
XTX <- t(X_matrix) %*% X_matrix
# Calculate the inverse of X^T * X
XTX_inv <- solve(XTX)
# Calculate X^T * Y
XTY <- t(X_matrix) %*% Y
# Calculate the coefficients β
beta <- XTX_inv %*% XTY
beta
```
## 1.c calculate the residuals
```{r}
# Calculate the predicted values Ŷ
Y_hat <- X_matrix %*% beta
# Calculate the residuals
residuals <- Y - Y_hat
residuals
```
## 1.d Estimate the coefficients using the function lm in R
```{r}
# Given data
Y <- c(6, 12, 18)
X <- c(4, 5, 6)
# Create a data frame with the data
data_df <- data.frame(X = X, Y = Y)
# Fit the linear model using lm
model <- lm(Y ~ X, data = data_df)
# Print the summary of the model
model_summary <- summary(model)
model_summary
```
## 3 generate a simple linear regression model on happy.csv
```{r}
library(readr)
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)
model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
```
## plot the residual plot
```{r}
library(ggplot2)
# Obtain the residuals from the model
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(data, aes(x = fitted(model), y = residuals)) +
geom_point() +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
```
## Plot the obeservation and regression line with lable
```{r}
# Plot the observations
par(mfrow=c(1, 1))
plot <- ggplot(data, aes(x = income, y = happiness)) +
geom_point(color = "blue") +
labs(x = "Income", y = "Happiness") +
ggtitle("Scatter Plot of Income vs. Happiness")
# Add the regression line and equation
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red") +
annotate(
"text", x = max(data$income) - 1, y = max(data$happiness) - 1,
label = paste("Happiness ≈", round(coefficients(model)[1], 3),
"+", round(coefficients(model)[2], 3), "* Income"),
color = "black", size = 4, hjust = 2
)
# Display the plot
print(plot_with_line)
```
## Explore the simple linear regression on calcofi.csv file
```{r}
library(dplyr)
data <- read_csv("calcofi.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("temp", "salnty")
head(data)
# Remove missing values if any
data <- na.omit(data)
# set seed for random sampling
set.seed(1)
sample_size <- round(0.1 * nrow(data))
sample_data <- data %>% sample_n(sample_size)
# Conduct the linear regression
model <- lm(salnty ~ temp, data = sample_data)
# Get the model summary
summary(model)
# Create a scatter plot of the data points
plot <- ggplot(sample_data, aes(x = temp, y = salnty)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Temperature (°C)", y = "Salinity") +
ggtitle("Scatter Plot of Salinity vs.Temperature ")
# Add the regression line to the plot
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red", size = 1)
# Display the plot
print(plot_with_line)
```
## Predict water temperature for the given values
```{r}
new_data <- data.frame(temp = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
prediction_table <- data.frame(Water_Temperature = new_data$temp, Predicted_Salinity = round(predictions, 2))
knitr::kable(prediction_table, caption = "Predictions for Water Temperature")
```
## Obtain the residuals from the model
```{r}
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(sample_data, aes(x = fitted(model), y = residuals)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
```