regression.Rmd

---
title: "DATA 448 23S2_Assignment1"
output: pdf_document
date: "2023-07-27"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## 1.c Calculate the coefficients

```{r}
Y <- c(6, 12, 18)
X <- c(4, 5, 6)

# Create the design matrix X with an additional column of ones for the intercept
X_matrix <- cbind(1, X)

# Calculate X^T * X
XTX <- t(X_matrix) %*% X_matrix

# Calculate the inverse of X^T * X
XTX_inv <- solve(XTX)

# Calculate X^T * Y
XTY <- t(X_matrix) %*% Y

# Calculate the coefficients β
beta <- XTX_inv %*% XTY

beta

```

## 1.c calculate the residuals

```{r}
# Calculate the predicted values Ŷ
Y_hat <- X_matrix %*% beta

# Calculate the residuals
residuals <- Y - Y_hat

residuals
```
## 1.d Estimate the coefficients using the function lm in R

```{r}
# Given data
Y <- c(6, 12, 18)
X <- c(4, 5, 6)

# Create a data frame with the data
data_df <- data.frame(X = X, Y = Y)

# Fit the linear model using lm
model <- lm(Y ~ X, data = data_df)

# Print the summary of the model
model_summary <- summary(model)
model_summary
```
## 3 generate a simple linear regression model on happy.csv
```{r}
library(readr) 
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)

model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
```
## plot the residual plot
```{r}
library(ggplot2)

# Obtain the residuals from the model
residuals <- resid(model)

# Create scatter plot of residuals vs. fitted values
ggplot(data, aes(x = fitted(model), y = residuals)) +
  geom_point() +
  labs(x = "Fitted Values", y = "Residuals") +
  ggtitle("Residuals vs. Fitted Values")

# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)

```
## Plot the obeservation and regression line with lable
```{r}
# Plot the observations
par(mfrow=c(1, 1))
plot <- ggplot(data, aes(x = income, y = happiness)) +
  geom_point(color = "blue") +
  labs(x = "Income", y = "Happiness") +
  ggtitle("Scatter Plot of Income vs. Happiness")

# Add the regression line and equation
plot_with_line <- plot +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  annotate(
    "text", x = max(data$income) - 1, y = max(data$happiness) - 1,
    label = paste("Happiness ≈", round(coefficients(model)[1], 3),
                  "+", round(coefficients(model)[2], 3), "* Income"),
    color = "black", size = 4, hjust = 2
  )

# Display the plot
print(plot_with_line)

```
## Explore the simple linear regression on calcofi.csv file
```{r}
library(dplyr)
data <- read_csv("calcofi.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("temp", "salnty")
head(data)

# Remove missing values if any
data <- na.omit(data)

# set seed for random sampling
set.seed(1)
sample_size <- round(0.1 * nrow(data))
sample_data <- data %>% sample_n(sample_size)

# Conduct the linear regression
model <- lm(salnty ~ temp, data = sample_data)

# Get the model summary
summary(model)

# Create a scatter plot of the data points
plot <- ggplot(sample_data, aes(x = temp, y = salnty)) +
  geom_point(color = "blue", size = 0.5) +
  labs(x = "Temperature (°C)", y = "Salinity") +
  ggtitle("Scatter Plot of Salinity vs.Temperature ")

# Add the regression line to the plot
plot_with_line <- plot +
  geom_smooth(method = "lm", se = FALSE, color = "red", size = 1)

# Display the plot
print(plot_with_line)
```

## Predict water temperature for the given values
```{r}
new_data <- data.frame(temp = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
prediction_table <- data.frame(Water_Temperature = new_data$temp, Predicted_Salinity = round(predictions, 2))
knitr::kable(prediction_table, caption = "Predictions for Water Temperature")
```

## Obtain the residuals from the model
```{r}
residuals <- resid(model)


# Create scatter plot of residuals vs. fitted values
ggplot(sample_data, aes(x = fitted(model), y = residuals)) +
  geom_point(color = "blue", size = 0.5) +
  labs(x = "Fitted Values", y = "Residuals") +
  ggtitle("Residuals vs. Fitted Values")

# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)


```