-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistical_analysis_cvd_project.R
96 lines (68 loc) · 2.89 KB
/
statistical_analysis_cvd_project.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Setup: locate the data, attach packages, and load the CVD dataset ----

# Report the current working directory. It is no longer changed with setwd(),
# so running this script leaves the R session where it started.
getwd()

# Directory that holds the input CSV. The default is the folder the script was
# originally written for; set the CVD_DATA_DIR environment variable to point
# at a different folder when running on another machine.
data_dir <- Sys.getenv("CVD_DATA_DIR", unset = "")
if (!nzchar(data_dir)) {
  data_dir <- "C:\\Users\\User\\OneDrive\\Documents\\Cardiohealthpython"
}
# List the contents of the data directory
dir(data_dir)

# Load necessary libraries
library(dplyr)
library(ggplot2)
library(stats)

# dplyr masks stats::filter(), stats::lag() and the base set operations
# (intersect, setdiff, setequal, union). Evaluating a name such as
# `stats::filter` on its own line only prints that function; it does not
# change which version a bare call resolves to. Use the `pkg::fun()` form at
# the call site when the masked version is wanted. conflicts() lists every
# masked name for reference.
conflicts()

# Read the dataset, stopping with a clear message if the file is not there
data_file <- file.path(data_dir, "updated_cardio_health_multi1.csv")
if (!file.exists(data_file)) {
  stop("Data file not found: ", data_file, call. = FALSE)
}
cvd_dataset <- read.csv(data_file)
head(cvd_dataset)

# read.csv() already returns a data frame, so no conversion is needed;
# `df` is the name the rest of the analysis uses.
df <- cvd_dataset

# Fail early if any column used by the tests and plots below is absent.
required_cols <- c(
  "cardio_condition", "smoker", "alcohol", "active",
  "glucose_levels", "cholesterol_levels", "body_mass_index"
)
missing_cols <- base::setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop(
    "Dataset is missing required column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
# 1. Do CVD patients and people without CVD differ in their lifestyle habits
#    (smoking, drinking, activity), glucose levels, and cholesterol levels?
#
# Each contingency table is built once with named dimensions, so the printed
# table shows that cardio_condition is on the rows and the other variable is
# on the columns. The same table is then passed to chisq.test(), a
# chi-squared test of independence. When a table is 2 x 2, chisq.test()
# applies Yates' continuity correction by default.

# a. Smoking
cvd_by_smoker <- table(
  cardio_condition = df$cardio_condition,
  smoker = df$smoker
)
cvd_by_smoker
chisq.test(cvd_by_smoker)

# b. Drinking
cvd_by_alcohol <- table(
  cardio_condition = df$cardio_condition,
  alcohol = df$alcohol
)
cvd_by_alcohol
chisq.test(cvd_by_alcohol)

# c. Activity
cvd_by_active <- table(
  cardio_condition = df$cardio_condition,
  active = df$active
)
cvd_by_active
chisq.test(cvd_by_active)

# d. Glucose Levels
cvd_by_glucose <- table(
  cardio_condition = df$cardio_condition,
  glucose_levels = df$glucose_levels
)
cvd_by_glucose
chisq.test(cvd_by_glucose)

# e. Cholesterol Levels
cvd_by_cholesterol <- table(
  cardio_condition = df$cardio_condition,
  cholesterol_levels = df$cholesterol_levels
)
cvd_by_cholesterol
chisq.test(cvd_by_cholesterol)
# 2. Is there a significant association between the lifestyle habits?
#
# chisq.test() on a two-way table tests whether the two habits are
# independent of each other (i.e. whether they are associated), not whether
# one habit is more common than the other. Each table is built once with
# named dimensions so the printed rows and columns are labelled.

# a. Smoking vs. Drinking
smoker_by_alcohol <- table(smoker = df$smoker, alcohol = df$alcohol)
smoker_by_alcohol
chisq.test(smoker_by_alcohol)

# b. Smoking vs. Activity
smoker_by_active <- table(smoker = df$smoker, active = df$active)
smoker_by_active
chisq.test(smoker_by_active)

# c. Drinking vs. Activity
alcohol_by_active <- table(alcohol = df$alcohol, active = df$active)
alcohol_by_active
chisq.test(alcohol_by_active)
# 3. Is body_mass_index different between people with and without CVD?
# Two-sample t-test of body_mass_index split by cardio_condition. The extra
# arguments are t.test()'s defaults written out for the reader: a two-sided
# alternative, Welch's unequal-variance form, and a 95% confidence interval.
t.test(
  body_mass_index ~ cardio_condition,
  data = df,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
# Visualizations
# Boxplot of body_mass_index for each cardio_condition group.
# cardio_condition is wrapped in factor() so ggplot2 always treats it as a
# discrete grouping: if the column was read from the CSV as a numeric code
# (NOTE(review): column type not visible here; confirm), a raw mapping would
# be treated as continuous and would not produce one box per group.
# factor() is harmless if the column is already character or factor.
ggplot(
  df,
  aes(
    x = factor(cardio_condition),
    y = body_mass_index,
    fill = factor(cardio_condition)
  )
) +
  geom_boxplot() +
  labs(
    title = "BMI by CVD Status",
    x = "Presence of CVD",
    y = "Body Mass Index",
    # Explicit legend title; otherwise it would read "factor(cardio_condition)"
    fill = "Presence of CVD"
  )
# Proportional stacked bar plots: within each cardio_condition group, the
# share of observations in each level of a lifestyle variable
# (position = "fill" rescales every bar to height 1).
# Both the x and fill variables are wrapped in factor() so ggplot2 treats
# them as discrete even if they were read from the CSV as numeric codes
# (NOTE(review): column types not visible here; confirm). A continuous fill
# variable would not split the bars into groups.

# cardio_condition vs smoker
ggplot(df, aes(x = factor(cardio_condition), fill = factor(smoker))) +
  geom_bar(position = "fill") +
  labs(
    title = "Smoking by CVD Status",
    x = "Presence of CVD",
    y = "Proportion",
    fill = "Smoker"
  )

# cardio_condition vs alcohol
ggplot(df, aes(x = factor(cardio_condition), fill = factor(alcohol))) +
  geom_bar(position = "fill") +
  labs(
    title = "Alcohol Consumption by CVD Status",
    x = "Presence of CVD",
    y = "Proportion",
    fill = "Alcohol"
  )

# cardio_condition vs active
ggplot(df, aes(x = factor(cardio_condition), fill = factor(active))) +
  geom_bar(position = "fill") +
  labs(
    title = "Physical Activity by CVD Status",
    x = "Presence of CVD",
    y = "Proportion",
    fill = "Physical Activity"
  )
# Interpretation
# A p-value less than 0.05 indicates a statistically significant difference between the groups.