-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdistributionAnalysisFunction.R
More file actions
122 lines (108 loc) · 4.48 KB
/
distributionAnalysisFunction.R
File metadata and controls
122 lines (108 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# --------------------------------------------------------------
# ---------------- Distribution Analysis Function --------------
# --------------------------------------------------------------
distribution <- function(data, iterations = 1000, imputation = c("mean", "median", "none"), alpha = 0.05, baseR = FALSE){
library(PerformanceAnalytics)
library(ggplot2)
library(patchwork)
# -- Imputation --
if(imputation == "mean"){
mean <- mean(data, na.rm = TRUE)
}else if(imputation == "median"){
median <- median(data, na.rm = TRUE)
}else{
data <- na.omit(data)
}
df <- data.frame(data) # Create Dataframe for ggplot
# -- Monte Carlo Simulation for Sample Means --
means <- numeric(iterations)
stdev <- numeric(iterations)
sampleSizes <- length(data) * seq(0.05, 0.5, 0.05) # Sample Sizes
for(i in 1:iterations){
groups <- sample(x = data,
size = sample(sampleSizes, 1),
replace = TRUE)
means[i] <- mean(groups)
stdev[i] <- sd(groups)
}
df1 <- data.frame(means, stdev)
# Histogram of Means
pm <- ggplot(df1, aes(x = means)) +
geom_histogram(fill = "lightblue", color = "black", bins = 30) +
labs(title = "Histogram of Sample Means", x = "Value", y = "Frequency") +
theme_minimal()
plot(pm) # Plot
#-------------------------------------------------------------------------
# ------------------------ Plots for ggplot ------------------------------
# ------------------------------------------------------------------------
if(baseR == FALSE){
# Histogram
p1 <- ggplot(df, aes(x = data)) +
geom_histogram(fill = "lightblue", color = "black", bins = 30) +
labs(title = "Histogram", x = "Value", y = "Frequency") +
theme_minimal()
# KDE
p2 <- ggplot(df, aes(x = data)) +
geom_density(fill = "lightblue", color = "black", alpha = 0.6) +
labs(title = "Kernel Density Estimate", x = "Value", y = "Density") +
theme_minimal()
# Boxplot
p3 <- ggplot(df, aes(y = data)) +
geom_boxplot(fill = "lightblue", color = "black") +
labs(title = "Boxplot", y = "Value") +
theme_minimal()
# Boxplot Without Outliers
p4 <- ggplot(df, aes(y = data)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.shape = NA) +
labs(title = "Boxplot (No Outliers)", y = "Value") +
theme_minimal()
# QQ plot
p5 <- ggplot(df, aes(sample = data)) +
stat_qq(color = "lightblue") +
stat_qq_line(color = "black", linetype = "dashed") +
labs(title = "QQ Plot", x = "Theoretical Quantiles", y = "Sample Quantiles") +
theme_minimal()
# ECDF
p6 <- ggplot(df, aes(x = data)) +
stat_ecdf(geom = "step", color = "lightblue") +
labs(title = "Empirical CDF", x = "Value", y = "F(x)") +
theme_minimal()
# Arrange Plots
plot_grid <- (p1 | p2) /
(p3 | p4) /
(p5 | p6)
print(plot_grid)
}
# -----------------------------------------------------------------------
# ---------------------------- Plots for Base R -------------------------
# -----------------------------------------------------------------------
if(baseR == TRUE){
par(mfrow = c(2,2))
# Histogram
hist(data, main = "Histogram", col = "lightblue")
abline(v = mean(data), lty = 2, col = "red")
abline(v = median(data), lty = 2, col = "blue")
# CDF
plot(ecdf(data))
# Boxplot
boxplot(data, main = "Boxplot", col = "lightblue")
# Kernel Density
plot(density(data), main = "Kernel Density Function")
}
# SW Test
test <- shapiro.test(data)
# ----------------------------------------------------------------------
# ----------------------- Summary Statistics ---------------------------
# ----------------------------------------------------------------------
cat("----- Distribution Results -----", "\n",
"Mean: ", mean(data), "\n",
"Median: ", median(data), "\n",
"Skewness: ", skewness(data), "\n",
"Results of S-W Test: ", ifelse(test$p.value > 0.05, "Probably Normal","Probably not Normal"), "\n",
"Standard Deviation of Sample Means: ", sd(stdev), "\n",
"--------------------------------")
}
# --------------------------------------------
# ----------------- Demo ---------------------
# --------------------------------------------
#distribution(rnorm(200), imputation = "None")