pushing final oblig
This commit is contained in:
394
Oblig/3c/oblig3c_analysis.R
Normal file
394
Oblig/3c/oblig3c_analysis.R
Normal file
@@ -0,0 +1,394 @@
|
||||
# Oblig 3c: Linear regression with uncertainty
|
||||
|
||||
options(stringsAsFactors = FALSE)
|
||||
|
||||
script_path_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
|
||||
script_dir <- if (length(script_path_arg) > 0) {
|
||||
dirname(normalizePath(sub("^--file=", "", script_path_arg[1])))
|
||||
} else {
|
||||
getwd()
|
||||
}
|
||||
|
||||
output_dir <- file.path(script_dir, "output")
|
||||
if (!dir.exists(output_dir)) {
|
||||
dir.create(output_dir, recursive = TRUE)
|
||||
}
|
||||
|
||||
# -----------------------------
|
||||
# Generic helper functions
|
||||
# -----------------------------
|
||||
|
||||
strip_bom <- function(x) {
|
||||
sub("^\ufeff", "", x)
|
||||
}
|
||||
|
||||
standardize_name <- function(x) {
|
||||
cleaned <- tolower(strip_bom(iconv(x, to = "ASCII//TRANSLIT")))
|
||||
cleaned <- gsub("[^a-z0-9]+", "", cleaned)
|
||||
cleaned
|
||||
}
|
||||
|
||||
read_dice_file <- function(path) {
|
||||
raw_df <- read.csv(path, check.names = FALSE, fileEncoding = "UTF-8-BOM")
|
||||
raw_df <- raw_df[, colSums(!is.na(raw_df)) > 0, drop = FALSE]
|
||||
|
||||
original_names <- names(raw_df)
|
||||
clean_names <- vapply(original_names, standardize_name, character(1))
|
||||
|
||||
rename_map <- c(
|
||||
"k" = "k",
|
||||
"dropp" = "x",
|
||||
"dropphoyde" = "x",
|
||||
"x" = "x",
|
||||
"lengde" = "y",
|
||||
"sprettlengde" = "y",
|
||||
"y" = "y",
|
||||
"verdi" = "z",
|
||||
"terningverdi" = "z",
|
||||
"z" = "z",
|
||||
"tid" = "t",
|
||||
"t" = "t"
|
||||
)
|
||||
|
||||
mapped_names <- rename_map[clean_names]
|
||||
names(raw_df) <- ifelse(is.na(mapped_names), clean_names, mapped_names)
|
||||
|
||||
keep <- intersect(c("k", "x", "y", "z", "t"), names(raw_df))
|
||||
out <- raw_df[, keep, drop = FALSE]
|
||||
out$source_file <- basename(path)
|
||||
|
||||
for (name in setdiff(names(out), "source_file")) {
|
||||
out[[name]] <- suppressWarnings(as.numeric(out[[name]]))
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
read_all_dice_data <- function(folder) {
|
||||
files <- list.files(folder, pattern = "\\.csv$", full.names = TRUE)
|
||||
df_list <- lapply(files, read_dice_file)
|
||||
combined <- do.call(rbind, df_list)
|
||||
rownames(combined) <- NULL
|
||||
combined
|
||||
}
|
||||
|
||||
fit_simple_regression <- function(x, y, nu0 = -2, SS0 = 0) {
|
||||
stopifnot(length(x) == length(y))
|
||||
|
||||
n <- length(x)
|
||||
x_bar <- mean(x)
|
||||
x_centered <- x - x_bar
|
||||
SSx <- sum(x_centered^2)
|
||||
|
||||
beta_hat <- sum(x_centered * y) / SSx
|
||||
alpha_star <- mean(y)
|
||||
alpha0 <- alpha_star - beta_hat * x_bar
|
||||
|
||||
fitted <- alpha0 + beta_hat * x
|
||||
residuals <- y - fitted
|
||||
SSe <- sum(residuals^2)
|
||||
|
||||
nu1 <- nu0 + n
|
||||
SS1 <- SS0 + SSe
|
||||
s1 <- sqrt(SS1 / nu1)
|
||||
|
||||
list(
|
||||
n = n,
|
||||
x = x,
|
||||
y = y,
|
||||
x_bar = x_bar,
|
||||
SSx = SSx,
|
||||
alpha0 = alpha0,
|
||||
alpha_star = alpha_star,
|
||||
beta = beta_hat,
|
||||
fitted = fitted,
|
||||
residuals = residuals,
|
||||
SSe = SSe,
|
||||
nu0 = nu0,
|
||||
SS0 = SS0,
|
||||
nu1 = nu1,
|
||||
SS1 = SS1,
|
||||
s1 = s1
|
||||
)
|
||||
}
|
||||
|
||||
tau_posterior_parameters <- function(fit) {
|
||||
list(shape = fit$nu1 / 2, rate = fit$SS1 / 2)
|
||||
}
|
||||
|
||||
sigma_interval <- function(fit, level = 0.80) {
|
||||
alpha <- 1 - level
|
||||
lower_var <- fit$SS1 / qchisq(1 - alpha / 2, df = fit$nu1)
|
||||
upper_var <- fit$SS1 / qchisq(alpha / 2, df = fit$nu1)
|
||||
c(lower = sqrt(lower_var), upper = sqrt(upper_var))
|
||||
}
|
||||
|
||||
b_interval <- function(fit, level = 0.80) {
|
||||
alpha <- 1 - level
|
||||
t_crit <- qt(1 - alpha / 2, df = fit$nu1)
|
||||
margin <- t_crit * fit$s1 / sqrt(fit$SSx)
|
||||
c(lower = fit$beta - margin, upper = fit$beta + margin)
|
||||
}
|
||||
|
||||
y_posterior_parameters <- function(fit, x_new) {
|
||||
scale <- fit$s1 * sqrt(1 / fit$n + (x_new - fit$x_bar)^2 / fit$SSx)
|
||||
list(mean = fit$alpha0 + fit$beta * x_new, scale = scale, df = fit$nu1)
|
||||
}
|
||||
|
||||
y_predictive_parameters <- function(fit, x_new) {
|
||||
scale <- fit$s1 * sqrt(1 + 1 / fit$n + (x_new - fit$x_bar)^2 / fit$SSx)
|
||||
list(mean = fit$alpha0 + fit$beta * x_new, scale = scale, df = fit$nu1)
|
||||
}
|
||||
|
||||
t_interval <- function(mean, scale, df, level) {
|
||||
alpha <- 1 - level
|
||||
t_crit <- qt(1 - alpha / 2, df = df)
|
||||
c(lower = mean - t_crit * scale, upper = mean + t_crit * scale)
|
||||
}
|
||||
|
||||
credible_band <- function(fit, x_grid, level = 0.80) {
|
||||
alpha <- 1 - level
|
||||
t_crit <- qt(1 - alpha / 2, df = fit$nu1)
|
||||
mean_curve <- fit$alpha0 + fit$beta * x_grid
|
||||
margin <- t_crit * fit$s1 * sqrt(1 / fit$n + (x_grid - fit$x_bar)^2 / fit$SSx)
|
||||
data.frame(x = x_grid, mean = mean_curve, lower = mean_curve - margin, upper = mean_curve + margin)
|
||||
}
|
||||
|
||||
predictive_band <- function(fit, x_grid, level = 0.80) {
|
||||
alpha <- 1 - level
|
||||
t_crit <- qt(1 - alpha / 2, df = fit$nu1)
|
||||
mean_curve <- fit$alpha0 + fit$beta * x_grid
|
||||
margin <- t_crit * fit$s1 * sqrt(1 + 1 / fit$n + (x_grid - fit$x_bar)^2 / fit$SSx)
|
||||
data.frame(x = x_grid, mean = mean_curve, lower = mean_curve - margin, upper = mean_curve + margin)
|
||||
}
|
||||
|
||||
r_squared <- function(x, y) {
|
||||
fit <- lm(y ~ x)
|
||||
summary(fit)$r.squared
|
||||
}
|
||||
|
||||
print_regression_summary <- function(label, fit, x_eval = NULL, level_y = 0.80, level_pred = 0.80) {
|
||||
tau_post <- tau_posterior_parameters(fit)
|
||||
|
||||
cat("\n", label, "\n", sep = "")
|
||||
cat(strrep("-", nchar(label)), "\n", sep = "")
|
||||
cat("n =", fit$n, "\n")
|
||||
cat("x_bar =", fit$x_bar, "\n")
|
||||
cat("alpha0 =", fit$alpha0, "\n")
|
||||
cat("alpha* =", fit$alpha_star, "\n")
|
||||
cat("beta =", fit$beta, "\n")
|
||||
cat("SSe =", fit$SSe, "\n")
|
||||
cat("Posterior tau ~ Gamma(shape =", tau_post$shape, ", rate =", tau_post$rate, ")\n")
|
||||
|
||||
if (!is.null(x_eval)) {
|
||||
y_post <- y_posterior_parameters(fit, x_eval)
|
||||
y_pred <- y_predictive_parameters(fit, x_eval)
|
||||
|
||||
cat("Posterior y(", x_eval, ") ~ t(mean =", y_post$mean, ", scale =", y_post$scale, ", df =", y_post$df, ")\n", sep = "")
|
||||
cat("Predictive Y+(", x_eval, ") ~ t(mean =", y_pred$mean, ", scale =", y_pred$scale, ", df =", y_pred$df, ")\n", sep = "")
|
||||
cat(level_y * 100, "% credible interval for y(", x_eval, "): ", paste(round(t_interval(y_post$mean, y_post$scale, y_post$df, level_y), 4), collapse = " to "), "\n", sep = "")
|
||||
cat(level_pred * 100, "% predictive interval for Y+(", x_eval, "): ", paste(round(t_interval(y_pred$mean, y_pred$scale, y_pred$df, level_pred), 4), collapse = " to "), "\n", sep = "")
|
||||
}
|
||||
}
|
||||
|
||||
plot_regression_with_band <- function(df, fit, band_df, file_name, ylab, band_label) {
|
||||
png(file.path(output_dir, file_name), width = 1200, height = 900, res = 150)
|
||||
plot(df$x, df$y,
|
||||
pch = 19, col = "black",
|
||||
xlab = "x", ylab = ylab,
|
||||
main = band_label)
|
||||
lines(band_df$x, band_df$mean, col = "blue", lwd = 2)
|
||||
lines(band_df$x, band_df$lower, col = "red", lwd = 2, lty = 2)
|
||||
lines(band_df$x, band_df$upper, col = "red", lwd = 2, lty = 2)
|
||||
dev.off()
|
||||
}
|
||||
|
||||
plot_regression_only <- function(df, fit, file_name, ylab, plot_title) {
|
||||
x_grid <- seq(min(df$x), max(df$x), length.out = 300)
|
||||
png(file.path(output_dir, file_name), width = 1200, height = 900, res = 150)
|
||||
plot(df$x, df$y,
|
||||
pch = 19, col = "black",
|
||||
xlab = "Dropphoyde", ylab = ylab,
|
||||
main = plot_title)
|
||||
lines(x_grid, fit$alpha0 + fit$beta * x_grid, col = "blue", lwd = 2)
|
||||
dev.off()
|
||||
}
|
||||
|
||||
plot_many_sample_lines <- function(df, sample_size, rounds, file_name) {
|
||||
x_grid <- seq(min(df$x), max(df$x), length.out = 200)
|
||||
full_fit <- fit_simple_regression(df$x, df$y)
|
||||
|
||||
png(file.path(output_dir, file_name), width = 1200, height = 900, res = 150)
|
||||
plot(df$x, df$y,
|
||||
pch = 19, col = "grey60",
|
||||
xlab = "Dropphoyde", ylab = "Sprettlengde",
|
||||
main = paste("Regression lines from", rounds, "random samples of size", sample_size))
|
||||
|
||||
for (i in seq_len(rounds)) {
|
||||
sample_index <- sort(sample(seq_len(nrow(df)), sample_size))
|
||||
sample_fit <- fit_simple_regression(df$x[sample_index], df$y[sample_index])
|
||||
y_grid <- sample_fit$alpha0 + sample_fit$beta * x_grid
|
||||
lines(x_grid, y_grid, col = rgb(1, 0, 0, alpha = 0.18), lwd = 1)
|
||||
}
|
||||
|
||||
lines(x_grid, full_fit$alpha0 + full_fit$beta * x_grid, col = "blue", lwd = 3)
|
||||
dev.off()
|
||||
}
|
||||
|
||||
plot_many_b_intervals <- function(df, sample_size, rounds, level = 0.80, file_name) {
|
||||
png(file.path(output_dir, file_name), width = 1200, height = 900, res = 150)
|
||||
plot(NA,
|
||||
xlim = c(0.5, rounds + 0.5),
|
||||
ylim = range(vapply(seq_len(rounds), function(i) {
|
||||
sample_index <- sort(sample(seq_len(nrow(df)), sample_size))
|
||||
fit <- fit_simple_regression(df$x[sample_index], df$y[sample_index])
|
||||
b_interval(fit, level)
|
||||
}, numeric(2))),
|
||||
xlab = "Runde", ylab = "Stigningstall b",
|
||||
main = paste(round(level * 100), "% credible intervals for b,", "N =", sample_size))
|
||||
|
||||
for (i in seq_len(rounds)) {
|
||||
sample_index <- sort(sample(seq_len(nrow(df)), sample_size))
|
||||
fit <- fit_simple_regression(df$x[sample_index], df$y[sample_index])
|
||||
interval <- b_interval(fit, level)
|
||||
segments(i, interval["lower"], i, interval["upper"], col = "red", lwd = 2)
|
||||
points(i, fit$beta, pch = 19, col = "blue")
|
||||
}
|
||||
|
||||
abline(h = fit_simple_regression(df$x, df$y)$beta, col = "darkgreen", lwd = 2, lty = 2)
|
||||
dev.off()
|
||||
}
|
||||
|
||||
plot_many_credible_bands <- function(df, sample_size, rounds, level = 0.80, file_name) {
|
||||
x_grid <- seq(min(df$x), max(df$x), length.out = 200)
|
||||
|
||||
png(file.path(output_dir, file_name), width = 1200, height = 900, res = 150)
|
||||
plot(df$x, df$y,
|
||||
pch = 19, col = "grey70",
|
||||
xlab = "Dropphoyde", ylab = "Sprettlengde",
|
||||
main = paste(round(level * 100), "% credible bands for y(x), N =", sample_size))
|
||||
|
||||
for (i in seq_len(rounds)) {
|
||||
sample_index <- sort(sample(seq_len(nrow(df)), sample_size))
|
||||
fit <- fit_simple_regression(df$x[sample_index], df$y[sample_index])
|
||||
band <- credible_band(fit, x_grid, level)
|
||||
lines(band$x, band$lower, col = rgb(1, 0, 0, alpha = 0.12), lwd = 1)
|
||||
lines(band$x, band$upper, col = rgb(1, 0, 0, alpha = 0.12), lwd = 1)
|
||||
}
|
||||
|
||||
full_fit <- fit_simple_regression(df$x, df$y)
|
||||
lines(x_grid, full_fit$alpha0 + full_fit$beta * x_grid, col = "blue", lwd = 3)
|
||||
dev.off()
|
||||
}
|
||||
|
||||
# -----------------------------
|
||||
# Book task 17.1c
|
||||
# -----------------------------
|
||||
|
||||
task_1c <- data.frame(
|
||||
x = c(2, 3, 4, 6),
|
||||
y = c(10, 8, 8, 7)
|
||||
)
|
||||
|
||||
fit_1c <- fit_simple_regression(task_1c$x, task_1c$y, nu0 = 4 - 2, SS0 = 0.5^2 * (4 - 2))
|
||||
print_regression_summary(
|
||||
label = "Task 1: Chapter 17, problem 1c",
|
||||
fit = fit_1c,
|
||||
x_eval = mean(task_1c$x),
|
||||
level_y = 0.95,
|
||||
level_pred = 0.95
|
||||
)
|
||||
|
||||
# -----------------------------
|
||||
# Book task 17.1d
|
||||
# -----------------------------
|
||||
|
||||
task_1d <- data.frame(
|
||||
x = c(0, 1, 2, 3),
|
||||
y = c(0, 2, 7, 5)
|
||||
)
|
||||
|
||||
fit_1d <- fit_simple_regression(task_1d$x, task_1d$y, nu0 = -2, SS0 = 0)
|
||||
print_regression_summary(
|
||||
label = "Task 2: Chapter 17, problem 1d",
|
||||
fit = fit_1d,
|
||||
x_eval = mean(task_1d$x),
|
||||
level_y = 0.90,
|
||||
level_pred = 0.95
|
||||
)
|
||||
|
||||
# -----------------------------
|
||||
# Dice-drop assignment
|
||||
# -----------------------------
|
||||
|
||||
dice_df <- read_all_dice_data(file.path(script_dir, "terningDroppFiler"))
|
||||
dice_df <- dice_df[complete.cases(dice_df[, intersect(c("x", "y", "z"), names(dice_df)), drop = FALSE]), ]
|
||||
|
||||
dice_fit <- fit_simple_regression(dice_df$x, dice_df$y, nu0 = -2, SS0 = 0)
|
||||
x_grid <- seq(min(dice_df$x), max(dice_df$x), length.out = 300)
|
||||
cred_band_80 <- credible_band(dice_fit, x_grid, level = 0.80)
|
||||
pred_band_80 <- predictive_band(dice_fit, x_grid, level = 0.80)
|
||||
|
||||
cat("\nTask 3: Dice drop data\n")
|
||||
cat("----------------------\n")
|
||||
cat("Number of observations =", nrow(dice_df), "\n")
|
||||
cat("Regression line: y =", round(dice_fit$alpha0, 4), "+", round(dice_fit$beta, 4), "* x\n")
|
||||
cat("80% interval for b:", paste(round(b_interval(dice_fit, 0.80), 4), collapse = " to "), "\n")
|
||||
cat("80% interval for sigma:", paste(round(sigma_interval(dice_fit, 0.80), 4), collapse = " to "), "\n")
|
||||
cat("R^2 for y on x =", round(r_squared(dice_df$x, dice_df$y), 4), "\n")
|
||||
cat("R^2 for z on x =", round(r_squared(dice_df$x, dice_df$z), 4), "\n")
|
||||
|
||||
if ("t" %in% names(dice_df)) {
|
||||
cat("R^2 for t on x =", round(r_squared(dice_df$x, dice_df$t), 4), "\n")
|
||||
} else {
|
||||
cat("No time variable t was found in the CSV files, so task 3h can only be completed for z versus x with the current dataset.\n")
|
||||
}
|
||||
|
||||
plot_regression_with_band(
|
||||
df = dice_df,
|
||||
fit = dice_fit,
|
||||
band_df = cred_band_80,
|
||||
file_name = "task3_credible_band.png",
|
||||
ylab = "Sprettlengde",
|
||||
band_label = "Dice drop data with 80% credible band"
|
||||
)
|
||||
|
||||
plot_regression_only(
|
||||
df = dice_df,
|
||||
fit = dice_fit,
|
||||
file_name = "task3_scatter_regression.png",
|
||||
ylab = "Sprettlengde",
|
||||
plot_title = "Dice drop data with regression line"
|
||||
)
|
||||
|
||||
plot_regression_with_band(
|
||||
df = dice_df,
|
||||
fit = dice_fit,
|
||||
band_df = pred_band_80,
|
||||
file_name = "task3_predictive_band.png",
|
||||
ylab = "Sprettlengde",
|
||||
band_label = "Dice drop data with 80% predictive band"
|
||||
)
|
||||
|
||||
# -----------------------------
|
||||
# Repeated subsample experiments
|
||||
# -----------------------------
|
||||
|
||||
set.seed(1234)
|
||||
|
||||
plot_many_sample_lines(dice_df, sample_size = 5, rounds = 50, file_name = "task4a_lines_N5.png")
|
||||
plot_many_sample_lines(dice_df, sample_size = 15, rounds = 50, file_name = "task4b_lines_N15.png")
|
||||
plot_many_sample_lines(dice_df, sample_size = 50, rounds = 50, file_name = "task4b_lines_N50.png")
|
||||
plot_many_sample_lines(dice_df, sample_size = 200, rounds = 50, file_name = "task4b_lines_N200.png")
|
||||
|
||||
plot_many_b_intervals(dice_df, sample_size = 5, rounds = 50, level = 0.80, file_name = "task4c_b_intervals_N5.png")
|
||||
plot_many_b_intervals(dice_df, sample_size = 15, rounds = 50, level = 0.80, file_name = "task4d_b_intervals_N15.png")
|
||||
plot_many_b_intervals(dice_df, sample_size = 50, rounds = 50, level = 0.80, file_name = "task4d_b_intervals_N50.png")
|
||||
plot_many_b_intervals(dice_df, sample_size = 200, rounds = 50, level = 0.80, file_name = "task4d_b_intervals_N200.png")
|
||||
|
||||
plot_many_credible_bands(dice_df, sample_size = 5, rounds = 50, level = 0.80, file_name = "task4e_bands_N5.png")
|
||||
plot_many_credible_bands(dice_df, sample_size = 15, rounds = 50, level = 0.80, file_name = "task4e_bands_N15.png")
|
||||
plot_many_credible_bands(dice_df, sample_size = 50, rounds = 50, level = 0.80, file_name = "task4e_bands_N50.png")
|
||||
plot_many_credible_bands(dice_df, sample_size = 200, rounds = 50, level = 0.80, file_name = "task4e_bands_N200.png")
|
||||
|
||||
cat("\nTask 4 plots written to:", normalizePath(output_dir), "\n")
|
||||
Reference in New Issue
Block a user