# Precompute sampling distributions for flight arrival delays
# This script generates sampling distributions for different sample sizes
# and saves them to an RDS file for use in the Shiny app

library(tidyverse)

# Fix the RNG state so reruns of this script give the same draws
set.seed(226)

# Read the raw flight records from disk
flights_raw <- readRDS("../../data/flights_2024/flights_2024_top_15_raw.rds")

# Keep only rows where both the arrival time and the arrival delay are present
flights_valid <- filter(flights_raw, !is.na(arr_time), !is.na(arr_delay))

# Population parameters of the arrival delay (all valid flights)
n_population <- nrow(flights_valid)
mu <- mean(flights_valid[["arr_delay"]])
sigma <- sd(flights_valid[["arr_delay"]])

# Report the population summary in a single write to the console
cat(
  sprintf("Population mean (mu): %.2f minutes\n", mu),
  sprintf("Population standard deviation (sigma): %.2f minutes\n", sigma),
  sprintf("Population size: %d flights\n", n_population),
  sep = ""
)

# Sample sizes to compute
sample_sizes <- c(10, 25, 50, 100, 500)
n_simulations <- 1000000

cat(sprintf("\nGenerating %d simulations for each sample size...\n",
            n_simulations))

# Simulate the sampling distribution of the mean.
#
# Draws `n_simulations` samples of size `n_sample` (with replacement) from
# `population` and returns a numeric vector holding the mean of each sample.
#
# Drawing every value at once needs a vector of n_simulations * n_sample
# elements plus a matrix copy of it; with 1e6 simulations and n = 500 that is
# 5e8 values, i.e. gigabytes of memory. Instead, the j-th observation of every
# simulated sample is drawn in one vectorised call and added to a running sum,
# so only vectors of length `n_simulations` are alive at any time.
#
# Draws are still taken sequentially from the seeded RNG, so the output stays
# reproducible. NOTE(review): results are expected to match the bulk
# sample-and-reshape approach up to floating-point rounding -- confirm if
# bit-identical output relative to an earlier run matters.
simulate_sample_means <- function(population, n_sample, n_simulations) {
  stopifnot(length(population) > 0, n_sample >= 1, n_simulations >= 1)

  pop_size <- length(population)
  running_sum <- numeric(n_simulations)
  for (j in seq_len(n_sample)) {
    # sample.int() + indexing sidesteps sample()'s special case in which a
    # length-one numeric `x` is interpreted as 1:x.
    draw_idx <- sample.int(pop_size, size = n_simulations, replace = TRUE)
    running_sum <- running_sum + population[draw_idx]
  }
  running_sum / n_sample
}

# Loop-invariant: extract the population vector once
arr_delay_population <- flights_valid$arr_delay

# Store results: preallocated list with one named slot per sample size
sampling_distributions <- vector("list", length(sample_sizes))
names(sampling_distributions) <- paste0("n_", sample_sizes)

for (n_sample in sample_sizes) {
  cat(sprintf("Computing for n = %d... ", n_sample))

  sample_means <- simulate_sample_means(
    arr_delay_population,
    n_sample = n_sample,
    n_simulations = n_simulations
  )

  # Empirical properties of the simulated sampling distribution, alongside the
  # theoretical standard error sigma / sqrt(n) for comparison
  sampling_mean <- mean(sample_means)
  sampling_sd <- sd(sample_means)
  sampling_bias <- sampling_mean - mu
  theoretical_se <- sigma / sqrt(n_sample)

  # Store results under the key "n_<sample size>"
  sampling_distributions[[paste0("n_", n_sample)]] <- list(
    n = n_sample,
    sample_means = sample_means,
    sampling_mean = sampling_mean,
    sampling_sd = sampling_sd,
    sampling_bias = sampling_bias,
    theoretical_se = theoretical_se,
    mu = mu,
    sigma = sigma
  )

  cat(sprintf("Done. Mean = %.3f, SE = %.3f\n", sampling_mean, sampling_sd))
}

# Persist the list of sampling distributions as an RDS file
output_file <- "sampling_distributions.rds"
saveRDS(sampling_distributions, file = output_file)

# Confirm where the results were written
cat(
  sprintf("\nSampling distributions saved to: %s\n", output_file),
  "Ready for Shiny app!\n",
  sep = ""
)
