# any text to the right of a hashtag is a comment

# simulation study: the repeated-sampling distribution of the sample mean
# when sampling n items from a 0/1 population of size N with s successes
# (console output from an actual run is preserved below as comments)

N <- 8561   # population size
n <- 112    # sample size
s <- 4      # number of successes

str( N )
# num 8561

print( theta.hat <- s / n )   # estimate of success probability
# 0.03571429

# initially, let's do some probability calculations,
# in which we pretend we know the population
# success probability theta (let's set it roughly
# equal to theta.hat for maximum realism) and we compute how
# random samples from this population would come out

N * theta.hat
# 305.75

# so let's build a population with 306 successes (1s)
# and ( N - 306 ) = 8255 failures (0s) in it

population <- c( rep( 1, 306 ), rep( 0, 8255 ) )

length( population )
# 8561

M <- 1000000   # this is the number of simulation replications,
               # which is also the number of rows in the
               # repeated-sampling data set

# the slow way to generate random samples
# and take their means is with an explicit for loop:

# to create replicability of random sampling results
# you can set R's random number seed before sampling

set.seed( 1 )

simulated.means.srs <- rep( NA, M )   # NA is the missing data symbol in R

# simple random sampling (SRS) = sampling without replacement

system.time(
  for ( i in 1:M ) {
    simulated.means.srs[ i ] <- mean( sample( population, n,
      replace = FALSE ) )
  }
)
# user system elapsed (time in seconds)
# 26.41 1.77 28.19

# when i ran this for-loop another time i got
# user system elapsed
# 24.89 6.26 32.38

print( theta.true <- mean( population ) )
# 0.03574349

mean( simulated.means.srs )
# 0.03576871

print( theoretical.se <- sqrt( theta.true * ( 1 - theta.true ) / n ) )
# 0.01754227

sd( simulated.means.srs )
# 0.01741586

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

table( simulated.means.srs )
# 0 0.00892857142857143 0.0178571428571429 0.0267857142857143
# 16451 69004 144499 197452
# 0.0357142857142857 0.0446428571428571 0.0535714285714286 0.0625
# 200296 159959 106404 58995
# 0.0714285714285714
# (continuation of the table( simulated.means.srs ) output)
# 0.0803571428571429 0.0892857142857143 0.0982142857142857
# 28552 11817 4510 1433
# 0.107142857142857 0.116071428571429 0.125 0.133928571428571
# 466 127 31 3
# 0.142857142857143
# 1

( 0:16 ) / 112
# 0.000000000 0.008928571 0.017857143 0.026785714 0.035714286 0.044642857
# 0.053571429 0.062500000 0.071428571 0.080357143 0.089285714 0.098214286
# 0.107142857 0.116071429 0.125000000 0.133928571 0.142857143

# the only possible means are of the form ( # of 1s ) / n
# and with n = 112 and M = 1,000,000 the only
# sampled values of ( # of 1s ) are 0, 1, ..., 16

# larger 'adjust' values widen the kernel bandwidth, smoothing
# over the discreteness of the possible mean values

plot( density( simulated.means.srs, adjust = 2 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs, adjust = 5 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs, adjust = 10 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean' )

simulated.mean.grid <- seq( 0, 0.14, length.out = 500 )

print( theoretical.mean <- 306 / 8561 )
# 0.03574349

print( theoretical.sd <- sqrt( theoretical.mean *
  ( 1 - theoretical.mean ) / n ) )
# 0.01754227

# overlay the normal approximation to the repeated-sampling
# distribution of the sample mean on the histogram

lines( simulated.mean.grid, dnorm( simulated.mean.grid, theoretical.mean,
  theoretical.sd ), lwd = 2, col = 'red' )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean', xlim = c( -0.03, 0.11 ), ylim = c( 0, 25 ) )

simulated.mean.grid <- seq( -0.03, 0.14, length.out = 500 )

lines( simulated.mean.grid, dnorm( simulated.mean.grid, theoretical.mean,
  theoretical.sd ), lwd = 2, col = 'red' )

# when the sampling is IID (at random with replacement),
# there's a faster way to generate random samples
# and take their means: draw all M * n values at once,
# arrange them in an M by n matrix, and take the row means

system.time(
  simulated.means.iid <- rowMeans( matrix( sample( population, M * n,
    replace = TRUE ), M, n ) )
)
# user system elapsed
# 20.53 0.64 21.55

# a second time i got
# user system elapsed
# 19.30 0.71 20.02

str( simulated.means.iid )
# num [1:1000000] 0.0179 0.0179 0.0268 0.0357
# 0.0714 ...

par( mfrow = c( 2, 1 ) )   # plot with 2 rows and 1 column

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (IID)' )

par( mfrow = c( 1, 1 ) )   # plot with 1 row and 1 column

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, add = TRUE,
  lty = 2 )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)', col = 'black' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, add = TRUE,
  col = 'red' )

# now run the basic modeling diagram in the other direction
# to do large-sample frequentist statistical inference

print( theta.hat <- s / n )   # estimate of success probability

print( se <- sqrt( theta.hat * ( 1 - theta.hat ) / n ) )

qnorm( 0.025 )
# -1.959964

qnorm( 0.975 )
# 1.959964

print( confidence.interval.0.95 <- c( theta.hat - qnorm( 0.975 ) * se,
  theta.hat + qnorm( 0.975 ) * se ) )
# 0.00134560 0.07008297

# this is called a large-sample confidence interval
# because it relies on the normal approximation
# to the repeated-sampling (RS) distribution
# of the sample mean, which we can see from
# the plots above is not terrific with an n
# of only 112 and a theta.hat close to 0

# creating mr.
# neyman's confidence interval diagram: simulate many data sets,
# compute the large-sample 95% CI from each, and draw each interval
# as a horizontal segment to see how often it covers the truth

set.seed( 1 )

diagram.replications <- 100

diagram.results <- matrix( NA, diagram.replications, 2 )

par( mfrow = c( 1, 1 ) )

plot( 0, 1, type = 'n', xlim = c( -0.01, 0.13 ),
  ylim = c( 0, diagram.replications ), xlab = 'CI limits',
  ylab = 'Simulation Replication' )

abline( v = mean( population ), lwd = 2, col = 'red' )   # the truth
abline( v = 0, lwd = 2, col = 'blue', lty = 2 )          # theta cannot go below 0

for ( i in 1:diagram.replications ) {

  y.star <- sample( population, n, replace = TRUE )

  theta.hat <- mean( y.star )
  se <- sqrt( theta.hat * ( 1 - theta.hat ) / n )

  # large-sample (normal-approximation) 95% interval
  diagram.results[ i, ] <- c( theta.hat - qnorm( 0.975 ) * se,
    theta.hat + qnorm( 0.975 ) * se )

  segments( diagram.results[ i, 1 ], i, diagram.results[ i, 2 ], i )

}

mean( population )
# 0.03574349

diagram.results
#                 [,1]       [,2]
# [1,] -0.006669178 0.04238346 negative         [2,] -0.006669178 0.04238346 negative
# [3,] -0.008492808 0.02634995 negative miss low    [4,] -0.008492808 0.02634995 negative miss low
# [5,] 0.011870127 0.09527273                   [6,] -0.006669178 0.04238346 negative
# [7,] 0.006395807 0.08288991                   [8,] 0.017670414 0.10732959
# [9,] 0.001345600 0.07008297                   [10,] 0.011870127 0.09527273
# [11,] -0.003115921 0.05668735 negative        [12,] 0.006395807 0.08288991
# [13,] 0.006395807 0.08288991                  [14,] 0.001345600 0.07008297
# [15,] 0.001345600 0.07008297                  [16,] 0.001345600 0.07008297
# [17,] 0.001345600 0.07008297                  [18,] 0.017670414 0.10732959
# [19,] 0.011870127 0.09527273                  [20,] 0.011870127 0.09527273
# [21,] 0.011870127 0.09527273                  [22,] 0.001345600 0.07008297
# [23,] -0.006669178 0.04238346 negative        [24,] 0.001345600 0.07008297
# [25,] -0.008492808 0.02634995 negative miss low   [26,] 0.001345600 0.07008297
# [27,] -0.006669178 0.04238346 negative        [28,] 0.030011612 0.13070267
# [29,] 0.011870127 0.09527273                  [30,] 0.006395807 0.08288991
# [31,] 0.023732488 0.11912466                  [32,] 0.017670414 0.10732959
# [33,] -0.008492808 0.02634995 negative miss low   [34,] 0.023732488 0.11912466
# [35,] 0.017670414 0.10732959                  [36,] 0.011870127 0.09527273
# [37,] 0.017670414 0.10732959                  [38,] -0.003115921 0.05668735 negative
# [39,] -0.003115921 0.05668735 negative        [40,] 0.001345600 0.07008297
# [41,] 0.011870127 0.09527273                  [42,] 0.000000000 0.00000000 miss low null
# [43,] -0.006669178 0.04238346 negative        [44,] 0.006395807 0.08288991
# [45,] 0.006395807 0.08288991                  [46,] -0.003115921 0.05668735 negative
# [47,] 0.006395807 0.08288991                  [48,] -0.008492808 0.02634995 negative miss low
# [49,] 0.001345600 0.07008297                  [50,] 0.001345600 0.07008297
# [51,] -0.003115921 0.05668735 negative        [52,] 0.011870127 0.09527273
# [53,] 0.011870127 0.09527273                  [54,] 0.023732488 0.11912466
# [55,] 0.000000000 0.00000000 miss low null    [56,] -0.003115921 0.05668735 negative
# [57,] 0.011870127 0.09527273                  [58,] -0.003115921 0.05668735 negative
# [59,] 0.001345600 0.07008297                  [60,] -0.003115921 0.05668735 negative
# [61,] 0.006395807 0.08288991                  [62,] -0.003115921 0.05668735 negative
# [63,] -0.003115921 0.05668735 negative        [64,] 0.001345600 0.07008297
# [65,] -0.008492808 0.02634995 negative miss low   [66,] 0.006395807 0.08288991
# [67,] 0.011870127 0.09527273                  [68,] 0.006395807 0.08288991
# [69,] 0.001345600 0.07008297                  [70,] 0.001345600 0.07008297
# [71,] -0.008492808 0.02634995 negative miss low   [72,] 0.001345600 0.07008297
# [73,] 0.011870127 0.09527273                  [74,] -0.003115921 0.05668735 negative
# [75,] -0.003115921 0.05668735 negative        [76,] 0.011870127 0.09527273
# [77,] 0.006395807 0.08288991                  [78,] -0.006669178 0.04238346 negative
# [79,] 0.006395807 0.08288991                  [80,] 0.001345600 0.07008297
# [81,] 0.011870127 0.09527273                  [82,] 0.011870127 0.09527273
# [83,] 0.000000000 0.00000000 miss low null    [84,] -0.008492808 0.02634995 negative miss low
# [85,] 0.011870127 0.09527273                  [86,] 0.006395807 0.08288991
# [87,] -0.006669178 0.04238346 negative        [88,] 0.006395807 0.08288991
# [89,] -0.006669178 0.04238346 negative        [90,] 0.006395807 0.08288991
# [91,] 0.006395807 0.08288991                  [92,] 0.023732488 0.11912466
# [93,] -0.003115921 0.05668735 negative        [94,] -0.006669178 0.04238346 negative
# [95,] -0.003115921 0.05668735 negative        [96,] 0.001345600 0.07008297
# [97,] 0.011870127 0.09527273                  [98,] 0.011870127 0.09527273
# [99,] 0.006395807 0.08288991                  [100,] 0.011870127 0.09527273

mean( diagram.results[ , 1 ] < 0 )
# 0.32

# it's embarrassing to have your confidence interval
# for something that has to be non-negative go negative
# 32% of the time

mean( ( diagram.results[ , 1 ] < mean( population ) ) &
  ( diagram.results[ , 2 ] > mean( population ) ) )
# 0.89

# these intervals were supposed to include the truth
# 95% of the time, and they only did so 89% of the time

# all of the misses were on the low side,
# by which i mean that the interval lay
# completely to the left of the truth

mean( diagram.results[ , 1 ] == 0 )
# 0.03

# 3% of the data sets were all 0s;
# the large-sample confidence intervals
# with such data sets are of the form ( 0, 0 )
# and so must fail to include the truth

##############################################

# small-sample frequentist statistical inference
# with the logit transform

set.seed( 1 )

new.diagram.replications <- 100

new.diagram.results <- matrix( NA, new.diagram.replications, 2 )

par( mfrow = c( 1, 1 ) )

plot( 0, 1, type = 'n', xlim = c( -0.01, 0.15 ),
  ylim = c( 0, new.diagram.replications ), xlab = 'CI limits',
  ylab = 'Simulation Replication' )

abline( v = mean( population ), lwd = 2, col = 'red' )   # the truth
abline( v = 0, lwd = 2, col = 'blue', lty = 2 )

for ( i in 1:new.diagram.replications ) {

  y.star <- sample( population, n, replace = TRUE )
  s.star <- sum( y.star )

  if ( s.star == 0 ) print( i )   # flag data sets that are all 0s

  # shrink the estimate away from 0 and 1 so the logit is finite
  theta.hat.prime <- ( s.star + 0.5 ) / ( n + 1 )

  # work on the logit (log-odds) scale, where the normal
  # approximation is better behaved for theta near 0
  eta.hat <- log( theta.hat.prime / ( 1 - theta.hat.prime ) )
  se.eta.hat <- sqrt( 1 / ( n * theta.hat.prime * ( 1 - theta.hat.prime ) ) )

  transformed.interval <- c( eta.hat - qnorm( 0.975 ) * se.eta.hat,
    eta.hat + qnorm( 0.975 ) * se.eta.hat )

  # inverse-logit maps the interval back to the probability scale,
  # guaranteeing both endpoints lie in ( 0, 1 )
  new.diagram.results[ i, ] <- 1 / ( 1 + exp( - transformed.interval ) )

  segments( new.diagram.results[ i, 1 ], i, new.diagram.results[ i, 2 ], i )

}
# [1] 42
# [1] 55
# [1] 83
# iteration numbers of data sets that were all 0s
new.diagram.results
#                 [,1]       [,2]
# [1,] 0.0063821818 0.07380851    [2,] 0.0063821818 0.07380851
# [3,] 0.0026599939 0.06354513    [4,] 0.0026599939 0.06354513
# [5,] 0.0268115849 0.11910406    [6,] 0.0063821818 0.07380851
# [7,] 0.0211776579 0.10792804    [8,] 0.0326793420 0.13012783
# [9,] 0.0158318350 0.09660132    [10,] 0.0268115849 0.11910406
# [11,] 0.0108556084 0.08516410   [12,] 0.0211776579 0.10792804
# [13,] 0.0211776579 0.10792804   [14,] 0.0158318350 0.09660132
# [15,] 0.0158318350 0.09660132   [16,] 0.0158318350 0.09660132
# [17,] 0.0158318350 0.09660132   [18,] 0.0326793420 0.13012783
# [19,] 0.0268115849 0.11910406   [20,] 0.0268115849 0.11910406
# [21,] 0.0268115849 0.11910406   [22,] 0.0158318350 0.09660132
# [23,] 0.0063821818 0.07380851   [24,] 0.0158318350 0.09660132
# [25,] 0.0026599939 0.06354513   [26,] 0.0158318350 0.09660132
# [27,] 0.0063821818 0.07380851   [28,] 0.0449729355 0.15175774 miss high
# [29,] 0.0268115849 0.11910406   [30,] 0.0211776579 0.10792804
# [31,] 0.0387425695 0.14100854 miss high   [32,] 0.0326793420 0.13012783
# [33,] 0.0026599939 0.06354513   [34,] 0.0387425695 0.14100854 miss high
# [35,] 0.0326793420 0.13012783   [36,] 0.0268115849 0.11910406
# [37,] 0.0326793420 0.13012783   [38,] 0.0108556084 0.08516410
# [39,] 0.0108556084 0.08516410   [40,] 0.0158318350 0.09660132
# [41,] 0.0268115849 0.11910406   [42,] 0.0002728174 0.06749846 data set all 0s
# [43,] 0.0063821818 0.07380851   [44,] 0.0211776579 0.10792804
# [45,] 0.0211776579 0.10792804   [46,] 0.0108556084 0.08516410
# [47,] 0.0211776579 0.10792804   [48,] 0.0026599939 0.06354513
# [49,] 0.0158318350 0.09660132   [50,] 0.0158318350 0.09660132
# [51,] 0.0108556084 0.08516410   [52,] 0.0268115849 0.11910406
# [53,] 0.0268115849 0.11910406   [54,] 0.0387425695 0.14100854 miss high
# [55,] 0.0002728174 0.06749846 data set all 0s   [56,] 0.0108556084 0.08516410
# [57,] 0.0268115849 0.11910406   [58,] 0.0108556084 0.08516410
# [59,] 0.0158318350 0.09660132   [60,] 0.0108556084 0.08516410
# [61,] 0.0211776579 0.10792804   [62,] 0.0108556084 0.08516410
# [63,] 0.0108556084 0.08516410   [64,] 0.0158318350 0.09660132
# [65,] 0.0026599939 0.06354513   [66,] 0.0211776579 0.10792804
# [67,] 0.0268115849 0.11910406   [68,] 0.0211776579 0.10792804
# [69,] 0.0158318350 0.09660132   [70,] 0.0158318350 0.09660132
# [71,] 0.0026599939 0.06354513   [72,] 0.0158318350 0.09660132
# [73,] 0.0268115849 0.11910406   [74,] 0.0108556084 0.08516410
# [75,] 0.0108556084 0.08516410   [76,] 0.0268115849 0.11910406
# [77,] 0.0211776579 0.10792804   [78,] 0.0063821818 0.07380851
# [79,] 0.0211776579 0.10792804   [80,] 0.0158318350 0.09660132
# [81,] 0.0268115849 0.11910406   [82,] 0.0268115849 0.11910406
# [83,] 0.0002728174 0.06749846 data set all 0s   [84,] 0.0026599939 0.06354513
# [85,] 0.0268115849 0.11910406   [86,] 0.0211776579 0.10792804
# [87,] 0.0063821818 0.07380851   [88,] 0.0211776579 0.10792804
# [89,] 0.0063821818 0.07380851   [90,] 0.0211776579 0.10792804
# [91,] 0.0211776579 0.10792804   [92,] 0.0387425695 0.14100854 miss high
# [93,] 0.0108556084 0.08516410   [94,] 0.0063821818 0.07380851
# [95,] 0.0108556084 0.08516410   [96,] 0.0158318350 0.09660132
# [97,] 0.0268115849 0.11910406   [98,] 0.0268115849 0.11910406
# [99,] 0.0211776579 0.10792804   [100,] 0.0268115849 0.11910406

mean( new.diagram.results[ , 1 ] < 0 )
# 0

# the negative left-hand limit problem has disappeared

mean( ( new.diagram.results[ , 1 ] < mean( population ) ) &
  ( new.diagram.results[ , 2 ] > mean( population ) ) )
# 0.95

# these new intervals were supposed to include the truth
# 95% of the time, and they did so 95% of the time

# interestingly, all of the misses are now on the high side

# 3% of the data sets were all 0s; as we saw earlier,
# the large-sample confidence intervals
# with such data sets are of the form ( 0, 0 )
# and so must fail to include the truth,
# but the small-sample intervals all get it right