# any text to the right of a hashtag is a comment

# simulation study: the repeated-sampling distribution of the sample mean
# when sampling n items from a 0/1 population of size N with s successes
# (console output from an actual run is preserved below as comments)

N <- 8561   # population size
n <- 112    # sample size
s <- 4      # number of successes

str( N )
# num 8561

print( theta.hat <- s / n )   # estimate of success probability
# 0.03571429

# initially, let's do some probability calculations,
# in which we pretend we know the population
# success probability theta (let's set it roughly
# equal to theta.hat for maximum realism) and we compute how
# random samples from this population would come out

N * theta.hat
# 305.75

# so let's build a population with 306 successes (1s)
# and ( N - 306 ) = 8255 failures (0s) in it

population <- c( rep( 1, 306 ), rep( 0, 8255 ) )

length( population )
# 8561

M <- 1000000   # this is the number of simulation replications,
               # which is also the number of rows in the
               # repeated-sampling data set

# the slow way to generate random samples
# and take their means is with an explicit for loop:

# to create replicability of random sampling results
# you can set R's random number seed before sampling

set.seed( 1 )

simulated.means.srs <- rep( NA, M )   # NA is the missing data symbol in R

# simple random sampling (SRS) = sampling without replacement

system.time(
  for ( i in 1:M ) {
    simulated.means.srs[ i ] <- mean( sample( population, n,
      replace = FALSE ) )
  }
)
# user system elapsed (time in seconds)
# 26.41 1.77 28.19

# when i ran this for-loop another time i got
# user system elapsed
# 24.89 6.26 32.38

print( theta.true <- mean( population ) )
# 0.03574349

mean( simulated.means.srs )
# 0.03576871

print( theoretical.se <- sqrt( theta.true * ( 1 - theta.true ) / n ) )
# 0.01754227

sd( simulated.means.srs )
# 0.01741586

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

table( simulated.means.srs )
# 0 0.00892857142857143 0.0178571428571429 0.0267857142857143
# 16451 69004 144499 197452
# 0.0357142857142857 0.0446428571428571 0.0535714285714286 0.0625
# 200296 159959 106404 58995
# 0.0714285714285714
# (continuation of the table( simulated.means.srs ) output)
# 0.0803571428571429 0.0892857142857143 0.0982142857142857
# 28552 11817 4510 1433
# 0.107142857142857 0.116071428571429 0.125 0.133928571428571
# 466 127 31 3
# 0.142857142857143
# 1

( 0:16 ) / 112
# 0.000000000 0.008928571 0.017857143 0.026785714 0.035714286 0.044642857
# 0.053571429 0.062500000 0.071428571 0.080357143 0.089285714 0.098214286
# 0.107142857 0.116071429 0.125000000 0.133928571 0.142857143

# the only possible means are of the form ( # of 1s ) / n
# and with n = 112 and M = 1,000,000 the only
# sampled values of ( # of 1s ) are 0, 1, ..., 16

# larger 'adjust' values widen the kernel bandwidth, smoothing
# over the discreteness of the possible mean values

plot( density( simulated.means.srs, adjust = 2 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs, adjust = 5 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

plot( density( simulated.means.srs, adjust = 10 ), type = 'l', main = '',
  xlab = 'Simulated Mean' )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean' )

simulated.mean.grid <- seq( 0, 0.14, length.out = 500 )

print( theoretical.mean <- 306 / 8561 )
# 0.03574349

print( theoretical.sd <- sqrt( theoretical.mean *
  ( 1 - theoretical.mean ) / n ) )
# 0.01754227

# overlay the normal approximation to the repeated-sampling
# distribution of the sample mean on the histogram

lines( simulated.mean.grid, dnorm( simulated.mean.grid, theoretical.mean,
  theoretical.sd ), lwd = 2, col = 'red' )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean', xlim = c( -0.03, 0.11 ), ylim = c( 0, 25 ) )

simulated.mean.grid <- seq( -0.03, 0.14, length.out = 500 )

lines( simulated.mean.grid, dnorm( simulated.mean.grid, theoretical.mean,
  theoretical.sd ), lwd = 2, col = 'red' )

# when the sampling is IID (at random with replacement),
# there's a faster way to generate random samples
# and take their means: draw all M * n values at once,
# arrange them in an M by n matrix, and take the row means

system.time(
  simulated.means.iid <- rowMeans( matrix( sample( population, M * n,
    replace = TRUE ), M, n ) )
)
# user system elapsed
# 20.53 0.64 21.55

# a second time i got
# user system elapsed
# 19.30 0.71 20.02

str( simulated.means.iid )
# num [1:1000000] 0.0179 0.0179 0.0268 0.0357
# 0.0714 ...

par( mfrow = c( 2, 1 ) )   # plot with 2 rows and 1 column

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (IID)' )

par( mfrow = c( 1, 1 ) )   # plot with 1 row and 1 column

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, add = TRUE,
  lty = 2 )

hist( simulated.means.srs, breaks = 15, probability = TRUE, main = '',
  xlab = 'Simulated Mean (SRS)', col = 'black' )

hist( simulated.means.iid, breaks = 15, probability = TRUE, add = TRUE,
  col = 'red' )

# now run the basic modeling diagram in the other direction
# to do large-sample frequentist statistical inference

print( theta.hat <- s / n )   # estimate of success probability

print( se <- sqrt( theta.hat * ( 1 - theta.hat ) / n ) )

qnorm( 0.025 )
# -1.959964

qnorm( 0.975 )
# 1.959964

print( confidence.interval.0.95 <- c( theta.hat - qnorm( 0.975 ) * se,
  theta.hat + qnorm( 0.975 ) * se ) )
# 0.00134560 0.07008297

# this is called a large-sample confidence interval
# because it relies on the normal approximation
# to the repeated-sampling (RS) distribution
# of the sample mean, which we can see from
# the plots above is not terrific with an n
# of only 112 and a theta.hat close to 0

# creating mr.
# neyman's confidence interval diagram: simulate many data sets,
# compute the large-sample 95% CI from each, and draw each interval
# as a horizontal segment to see how often it covers the truth

set.seed( 1 )

diagram.replications <- 100

diagram.results <- matrix( NA, diagram.replications, 2 )

par( mfrow = c( 1, 1 ) )

plot( 0, 1, type = 'n', xlim = c( -0.01, 0.13 ),
  ylim = c( 0, diagram.replications ), xlab = 'CI limits',
  ylab = 'Simulation Replication' )

abline( v = mean( population ), lwd = 2, col = 'red' )   # the truth
abline( v = 0, lwd = 2, col = 'blue', lty = 2 )          # theta cannot go below 0

for ( i in 1:diagram.replications ) {

  y.star <- sample( population, n, replace = TRUE )

  theta.hat <- mean( y.star )
  se <- sqrt( theta.hat * ( 1 - theta.hat ) / n )

  # large-sample (normal-approximation) 95% interval
  diagram.results[ i, ] <- c( theta.hat - qnorm( 0.975 ) * se,
    theta.hat + qnorm( 0.975 ) * se )

  segments( diagram.results[ i, 1 ], i, diagram.results[ i, 2 ], i )

}

mean( population )
# 0.03574349

diagram.results
#                 [,1]       [,2]
# [1,] -0.006669178 0.04238346 negative         [2,] -0.006669178 0.04238346 negative
# [3,] -0.008492808 0.02634995 negative miss low    [4,] -0.008492808 0.02634995 negative miss low
# [5,] 0.011870127 0.09527273                   [6,] -0.006669178 0.04238346 negative
# [7,] 0.006395807 0.08288991                   [8,] 0.017670414 0.10732959
# [9,] 0.001345600 0.07008297                   [10,] 0.011870127 0.09527273
# [11,] -0.003115921 0.05668735 negative        [12,] 0.006395807 0.08288991
# [13,] 0.006395807 0.08288991                  [14,] 0.001345600 0.07008297
# [15,] 0.001345600 0.07008297                  [16,] 0.001345600 0.07008297
# [17,] 0.001345600 0.07008297                  [18,] 0.017670414 0.10732959
# [19,] 0.011870127 0.09527273                  [20,] 0.011870127 0.09527273
# [21,] 0.011870127 0.09527273                  [22,] 0.001345600 0.07008297
# [23,] -0.006669178 0.04238346 negative        [24,] 0.001345600 0.07008297
# [25,] -0.008492808 0.02634995 negative miss low   [26,] 0.001345600 0.07008297
# [27,] -0.006669178 0.04238346 negative        [28,] 0.030011612 0.13070267
# [29,] 0.011870127 0.09527273                  [30,] 0.006395807 0.08288991
# [31,] 0.023732488 0.11912466                  [32,] 0.017670414 0.10732959
# [33,] -0.008492808 0.02634995 negative miss low   [34,] 0.023732488 0.11912466
# [35,] 0.017670414 0.10732959                  [36,] 0.011870127 0.09527273
# [37,] 0.017670414 0.10732959                  [38,] -0.003115921 0.05668735 negative
# [39,] -0.003115921 0.05668735 negative        [40,] 0.001345600 0.07008297
# [41,] 0.011870127 0.09527273                  [42,] 0.000000000 0.00000000 miss low null
# [43,] -0.006669178 0.04238346 negative        [44,] 0.006395807 0.08288991
# [45,] 0.006395807 0.08288991                  [46,] -0.003115921 0.05668735 negative
# [47,] 0.006395807 0.08288991                  [48,] -0.008492808 0.02634995 negative miss low
# [49,] 0.001345600 0.07008297                  [50,] 0.001345600 0.07008297
# [51,] -0.003115921 0.05668735 negative        [52,] 0.011870127 0.09527273
# [53,] 0.011870127 0.09527273                  [54,] 0.023732488 0.11912466
# [55,] 0.000000000 0.00000000 miss low null    [56,] -0.003115921 0.05668735 negative
# [57,] 0.011870127 0.09527273                  [58,] -0.003115921 0.05668735 negative
# [59,] 0.001345600 0.07008297                  [60,] -0.003115921 0.05668735 negative
# [61,] 0.006395807 0.08288991                  [62,] -0.003115921 0.05668735 negative
# [63,] -0.003115921 0.05668735 negative        [64,] 0.001345600 0.07008297
# [65,] -0.008492808 0.02634995 negative miss low   [66,] 0.006395807 0.08288991
# [67,] 0.011870127 0.09527273                  [68,] 0.006395807 0.08288991
# [69,] 0.001345600 0.07008297                  [70,] 0.001345600 0.07008297
# [71,] -0.008492808 0.02634995 negative miss low   [72,] 0.001345600 0.07008297
# [73,] 0.011870127 0.09527273                  [74,] -0.003115921 0.05668735 negative
# [75,] -0.003115921 0.05668735 negative        [76,] 0.011870127 0.09527273
# [77,] 0.006395807 0.08288991                  [78,] -0.006669178 0.04238346 negative
# [79,] 0.006395807 0.08288991                  [80,] 0.001345600 0.07008297
# [81,] 0.011870127 0.09527273                  [82,] 0.011870127 0.09527273
# [83,] 0.000000000 0.00000000 miss low null    [84,] -0.008492808 0.02634995 negative miss low
# [85,] 0.011870127 0.09527273                  [86,] 0.006395807 0.08288991
# [87,] -0.006669178 0.04238346 negative        [88,] 0.006395807 0.08288991
# [89,] -0.006669178 0.04238346 negative        [90,] 0.006395807 0.08288991
# [91,] 0.006395807 0.08288991                  [92,] 0.023732488 0.11912466
# [93,] -0.003115921 0.05668735 negative        [94,] -0.006669178 0.04238346 negative
# [95,] -0.003115921 0.05668735 negative        [96,] 0.001345600 0.07008297
# [97,] 0.011870127 0.09527273                  [98,] 0.011870127 0.09527273
# [99,] 0.006395807 0.08288991                  [100,] 0.011870127 0.09527273

mean( diagram.results[ , 1 ] < 0 )
# 0.32

# it's embarrassing to have your confidence interval
# for something that has to be non-negative go negative
# 32% of the time

mean( ( diagram.results[ , 1 ] < mean( population ) ) &
  ( diagram.results[ , 2 ] > mean( population ) ) )
# 0.89

# these intervals were supposed to include the truth
# 95% of the time, and they only did so 89% of the time

# all of the misses were on the low side,
# by which i mean that the interval lay
# completely to the left of the truth

mean( diagram.results[ , 1 ] == 0 )
# 0.03

# 3% of the data sets were all 0s;
# the large-sample confidence intervals
# with such data sets are of the form ( 0, 0 )
# and so must fail to include the truth

##############################################

# small-sample frequentist statistical inference
# with the logit transform

set.seed( 1 )

new.diagram.replications <- 100

new.diagram.results <- matrix( NA, new.diagram.replications, 2 )

par( mfrow = c( 1, 1 ) )

plot( 0, 1, type = 'n', xlim = c( -0.01, 0.15 ),
  ylim = c( 0, new.diagram.replications ), xlab = 'CI limits',
  ylab = 'Simulation Replication' )

abline( v = mean( population ), lwd = 2, col = 'red' )   # the truth
abline( v = 0, lwd = 2, col = 'blue', lty = 2 )

for ( i in 1:new.diagram.replications ) {

  y.star <- sample( population, n, replace = TRUE )
  s.star <- sum( y.star )

  if ( s.star == 0 ) print( i )   # flag data sets that are all 0s

  # shrink the estimate away from 0 and 1 so the logit is finite
  theta.hat.prime <- ( s.star + 0.5 ) / ( n + 1 )

  # work on the logit (log-odds) scale, where the normal
  # approximation is better behaved for theta near 0
  eta.hat <- log( theta.hat.prime / ( 1 - theta.hat.prime ) )
  se.eta.hat <- sqrt( 1 / ( n * theta.hat.prime * ( 1 - theta.hat.prime ) ) )

  transformed.interval <- c( eta.hat - qnorm( 0.975 ) * se.eta.hat,
    eta.hat + qnorm( 0.975 ) * se.eta.hat )

  # inverse-logit maps the interval back to the probability scale,
  # guaranteeing both endpoints lie in ( 0, 1 )
  new.diagram.results[ i, ] <- 1 / ( 1 + exp( - transformed.interval ) )

  segments( new.diagram.results[ i, 1 ], i, new.diagram.results[ i, 2 ], i )

}
# [1] 42
# [1] 55
# [1] 83
# iteration numbers of data sets that were all 0s
new.diagram.results
#                 [,1]       [,2]
# [1,] 0.0063821818 0.07380851    [2,] 0.0063821818 0.07380851
# [3,] 0.0026599939 0.06354513    [4,] 0.0026599939 0.06354513
# [5,] 0.0268115849 0.11910406    [6,] 0.0063821818 0.07380851
# [7,] 0.0211776579 0.10792804    [8,] 0.0326793420 0.13012783
# [9,] 0.0158318350 0.09660132    [10,] 0.0268115849 0.11910406
# [11,] 0.0108556084 0.08516410   [12,] 0.0211776579 0.10792804
# [13,] 0.0211776579 0.10792804   [14,] 0.0158318350 0.09660132
# [15,] 0.0158318350 0.09660132   [16,] 0.0158318350 0.09660132
# [17,] 0.0158318350 0.09660132   [18,] 0.0326793420 0.13012783
# [19,] 0.0268115849 0.11910406   [20,] 0.0268115849 0.11910406
# [21,] 0.0268115849 0.11910406   [22,] 0.0158318350 0.09660132
# [23,] 0.0063821818 0.07380851   [24,] 0.0158318350 0.09660132
# [25,] 0.0026599939 0.06354513   [26,] 0.0158318350 0.09660132
# [27,] 0.0063821818 0.07380851   [28,] 0.0449729355 0.15175774 miss high
# [29,] 0.0268115849 0.11910406   [30,] 0.0211776579 0.10792804
# [31,] 0.0387425695 0.14100854 miss high   [32,] 0.0326793420 0.13012783
# [33,] 0.0026599939 0.06354513   [34,] 0.0387425695 0.14100854 miss high
# [35,] 0.0326793420 0.13012783   [36,] 0.0268115849 0.11910406
# [37,] 0.0326793420 0.13012783   [38,] 0.0108556084 0.08516410
# [39,] 0.0108556084 0.08516410   [40,] 0.0158318350 0.09660132
# [41,] 0.0268115849 0.11910406   [42,] 0.0002728174 0.06749846 data set all 0s
# [43,] 0.0063821818 0.07380851   [44,] 0.0211776579 0.10792804
# [45,] 0.0211776579 0.10792804   [46,] 0.0108556084 0.08516410
# [47,] 0.0211776579 0.10792804   [48,] 0.0026599939 0.06354513
# [49,] 0.0158318350 0.09660132   [50,] 0.0158318350 0.09660132
# [51,] 0.0108556084 0.08516410   [52,] 0.0268115849 0.11910406
# [53,] 0.0268115849 0.11910406   [54,] 0.0387425695 0.14100854 miss high
# [55,] 0.0002728174 0.06749846 data set all 0s   [56,] 0.0108556084 0.08516410
# [57,] 0.0268115849 0.11910406   [58,] 0.0108556084 0.08516410
# [59,] 0.0158318350 0.09660132   [60,] 0.0108556084 0.08516410
# [61,] 0.0211776579 0.10792804   [62,] 0.0108556084 0.08516410
# [63,] 0.0108556084 0.08516410   [64,] 0.0158318350 0.09660132
# [65,] 0.0026599939 0.06354513   [66,] 0.0211776579 0.10792804
# [67,] 0.0268115849 0.11910406   [68,] 0.0211776579 0.10792804
# [69,] 0.0158318350 0.09660132   [70,] 0.0158318350 0.09660132
# [71,] 0.0026599939 0.06354513   [72,] 0.0158318350 0.09660132
# [73,] 0.0268115849 0.11910406   [74,] 0.0108556084 0.08516410
# [75,] 0.0108556084 0.08516410   [76,] 0.0268115849 0.11910406
# [77,] 0.0211776579 0.10792804   [78,] 0.0063821818 0.07380851
# [79,] 0.0211776579 0.10792804   [80,] 0.0158318350 0.09660132
# [81,] 0.0268115849 0.11910406   [82,] 0.0268115849 0.11910406
# [83,] 0.0002728174 0.06749846 data set all 0s   [84,] 0.0026599939 0.06354513
# [85,] 0.0268115849 0.11910406   [86,] 0.0211776579 0.10792804
# [87,] 0.0063821818 0.07380851   [88,] 0.0211776579 0.10792804
# [89,] 0.0063821818 0.07380851   [90,] 0.0211776579 0.10792804
# [91,] 0.0211776579 0.10792804   [92,] 0.0387425695 0.14100854 miss high
# [93,] 0.0108556084 0.08516410   [94,] 0.0063821818 0.07380851
# [95,] 0.0108556084 0.08516410   [96,] 0.0158318350 0.09660132
# [97,] 0.0268115849 0.11910406   [98,] 0.0268115849 0.11910406
# [99,] 0.0211776579 0.10792804   [100,] 0.0268115849 0.11910406

mean( new.diagram.results[ , 1 ] < 0 )
# 0

# the negative left-hand limit problem has disappeared

mean( ( new.diagram.results[ , 1 ] < mean( population ) ) &
  ( new.diagram.results[ , 2 ] > mean( population ) ) )
# 0.95

# these new intervals were supposed to include the truth
# 95% of the time, and they did so 95% of the time

# interestingly, all of the misses are now on the high side

# 3% of the data sets were all 0s; as we saw earlier,
# the large-sample confidence intervals
# with such data sets are of the form ( 0, 0 )
# and so must fail to include the truth,
# but the small-sample intervals all get it right