5 Overfitting and Optimism in Prediction Models
Figures 5.2 to 5.5
Fig 5.2: Noise in estimating 10% mortality per center
Code
# Surg mortality; 10%
par(mfrow = c(1, 1), mar = c(5, 5, 1, 1))
for (mort in c(.1)) { ## ,0.05,0.02,.01)) { # 4 mortalities or only 1
plot(
x = seq(from = -.025, to = .975, by = .05), dbinom(x = 0:20, 20, mort), axes = F, type = "s", lwd = 2,
xlim = c(-.05, .35), ylim = c(0, .33), col = mycolors[2],
xlab = paste("Observed mortality, true mortality ", round(100 * mort, 0), "%", sep = ""), ylab = "probability density"
)axis(side = 1, at = c(0, .1, .2, .3), labels = c("0%", "10%", "20%", "30%"))
axis(side = 2, at = c(0, 0.1, .2, .3, .4, .5, .6, .7), labels = c("0%", "10%", "20%", "30%", "40%", "50%", "60%", "70%"))
text(x = mort, y = .02 + dbinom(x = max(round(mort * 20), 0), 20, mort), labels = paste("n=20"), col = mycolors[2])
for (i in c(50, 200)) { # add more sample sizes
lines(
x = seq(from = 0 - (0.5 * 1 / i), to = 1 - (0.5 * 1 / i), by = 1 / i), dbinom(x = 0:i, i, mort),
type = "s", lty = ifelse(i == 50, 2, 4), lwd = 2, col = mycolors[ifelse(i == 50, 3, 4)]
)text(x = mort, y = .02 + dbinom(x = max(round(mort * i), 0), i, mort), labels = paste("n=", i, sep = ""), col = mycolors[ifelse(i == 50, 3, 4)])
# end loop n=50,200
} # end loop mort }
Code
## End Fig 5.2 ##
Code
## function for Fig 5.3 and Fig 5.4: Noise vs Heterogeneity
<- function(n = 20, mort = 0.1, tau = c(0, .01, .02, .03)) {
illustrate_noise_heterogeneity par(mfrow = c(2, 2), pty = "m", mar = c(2.5, 4, 1.5, 1))
# Make data set with 100 centers, each 20 patients, 10% mortality, variability sd 0 to 0.03
<- 102
seedn set.seed(seedn)
<- 50
ncenter <- n # n can be changed
nsubjects # simple SD used on probability scale, can be improved upon
for (sdtau in tau) { # set for tau can be changed
<- rnorm(n = ncenter, mean = mort, sd = sdtau) # mort can be changed
truemort <- as.matrix(cbind(1:ncenter, sapply(truemort, FUN = function(x) rbinom(n = 1, nsubjects, x)) / nsubjects, truemort))
mortmat
# Start plotting
plot(x = 0, y = 0, pch = "", xlim = c(-.2, 1.2), ylim = c(-.03, .35), axes = F, xlab = "", ylab = ifelse(sdtau == 0 | sdtau == .02, "Mortality", ""))
axis(side = 2, at = c(0, .1, .2, .3), labels = c("0%", "10%", "20%", "30%"), las = 1)
axis(side = 1, at = c(0, 1), labels = c("Observed", "True mortality"))
text(x = 1, y = .3, ifelse(sdtau == 0, "No heterogeneity",
ifelse(sdtau != 0, paste("+/-", sdtau))
cex = 1, adj = 1, font = 2)
), for (i in (1:ncenter)) {
set.seed(i + seedn)
lines(
x = c(0 + runif(1, min = -.07, max = .07), 1),
y = c(mortmat[i, 2] + runif(1, min = -.001, max = .01), mortmat[i, 3]), col = mycolors[rep(1:10, 10)[i]]
)set.seed(i + seedn)
points(
x = c(0 + runif(1, min = -.07, max = .07), 1),
y = c(mortmat[i, 2] + runif(1, min = -.001, max = .01), mortmat[i, 3]), pch = c("o", "+"), col = mycolors[rep(1:10, 10)[i]]
)
}
}# end function that illustrates the impact of noise (determined by n) vs heterogeneity (determined by sdtau) }
Figs 5.3 and 5.4
These plots llustrate the impact of noise (determined by n, 20 or 200) vs heterogeneity (determined by sdtau (0 - 0.03)). With small n, such as n=20 per center, mortality such as 10% cannot be estimated reliably. Reliable estimation of a center’s performance requires a large n, such as n=200.