9 Coding of Categorical and Continuous Predictors
Figues 9.1 to 9.6
GUSTO-I is a data set with patients suffering from an acute myocardial infarction, where we want to predict 30-day mortality. TBI is a data set with patients suffering from a moderate or severe traumatic brain injury.
Code
# Import gusto
<- read.csv("data/gusto_age_STE.csv")[, -1]
gusto
# Import sample4; n=785
<- read.csv("data/Gustos4Age.csv")
gustos
# Import TBI data; n=2159
<- read.csv("data/TBI2vars.csv") TBI
Fig 9.1: Age linear; add square; rcs in GUSTO-I
Code
<- lrm(DAY30 ~ AGE, data = gusto, x = T, y = T)
agegusto.linear <- lrm(DAY30 ~ pol(AGE, 2), data = gusto, x = T, y = T)
agegusto.square <- lrm(DAY30 ~ rcs(AGE, 5), data = gusto, x = T, y = T)
agegusto.rcs ## dichotomize
<- lrm(DAY30 ~ ifelse(AGE < 65, 0, 1), data = gusto, x = T, y = T)
agegusto.cat65 ## 3 categories
.3cat <- lrm(DAY30 ~ ifelse(AGE < 60, 0, ifelse(AGE < 70, 1, 2)), data = gusto, x = T, y = T)
agegusto
# Predict for age 20:95
<- data.frame("AGE" = seq(20, 95, by = 0.1))
newdata.age <- predict(agegusto.linear, newdata.age)
pred.agegusto.linear <- predict(agegusto.square, newdata.age)
pred.agegusto.square <- predict(agegusto.rcs, newdata.age)
pred.agegusto.rcs <- predict(agegusto.cat65, newdata.age)
pred.agegusto.cat65 .3cat <- predict(agegusto.3cat, newdata.age)
pred.agegusto
# Make plot
<- datadist(gusto)
dd options(datadist = "dd") # for rms
par(mfrow = c(1, 1))
plot(
x = newdata.age[, 1], y = pred.agegusto.linear, xlim = c(20, 92), ylim = c(-7, 0), las = 1, xaxt = "n",
xlab = "Age in years", ylab = "logit of 30-day mortality", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[2]
)axis(1, at = c(40, 50, 60, 65, 70, 80, 90))
lines(x = newdata.age[, 1], y = pred.agegusto.cat65, lty = 2, lwd = 2, col = mycolors[3])
lines(x = newdata.age[, 1], y = pred.agegusto.3cat, lty = 3, lwd = 2, col = mycolors[4])
scat1d(x = gusto$AGE, side = 1, frac = .05, col = "darkblue")
legend("topleft",
legend = c("linear", "<65 vs >=65", "<60, 60-69, 70+"),
lty = c(1, 2, 3), lwd = 2, cex = 1, bty = "n", col = mycolors[2:4]
)
Fig 9.2: Impact of number of ST elevations (STE)
Code
# winsorize at STE 10
$STE <- ifelse(gusto$STE > 10, 10, gusto$STE)
gusto$STE.f <- as.factor(gusto$STE)
gusto<- lrm(DAY30 ~ STE, data = gusto, x = T, y = T)
STE.linear <- lrm(DAY30 ~ pol(STE, 2), data = gusto, x = T, y = T)
STE.square <- lrm(DAY30 ~ rcs(STE, 5), data = gusto, x = T, y = T)
STE.rcs <- lrm(DAY30 ~ STE.f, data = gusto, x = T, y = T)
STE.factor
# dichotomize
<- lrm(DAY30 ~ ifelse(STE < 5, 0, 1), data = gusto, x = T, y = T)
STE.cat4
# predict for STE 0:10
<- data.frame("STE" = seq(0, 10, by = 1))
newdata.STE <- predict(STE.linear, newdata.STE)
pred.STE.linear <- predict(STE.square, newdata.STE)
pred.STE.square <- predict(STE.rcs, newdata.STE)
pred.STE.rcs <- predict(STE.cat4, newdata.STE)
pred.STE.cat4
par(mfrow = c(1, 1))
plot(
x = newdata.STE[, 1], y = pred.STE.linear, las = 1,
xlab = "Number of leads with ST elevation", ylab = "logit of 30-day mortality", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[2]
)lines(x = newdata.STE[, 1], y = pred.STE.square, lty = 2, lwd = 2, col = mycolors[3])
lines(x = newdata.STE[, 1], y = pred.STE.rcs, lty = 3, lwd = 3, col = mycolors[4])
lines(x = newdata.STE[1:5, 1], y = pred.STE.cat4[1:5], lty = 4, lwd = 2, col = mycolors[5])
lines(x = newdata.STE[6:11, 1], y = pred.STE.cat4[6:11], lty = 4, lwd = 2, col = mycolors[5])
# Original data points, with size proportional to sqrt(events)
<- log(by(gusto$DAY, gusto$STE, mean) / (1 - by(gusto$DAY, gusto$STE, mean)))[1:11]
STEmort <- sqrt(by(gusto$DAY, gusto$STE, sum)[1:11]) / 10
STEw points(x = newdata.STE[, 1], y = STEmort, pch = 1, lwd = 2, cex = STEw, col = "black")
scat1d(x = gusto$STE, side = 1, frac = .05, col = "darkblue")
legend("topleft",
legend = c("data", "linear", ".. +square", "rcs", "<=4, vs >4"),
lty = c(NA, 1, 2, 3, 4), pch = c(1, NA, NA, NA, NA), lwd = 2, cex = 1, bty = "n", col = mycolors[1:5]
)
Fig 9.3: Non-linearities in small sample (n=751); and full GUSTO-I (n=40,830)
Code
# Examine non-linearities in gustos sample
# Age
<- lrm(DAY30 ~ AGE, data = gustos, x = T, y = T, linear.predictors = F)
age.linear <- lrm(DAY30 ~ rcs(AGE, 5), data = gustos, x = T, y = T, linear.predictors = F)
age.rcs1 <- mfp(DAY30 ~ fp(AGE, df = 4), alpha = 1, data = gustos, family = binomial) # selected: -2 and 3
age.fp1 <- gam(DAY30 ~ s(AGE), data = gustos, family = binomial)
age.gam1
# examine predictions for age 20:95
<- matrix(nrow = 751, ncol = 5)
age.mat names(age.mat) <- list(NULL, Cs(AGE, linear, fp, rcs, gam))
<- seq(20, 95, by = 0.1)
AGE 1] <- AGE
age.mat[, 2] <- predict(age.linear, newdata = as.data.frame(x = AGE), type = "lp")
age.mat[, 3] <- predict(age.fp1, newdata = as.data.frame(x = AGE))
age.mat[, 4] <- predict(age.rcs1, newdata = as.data.frame(x = AGE), type = "lp")
age.mat[, 5] <- predict(age.gam1, newdata = as.data.frame(x = AGE))
age.mat[,
# Plot for n=785, Fig 9.3, part A #
par(mfrow = c(1, 2))
plot(
x = age.mat[, 1], y = age.mat[, 2], xlim = c(20, 92), ylim = c(-8, 0.2), las = 1, xaxt = "n",
xlab = "Age in years", ylab = "logit of 30-day mortality", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[2]
)axis(1, at = c(30, 40, 50, 60, 70, 80, 90))
lines(x = age.mat[, 1], y = age.mat[, 3], lty = 2, lwd = 2, col = mycolors[3])
lines(x = age.mat[, 1], y = age.mat[, 4], lty = 3, lwd = 3, col = mycolors[4])
lines(x = age.mat[, 1], y = age.mat[, 5], lty = 4, lwd = 3, col = mycolors[5])
histSpike(x = gustos$AGE, side = 1, frac = .1, col = "darkblue", add = T)
legend("topleft",
legend = c("linear", "fp", "rcs", "gam"),
lty = 1:4, lwd = 2, cex = 1.2, bty = "n", col = mycolors[2:5]
)title("GUSTO-I, n=785")
# End plot n=785
####################
## Now: N=40,830 ##
.2 <- lrm(DAY30 ~ AGE, data = gusto, x = T, y = T, linear.predictors = F)
age.linear.2 <- lrm(DAY30 ~ rcs(AGE, 5), data = gusto, x = T, y = T, linear.predictors = F)
age.rcs1.2 <- mfp(DAY30 ~ fp(AGE, df = 4), alpha = 1, data = gusto, family = binomial) # selected: -2 and 3
age.fp1.2 <- gam(DAY30 ~ s(AGE), data = gusto, family = binomial)
age.gam1
# examine predictions for age 20:95
.2 <- matrix(nrow = 751, ncol = 5)
age.matnames(age.mat.2) <- list(NULL, Cs(AGE, linear, fp, rcs, gam))
.2[, 1] <- AGE
age.mat.2[, 2] <- predict(age.linear.2, newdata = as.data.frame(x = AGE), type = "lp")
age.mat.2[, 3] <- predict(age.fp1.2, newdata = as.data.frame(x = AGE))
age.mat.2[, 4] <- predict(age.rcs1.2, newdata = as.data.frame(x = AGE), type = "lp")
age.mat.2[, 5] <- predict(age.gam1.2, newdata = as.data.frame(x = AGE))
age.mat
# Plot for n=40830
plot(
x = age.mat.2[, 1], y = age.mat.2[, 2], xlim = c(20, 92), ylim = c(-8, 0.2), las = 1, xaxt = "n",
xlab = "Age in years", ylab = "", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[2]
)axis(1, at = c(30, 40, 50, 60, 70, 80, 90))
lines(x = age.mat.2[, 1], y = age.mat.2[, 3], lty = 2, lwd = 2, col = mycolors[3])
lines(x = age.mat.2[, 1], y = age.mat.2[, 4], lty = 3, lwd = 3, col = mycolors[4])
lines(x = age.mat.2[, 1], y = age.mat.2[, 5], lty = 4, lwd = 3, col = mycolors[5])
histSpike(x = gusto$AGE, side = 1, frac = .2, col = "darkblue", add = T)
title("GUSTO-I, n=40,830")
Fig 9.4: Glucose and hb in TBI
We evaluate the predictors ‘glucose’ and ‘hemoglobin’ in TBI. Both are first truncated, after inspecting boxplots (Fig 9.4), and then analyzed for their prognostic value, with plots for illustration of the estimated relations (Fig 9.5).
Code
# glucose
quantile(TBI$glucose, probs = c(.005, .01, .02, .98, .99, .995), na.rm = T)
## 0.5% 1% 2% 98% 99% 99.5%
## 1.57 2.26 4.28 18.34 20.92 23.39
Code
# 0.5% 1% 2% 98% 99% 99.5%
# 1.57 2.26 4.28 18.34 20.92 23.39
# So, winsorize / truncate at 3 and 20
# use simple function to do the winsorizing:
# {ifelse(x<lower,upper, ifelse(x>upper,upper, x))}
<- function(x, lower = quantile(x, probs = 0.01), upper = quantile(x, probs = 0.99)) {
winsorize ifelse(x < lower, lower,
ifelse(x > upper, upper, x)
)
}$glucoset <- winsorize(TBI$glucose, 3, 20)
TBI
# systolic bp
quantile(TBI$hb, probs = c(.005, .01, .02, .98, .99, .995), na.rm = T)
## 0.5% 1% 2% 98% 99% 99.5%
## 4.87 5.40 6.30 16.50 16.80 17.00
Code
# 0.5% 1% 2% 98% 99% 99.5%
# 4.87 5.40 6.30 16.50 16.80 17.00
# So, winsorize / truncate at 6 and 17
$hbt <- winsorize(TBI$hb, 6, 17)
TBI
# boxplots for illustration
par(mfrow = c(1, 2))
boxplot(x = cbind(TBI$glucose, TBI$glucoset), outcol = c("red", mycolors[4]), border = mycolors[c(10, 4)], xaxt = "n", ylab = "glucose (mmol/L)")
axis(1, at = c(1, 2), labels = c("before ", "after winsorizing"))
title("Glucose")
boxplot(x = cbind(TBI$hb, TBI$hbt), outcol = c("red", mycolors[4]), border = mycolors[c(10, 4)], xaxt = "n", ylab = "hb (mmol/L)")
axis(1, at = c(1, 2), labels = c("before ", "after winsorizing"))
title("Hemoglobin")
Fig 9.6: Systolic blood pressure in TBI
We now examine the prognostic value of systolic blood pressure (BP) in TBI patients. We expect low BP (hypotension) to be especially risky.
Code
quantile(TBI$d.sysbpt, probs = c(.01, .25, .75, .99), na.rm = T)
## 1% 25% 75% 99%
## 92.9 121.1 142.1 171.8
Code
<- TBI[!is.na(TBI$d.sysbpt), ]
TBIs # 1% 25% 75% 99%
# 92.9 121.1 142.1 171.8
<- lrm(d.gos < 2 ~ rcs(d.sysbpt, 3), data = TBIs)
g1 <- lrm(d.gos < 3 ~ rcs(d.sysbpt, 3), data = TBIs)
g2 <- lrm(d.gos < 4 ~ rcs(d.sysbpt, 3), data = TBIs)
g3 <- lrm(d.gos < 5 ~ rcs(d.sysbpt, 3), data = TBIs)
g4
# Define categorical variants of systolic BP
$d.sysbpt.c <- as.factor(ifelse(TBIs$d.sysbpt < 120, 1, ifelse(TBIs$d.sysbpt > 150, 2, 0)))
TBIs
<- lrm(d.gos < 2 ~ d.sysbpt.c, data = TBIs)
g1t <- lrm(d.gos < 3 ~ d.sysbpt.c, data = TBIs)
g2t <- lrm(d.gos < 4 ~ d.sysbpt.c, data = TBIs)
g3t <- lrm(d.gos < 5 ~ d.sysbpt.c, data = TBIs)
g4t
<- datadist(TBIs)
dd options(datadist = "dd")
# Odds ratios
exp(coef(g1t)) # OR 2.78 for low BP and 1.25 for high BP
## Intercept d.sysbpt.c=1 d.sysbpt.c=2
## 0.226 2.777 1.245
Code
describe(TBI$d.sysbpt)
## TBI$d.sysbpt
## n missing distinct Info Mean Gmd .05 .10 .25 .50 .75
## 2159 0 1566 1 131.4 17.61 107.0 112.2 121.1 131.3 142.1
## .90 .95
## 151.1 156.2
##
## lowest : 60.0 64.2 65.6 71.3 73.7, highest: 182.9 184.2 184.7 188.3 207.4
Code
describe(TBIs$d.sysbpt.c)
## TBIs$d.sysbpt.c
## n missing distinct
## 2159 0 3
##
## Value 0 1 2
## Frequency 1435 474 250
## Proportion 0.665 0.220 0.116
Code
# data in matrix for plot
<- seq(60, 210, by = 1) # 151 elements
d.sysbpt <- matrix(nrow = 151, ncol = 5)
g.mat names(g.mat) <- list(NULL, Cs(d.sysbpt, g1, g2, g3, g4))
1] <- d.sysbpt
g.mat[, 2] <- predict(g1, newdata = as.data.frame(x = d.sysbpt))
g.mat[, 3] <- predict(g2, newdata = as.data.frame(x = d.sysbpt))
g.mat[, 4] <- predict(g3, newdata = as.data.frame(x = d.sysbpt))
g.mat[, 5] <- predict(g4, newdata = as.data.frame(x = d.sysbpt))
g.mat[,
# Plot: Fig 9.6, part I
par(mfrow = c(1, 2))
plot(
x = g.mat[, 1], y = g.mat[, 2], xlim = c(50, 210), ylim = c(-2.7, 2.7), las = 1, xaxt = "n",
xlab = "Systolic blood pressure (mmHg)", ylab = "logit GOS at 6 months", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[1]
)axis(1, at = c(100, 150, 200))
lines(x = g.mat[, 1], y = g.mat[, 3], lty = 2, lwd = 2, col = mycolors[2])
lines(x = g.mat[, 1], y = g.mat[, 4], lty = 3, lwd = 3, col = mycolors[3])
lines(x = g.mat[, 1], y = g.mat[, 5], lty = 4, lwd = 3, col = mycolors[4])
histSpike(x = TBI$d.sysbpt, side = 3, frac = .2, col = "darkblue", add = T)
legend("bottomleft", legend = c("<good recovery", "unfavorable", "mort+veg", "mortality"), lty = 4:1, lwd = 2, cex = 0.8, bty = "n", col = mycolors[4:1])
title("continuous, rcs 3 knots")
## Plot 9.6, part II
<- c(0, 1, 2)
d.sysbpt.c <- seq(60, 210, by = .1) # 1501 elements
d.sysbpt <- matrix(nrow = 1501, ncol = 5)
g.mat2 <- predict(g1t, newdata = as.data.frame(x = d.sysbpt.c))
g1t.c <- predict(g2t, newdata = as.data.frame(x = d.sysbpt.c))
g2t.c <- predict(g3t, newdata = as.data.frame(x = d.sysbpt.c))
g3t.c <- predict(g4t, newdata = as.data.frame(x = d.sysbpt.c))
g4t.c
names(g.mat2) <- list(NULL, Cs(d.sysbpt, g1, g2, g3, g4))
1] <- d.sysbpt
g.mat2[, 2] <- ifelse(d.sysbpt < 120, g1t.c[2], ifelse(d.sysbpt > 150, g1t.c[3], g1t.c[1]))
g.mat2[, 3] <- ifelse(d.sysbpt < 120, g2t.c[2], ifelse(d.sysbpt > 150, g2t.c[3], g2t.c[1]))
g.mat2[, 4] <- ifelse(d.sysbpt < 120, g3t.c[2], ifelse(d.sysbpt > 150, g3t.c[3], g3t.c[1]))
g.mat2[, 5] <- ifelse(d.sysbpt < 120, g4t.c[2], ifelse(d.sysbpt > 150, g4t.c[3], g4t.c[1]))
g.mat2[,
# Plot
plot(
x = g.mat2[, 1], y = g.mat2[, 2], xlim = c(50, 210), ylim = c(-2.7, 2.7), las = 1, xaxt = "n",
xlab = "Systolic blood pressure (mmHg)", ylab = "", cex.lab = 1.2, type = "l", lwd = 2, col = mycolors[1]
)axis(1, at = c(100, 150, 200))
lines(x = g.mat2[, 1], y = g.mat2[, 3], lty = 2, lwd = 2, col = mycolors[2])
lines(x = g.mat2[, 1], y = g.mat2[, 4], lty = 3, lwd = 3, col = mycolors[3])
lines(x = g.mat2[, 1], y = g.mat2[, 5], lty = 4, lwd = 3, col = mycolors[4])
histSpike(x = TBI$d.sysbpt, side = 3, frac = .2, col = "darkblue", add = T)
title("3 categories")