Causal Data Science for Business Analytics
Hamburg University of Technology
Monday, 24 June 2024
Conditional Independence / Unconfoundedness
The assumption is not testable:\[ \begin{align*} \tau_{\text{ATE}} &= \mathbb{E}[Y_i(1)] - \mathbb{E}[Y_i(0)] \\ &= \mathbb{E}_{\mathbf{X, U}}[\mathbb{E}[Y_i|T_i=1, \mathbf{X_i, U_i}] - \mathbb{E}[Y_i|T_i=0, \mathbf{X_i, U_i}]] \\ & \color{#FF7E15}{\stackrel{?}{\approx}} \mathbb{E}_{\mathbf{X}}[\mathbb{E}[Y_i|T_i=1, \mathbf{X_i}] - \mathbb{E}[Y_i|T_i=0, \mathbf{X_i}]] \end{align*} \]
Under unconfoundedness, adjustment yields a point estimate of the ATE.

Partial Identification is a method to estimate the ATE under weaker assumptions, yielding a set estimate - an interval with upper and lower bounds.

Start with the observational-counterfactual decomposition of the ATE: \[
\begin{align*}
\mathbb{E}[Y_i(1) - Y_i(0)] &= \mathbb{E}[Y_i(1)] - \mathbb{E}[Y_i(0)] \\
&= P(T_i=1)\color{#00C1D4}{\mathbb{E}[Y_i(1)|T_i=1]} + P(T_i=0)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - P(T_i=1)\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - P(T_i=0)\color{#00C1D4}{\mathbb{E}[Y_i(0)|T_i=0]}\\
&= P(T_i=1)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + P(T_i=0)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - P(T_i=1)\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - P(T_i=0)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
&:= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - p\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
\end{align*}
\]

Under the Monotone Treatment Response (MTR) assumption \(Y_i(1) \geq Y_i(0)\), use the observational-counterfactual decomposition to derive a lower bound of the ATE: \[
\begin{align*}
\mathbb{E}[Y_i(1) - Y_i(0)] &= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - p\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
& \geq p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i|T_i=0]} - p\color{#FF7E15}{\mathbb{E}[Y_i|T_i=1]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]} \\
&= 0
\end{align*}
\]

Under the Monotone Treatment Selection (MTS) assumption, use the observational-counterfactual decomposition of the ATE and replace the counterfactual terms: \[
\begin{align*}
\mathbb{E}[Y_i(1) - Y_i(0)] &= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - p\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
&\leq p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=1]} - p\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=0]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
&= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} - p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
&= \color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} - \color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\
\end{align*}
\]

Under the Optimal Treatment Selection assumption, bound the remaining counterfactuals using observed outcome means and the outcome lower bound \(y^{LB}\) (the OTS 2 bounds in the code below). Upper bound:\[ \begin{align*} \mathbb{E}[Y_i(1) - Y_i(0)] &= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#FF7E15}{\mathbb{E}[Y_i(1)|T_i=0]} - p\color{#FF7E15}{\mathbb{E}[Y_i(0)|T_i=1]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\ &\leq p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} - p\color{#00C1D4}{y^{LB}} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\ &= \color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} - p\color{#00C1D4}{y^{LB}} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\ \end{align*} \] Lower bound: \[ \begin{align*} \mathbb{E}[Y_i(1) - Y_i(0)] &\geq p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#00C1D4}{y^{LB}} - p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]} - (1-p)\color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\ &= p\color{#00C1D4}{\mathbb{E}[Y_i|T_i=1]} + (1-p)\color{#00C1D4}{y^{LB}} - \color{#00C1D4}{\mathbb{E}[Y_i|T_i=0]}\\ \end{align*} \]
library(hdm) # for the data
library(drgee) # for doubly robust estimator
data(pension) # Get data
Y = pension$net_tfa # Outcome
T = pension$p401 # Treatment
X = cbind(pension$age,pension$db,pension$educ,pension$fsize,pension$hown,
pension$inc,pension$male,pension$marr,pension$pira,pension$twoearn) # covariates
dr = drgee(oformula = formula(Y ~ X), eformula = formula(T ~ X), elink="logit") # DR reg
ATE <- as.numeric(dr$coefficients) # ATE
p = mean(T) # P(T=1): marginal probability of treatment
ymin = as.numeric(quantile(Y, probs = 0.05)) # outcome lower bound
ymax = as.numeric(quantile(Y, probs = 0.95)) # outcome upper bound
Y1 = mean(Y[T == 1]) # outcome mean for treated
Y0 = mean(Y[T == 0]) # outcome mean for untreated
# No-assumption (worst-case) bounds
UB = p*Y1 + (1-p)*ymax - p*ymin - (1-p)*Y0
LB = p*Y1 + (1-p)*ymin - p*ymax - (1-p)*Y0
L = UB - LB
cat(sprintf("LowerBound (worst) = %d, ATE = %d, UpperBound (worst) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
# Monotone Treatment Response (MTR) bounds: only the lower bound improves
UB = p*Y1 + (1-p)*ymax - p*ymin - (1-p)*Y0
LB = 0
L = UB - LB
cat(sprintf("LowerBound (MTR) = %d, ATE = %d, UpperBound (worst) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
# Monotone Treatment Selection (MTS) bounds: keeps the MTR lower bound
UB = Y1 - Y0
L = UB - LB
cat(sprintf("LowerBound (MTR) = %d, ATE = %d, UpperBound (MTS) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
# Optimal Treatment Selection 1 (OTS 1) bounds
UB = p*Y1 - p*ymin
LB = (1-p)*ymin - (1-p)*Y0
L = UB - LB
cat(sprintf("LowerBound (OTS 1) = %d, ATE = %d, UpperBound (OTS 1) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
# Optimal Treatment Selection 2 (OTS 2) bounds
UB = Y1 - p*ymin - (1-p)*Y0
LB = p*Y1 + (1-p)*ymin - Y0
L = UB - LB
cat(sprintf("LowerBound (OTS 2) = %d, ATE = %d, UpperBound (OTS 2) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
# Mix: OTS 1 upper bound with OTS 2 lower bound
UB = p*Y1 - p*ymin
LB = p*Y1 + (1-p)*ymin - Y0
L = UB - LB
cat(sprintf("LowerBound (OTS 2) = %d, ATE = %d, UpperBound (OTS 1) = %d, IntervalLength = %d", round(LB), round(ATE), round(UB), round(L)))
LowerBound (worst) = -28746, ATE = 11333, UpperBound (worst) = 72253, IntervalLength = 100999
LowerBound (MTR) = 0, ATE = 11333, UpperBound (worst) = 72253, IntervalLength = 72253
LowerBound (MTR) = 0, ATE = 11333, UpperBound (MTS) = 27372, IntervalLength = 27372
LowerBound (OTS 1) = -14687, ATE = 11333, UpperBound (OTS 1) = 12365, IntervalLength = 27052
LowerBound (OTS 2) = -7526, ATE = 11333, UpperBound (OTS 2) = 32575, IntervalLength = 40101
LowerBound (OTS 2) = -7526, ATE = 11333, UpperBound (OTS 1) = 12365, IntervalLength = 19890
Apply the Frisch-Waugh-Lovell theorem to the above models to partial out the observed covariates \(\mathbf{X_i}\):
\[ \begin{align*} \tilde{\tau} &= \frac{Cov(Y_i - \mathbb{E}(Y_i|\mathbf{X_i}),\, T_i - \mathbb{E}(T_i|\mathbf{X_i}))}{Var(T_i - \mathbb{E}(T_i|\mathbf{X_i}))} = \frac{Cov(\tau(T_i - \mathbb{E}(T_i|\mathbf{X_i})) + \gamma(U_i - \mathbb{E}(U_i|\mathbf{X_i})) + \epsilon_{Y_i},\, T_i - \mathbb{E}(T_i|\mathbf{X_i}))}{Var(T_i - \mathbb{E}(T_i|\mathbf{X_i}))} \\ &= \tau \underbrace{\frac{Cov(T_i - \mathbb{E}(T_i|\mathbf{X_i}),\, T_i - \mathbb{E}(T_i|\mathbf{X_i}))}{Var(T_i - \mathbb{E}(T_i|\mathbf{X_i}))}}_{=1} + \gamma \underbrace{\frac{Cov(U_i - \mathbb{E}(U_i|\mathbf{X_i}),\, T_i - \mathbb{E}(T_i|\mathbf{X_i}))}{Var(T_i - \mathbb{E}(T_i|\mathbf{X_i}))}}_{:=\delta} + \underbrace{\frac{Cov(\epsilon_{Y_i},\, T_i - \mathbb{E}(T_i|\mathbf{X_i}))}{Var(T_i - \mathbb{E}(T_i|\mathbf{X_i}))}}_{=0} = \tau + \color{#FF7E15}{\underbrace{\gamma \delta}_{\text{Bias}}} \end{align*} \]
\(\gamma\): impact of the unobserved confounder \(U_i\) on the outcome \(Y_i\).
\(\delta\): imbalance in the unobserved confounder \(U_i\) across values of \(T_i\).
The bias \(\gamma \delta\) is the confounder's impact on the outcome times its imbalance across treatment levels.
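To make the formula concrete, here is a minimal simulation sketch (all data-generating values are hypothetical, not from the lecture) that recovers \(\tilde{\tau} = \tau + \gamma \delta\) by partialling out \(X\):
set.seed(1) # hypothetical example
n = 10000
X = rnorm(n) # observed covariate
U = 0.5*X + rnorm(n) # unobserved confounder
T = 0.8*X + 0.6*U + rnorm(n) # treatment depends on X and U
tau = 2; gamma = 3 # true effect and confounder impact
Y = tau*T + gamma*U + X + rnorm(n) # outcome
Y_res = resid(lm(Y ~ X)) # partial out X (FWL)
T_res = resid(lm(T ~ X))
U_res = resid(lm(U ~ X))
tau_tilde = cov(Y_res, T_res)/var(T_res) # contaminated estimate
delta = cov(U_res, T_res)/var(T_res) # imbalance of U across T
c(tau_tilde, tau + gamma*delta) # agree up to sampling noise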
Question: How strong does the bias of an unobserved confounder have to be to invalidate the treatment effect estimate?

Contour plot of the bias \(\gamma \delta\) as a function of \(\gamma\) and \(\delta\). Hypothetical example: estimated treatment effect unadjusted for the unobserved confounder \(\tilde{\tau} = 25\). Levels of bias (contours) diminish the estimated \(\tilde{\tau}\), implying different levels of \(\tau\).
Benchmark covariates \(X_b\) for comparison.
Sensitivity can be expressed in scale-free partial \(R^2\) measures:

Robustness Value (RV): the minimum strength of association that \(\mathbf{U_i}\) must have with both \(T_i\) and \(Y_i\) to explain away the estimated \(\tilde{\tau}\).

Key Advantages:
- No assumptions on the functional form of the treatment mechanism or the distribution of unobserved confounders.
- Handles multiple confounders that may interact with the treatment and outcome in non-linear ways.
- Benchmarks the strength of confounders based on comparisons with observed covariates.
Implementation: R package “sensemakr”.
Riesz representers are used to derive influence functions for causal parameters in non-parametric settings. Both the treatment mechanism and the outcome mechanism can be modeled with arbitrary machine learning models.

library(hdm) # for the data
library(sensemakr) # load sensemakr package
data(pension) # Get data
# runs conditional outcome regression model
model <- lm(net_tfa ~ p401 + age + db + educ + fsize + hown + inc +
male + marr + pira + twoearn, data = pension)
# runs sensemakr for sensitivity analysis
sensitivity <- sensemakr(model = model, treatment = "p401",
benchmark_covariates = c("inc"), kd=1:3)
# plot
# plot(sensitivity)
# short description of results
sensitivity
Sensitivity Analysis to Unobserved Confounding
Model Formula: net_tfa ~ p401 + age + db + educ + fsize + hown + inc + male +
marr + pira + twoearn
Null hypothesis: q = 1 and reduce = TRUE
Unadjusted Estimates of ' p401 ':
Coef. estimate: 11590.38
Standard Error: 1345.253
t-value: 8.61577
Sensitivity Statistics:
Partial R2 of treatment with outcome: 0.00744
Robustness Value, q = 1 : 0.08291
Robustness Value, q = 1 alpha = 0.05 : 0.06468
For more information, check summary.
Assumptions:
- Parametric assumption (i.e. linearity): homogeneous treatment effect.
- Monotonicity assumption: identifies the Local Average Treatment Effect (LATE) instead of the ATE.

Examples of instruments:
- Lotteries - purely random.
- Encouragement designs: random assignment of invitations or incentives to participate in a program.
- Natural Experiments - as-good-as-random.
Assume the linear outcome model with exclusion restriction \(Y_i = \beta_0 + \tau T_i + \beta_U U_i + \epsilon_i\) and derive the Wald Estimand for the treatment effect in this IV setting in the following steps: substitute the outcome model, apply the instrumental unconfoundedness assumption and rearrange, then apply the instrumental unconfoundedness assumption again.\[ \begin{align*} \mathbb{E}[Y_i | Z_i = 1] &- \mathbb{E}[Y_i | Z_i = 0] = \mathbb{E}[\beta_0 + \tau T_i + \beta_U U_i + \epsilon_i | Z_i = 1] - \mathbb{E}[\beta_0 + \tau T_i + \beta_U U_i + \epsilon_i | Z_i = 0] \\ &= \beta_0 + \tau \mathbb{E}[T_i | Z_i = 1] + \beta_U \mathbb{E}[U_i | Z_i = 1] + \mathbb{E}[\epsilon_i | Z_i = 1] - \beta_0 - \tau \mathbb{E}[T_i | Z_i = 0] - \beta_U \mathbb{E}[U_i | Z_i = 0] - \mathbb{E}[\epsilon_i | Z_i = 0] \\ &= \tau (\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]) + \beta_U (\mathbb{E}[U_i | Z_i = 1] - \mathbb{E}[U_i | Z_i = 0]) \\ &= \tau (\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]) + \beta_U (\mathbb{E}[U_i] - \mathbb{E}[U_i]) = \tau (\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]) \end{align*} \]
Wald Estimand:\[ \tau = \frac{\mathbb{E}[Y_i | Z_i = 1] - \mathbb{E}[Y_i | Z_i = 0]}{\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]} \]
Wald Estimator:\[ \hat{\tau} = \frac{\frac{\sum_{i=1}^n Y_i Z_i}{\sum_{i=1}^n Z_i} - \frac{\sum_{i=1}^n Y_i (1 - Z_i)}{\sum_{i=1}^n (1 - Z_i)}}{\frac{\sum_{i=1}^n T_i Z_i}{\sum_{i=1}^n Z_i} - \frac{\sum_{i=1}^n T_i (1 - Z_i)}{\sum_{i=1}^n (1 - Z_i)}} \]
The relevance assumption ensures that the denominator is not zero.

Alternatively, start again from the exclusion restriction for \(Z_i\): \(Y_i = \beta_0 + \tau T_i + \beta_U U_i + \epsilon_i\), and derive the covariance form of the Wald Estimand
for the treatment effect in this IV setting in the following steps:
Apply the instrumental unconfoundedness assumption and rearrange; then apply the instrumental unconfoundedness assumption again.\[ \begin{align*} Cov(Y_i, Z_i) &= \mathbb{E}[Y_i Z_i] - \mathbb{E}[Y_i] \mathbb{E}[Z_i] = \mathbb{E}[(\beta_0 + \tau T_i + \beta_U U_i + \epsilon_i) Z_i] - \mathbb{E}[\beta_0 + \tau T_i + \beta_U U_i + \epsilon_i] \mathbb{E}[Z_i] \\ &= \beta_0\mathbb{E}[Z_i] + \tau\mathbb{E}[T_iZ_i] + \beta_U \mathbb{E}[U_iZ_i] + \mathbb{E}[\epsilon_iZ_i] - \beta_0\mathbb{E}[Z_i] - \tau\mathbb{E}[T_i]\mathbb{E}[Z_i] - \beta_U\mathbb{E}[U_i]\mathbb{E}[Z_i] - \mathbb{E}[\epsilon_i]\mathbb{E}[Z_i]\\ &= \tau\mathbb{E}[T_iZ_i] + \beta_U \mathbb{E}[U_iZ_i] - \tau\mathbb{E}[T_i]\mathbb{E}[Z_i] - \beta_U\mathbb{E}[U_i]\mathbb{E}[Z_i] = \tau(\mathbb{E}[T_iZ_i] - \mathbb{E}[T_i]\mathbb{E}[Z_i]) + \beta_U (\mathbb{E}[U_iZ_i] - \mathbb{E}[U_i]\mathbb{E}[Z_i])\\ &= \tau Cov(T_i, Z_i) + \beta_U Cov(U_i, Z_i) = \tau Cov(T_i, Z_i) \end{align*} \]
Wald Estimand:\[ \tau = \frac{Cov(Y_i, Z_i)}{Cov(T_i, Z_i)} \]
Wald Estimator:\[ \hat{\tau} = \frac{\frac{1}{n}\sum_{i=1}^n Y_i Z_i - \bar{Y} \bar{Z}}{\frac{1}{n}\sum_{i=1}^n T_i Z_i - \bar{T} \bar{Z}} \]
The relevance assumption ensures that the denominator is not zero.
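As a quick numerical check, the covariance form can be evaluated directly on the pension data used throughout this lecture (a minimal sketch mirroring the variable choices of the later code):
library(hdm) # for the data
data(pension) # Get data
Y = pension$net_tfa # Outcome
Z = pension$e401 # Instrument
T = pension$p401 # Treatment
cov(Y, Z)/cov(T, Z) # Wald estimator in covariance form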
Two-Stage Least Squares (2SLS):
- First Stage: Linearly regress \(T_i\) on \(Z_i\) to estimate \(\mathbb{E}[T_i|Z_i]\). This gives the projection of \(T_i\) onto \(Z_i\): \(\hat{T}_i\).
- Second Stage: Linearly regress \(Y_i\) on \(\hat{T}_i\) to estimate \(\mathbb{E}[Y_i|\hat{T}_i]\). Obtain the estimate \(\hat{\tau}\) as the fitted coefficient of \(\hat{T}_i\) (see the manual sketch below).
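A minimal manual sketch of the two stages on the pension data (point estimate only; the naive second-stage standard errors are invalid, which is why ivreg is used further below):
library(hdm) # for the data
data(pension) # Get data
Y = pension$net_tfa # Outcome
Z = pension$e401 # Instrument
T = pension$p401 # Treatment
stage1 = lm(T ~ Z) # 1st stage: projection of T onto Z
T_hat = fitted(stage1) # fitted values T_hat
stage2 = lm(Y ~ T_hat) # 2nd stage: regress Y on T_hat
coef(stage2)["T_hat"] # 2SLS point estimate (matches ivreg below)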
- Compliers always take the treatment that they’re encouraged to take: \(T_i(1) = 1\) and \(T_i(0) = 0\).
- Always-Takers always take the treatment, regardless of encouragement: \(T_i(1) = 1\) and \(T_i(0) = 1\).
- Never-Takers never take the treatment, regardless of encouragement: \(T_i(1) = 0\) and \(T_i(0) = 0\).
- Defiers always take the opposite treatment that they’re encouraged to take: \(T_i(1) = 0\) and \(T_i(0) = 1\).

Definition: “Local Average Treatment Effect (LATE) / Complier Average Causal Effect (CACE)”
\(\mathbb{E}[Y_i(T_i=1) - Y_i(T_i=0) | T_i(Z_i=1) = 1, T_i(Z_i=0) = 0]\)
Assumption: “Monotonicity”
\(\forall i, T_i(Z_i=1) \geq T_i(Z_i=0)\)
Theorem: “LATE Nonparametric Identification”
\(\mathbb{E}[Y_i(1) - Y_i(0) | T_i(1) = 1, T_i(0) = 0] = \frac{\mathbb{E}[Y_i | Z_i = 1] - \mathbb{E}[Y_i | Z_i = 0]}{\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]}\)
The numerator is the Intention-to-Treat (ITT) Effect; the denominator is the 1st-stage effect or Complier Share.

Proof sketch - decompose the ITT effect by compliance type:\[ \begin{align*} \mathbb{E}[Y_i(Z_i=1) - Y_i(Z_i=0)] &= \mathbb{E}[Y_i(Z_i = 1) - Y_i(Z_i = 0) \mid T_i(1) = 1, T_i(0) = 0]P(T_i(1) = 1, T_i(0) = 0) \quad \text{(compliers)} \\ &+ \mathbb{E}[Y_i(Z_i = 1) - Y_i(Z_i = 0) \mid T_i(1) = 0, T_i(0) = 1]P(T_i(1) = 0, T_i(0) = 1) \quad \text{(defiers)}\\ &+ \mathbb{E}[Y_i(Z_i = 1) - Y_i(Z_i = 0) \mid T_i(1) = 1, T_i(0) = 1]P(T_i(1) = 1, T_i(0) = 1) \quad \text{(always-takers)}\\ &+ \mathbb{E}[Y_i(Z_i = 1) - Y_i(Z_i = 0) \mid T_i(1) = 0, T_i(0) = 0]P(T_i(1) = 0, T_i(0) = 0) \quad \text{(never-takers)}\\ \end{align*} \]

Monotonicity rules out defiers, and the instrument does not change treatment status for always-takers and never-takers, so only the complier term remains. Solving for it:
\[ \begin{align*} \mathbb{E}[Y_i(Z_i = 1) - Y_i(Z_i = 0) \mid T_i(1) = 1, T_i(0) = 0] = \frac{\mathbb{E}[Y_i(Z_i=1) - Y_i(Z_i=0)]}{P(T_i(1) = 1, T_i(0) = 0)} \end{align*} \]
\[ \begin{align*} \mathbb{E}[Y_i(T_i = 1) - Y_i(T_i = 0) \mid T_i(1) = 1, T_i(0) = 0] = \frac{\mathbb{E}[Y_i(Z_i=1) - Y_i(Z_i=0)]}{P(T_i(1) = 1, T_i(0) = 0)} = \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{P(T_i(1) = 1, T_i(0) = 0)} \end{align*} \]
\[ \begin{align*} \mathbb{E}[Y_i(T_i = 1) - Y_i(T_i = 0) \mid T_i(1) = 1, T_i(0) = 0] &= \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{P(T_i(1) = 1, T_i(0) = 0)} \\ &= \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{1 - P(T_i = 0 | Z_i = 1) - P(T_i = 1 | Z_i = 0)} \\ \end{align*} \]
\[ \begin{align*} \mathbb{E}[Y_i(T_i = 1) - Y_i(T_i = 0) \mid T_i(1) = 1, T_i(0) = 0] &= \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{1 - (1 - P(T_i = 1 | Z_i = 1)) - P(T_i = 1 | Z_i = 0)} \\ &= \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{P(T_i = 1 | Z_i = 1) - P(T_i = 1 | Z_i = 0)} \\ \end{align*} \]
\[ \begin{align*} \mathbb{E}[Y_i(T_i = 1) - Y_i(T_i = 0) \mid T_i(1) = 1, T_i(0) = 0] = \frac{\mathbb{E}[Y_i|Z_i=1] - \mathbb{E}[Y_i|Z_i=0]}{\mathbb{E}[T_i | Z_i = 1] - \mathbb{E}[T_i | Z_i = 0]} \\ \end{align*} \]
library(hdm) # for the data
data(pension) # Get data
Y = pension$net_tfa # Outcome
Z = pension$e401 # Instrument
T = pension$p401 # Treatment
ITT=mean(Y[Z==1])-mean(Y[Z==0]) # estimate intention-to-treat effect (ITT)
first=mean(T[Z==1])-mean(T[Z==0]) # estimate first stage effect (complier share)
LATE=ITT/first # compute LATE
ITT; first; LATE # show ITT, first stage effect, and LATE
[1] 19559.34
[1] 0.7045084
[1] 27763.11
library(hdm) # for the data
library(AER) # load AER package for ivreg (2SLS)
data(pension) # Get data
Y = pension$net_tfa # Outcome
Z = pension$e401 # Instrument
T = pension$p401 # Treatment
LATE=ivreg(Y~T|Z) # run two stage least squares regression
summary(LATE,vcov = vcovHC) # results with heteroscedasticity-robust se
Call:
ivreg(formula = Y ~ T | Z)
Residuals:
Min 1Q Median 3Q Max
-513090 -15011 -10788 -1624 1498247
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10788.0 690.6 15.62 <2e-16 ***
T 27763.1 1985.4 13.98 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 62380 on 9913 degrees of freedom
Multiple R-Squared: 0.03586, Adjusted R-squared: 0.03577
Wald test: 195.5 on 1 and 9913 DF, p-value: < 2.2e-16
In the partially linear IV model, \(T_i\) is additively separable and we require conditional unconfoundedness of the instrument \(Z_i\): \[\begin{align}\begin{aligned}Y_i = \tau T_i + g(\mathbf{X_i}) + \epsilon_{Y_i}, & &\mathbb{E}(\epsilon_{Y_i} | Z_i,\mathbf{X_i}) = 0 \\Z_i = h(\mathbf{X_i}) + \epsilon_{Z_i}, & &\mathbb{E}(\epsilon_{Z_i} | \mathbf{X_i}) = 0\end{aligned}\end{align}\]
Robinson (1988)-style / partialling-out version of the Wald estimand:
\[\tau = \frac{\mathbb{E}[(Y_i - \mu(\mathbf{X_i})) (Z_i - h(\mathbf{X_i}))]}{\mathbb{E}[(T_i - e(\mathbf{X_i})) (Z_i - h(\mathbf{X_i}))]} = \frac{\text{Cov}[(Y_i - \mu(\mathbf{X_i})), (Z_i - h(\mathbf{X_i}))]}{\text{Cov}[(T_i - e(\mathbf{X_i})), (Z_i - h(\mathbf{X_i}))]}\]
Nuisance parameters:
\(\quad \mu(\mathbf{X_i}) = \mathbb{E}[Y_i \mid \mathbf{X_i}] \quad \quad \quad e(\mathbf{X_i}) = \mathbb{E}[T_i \mid \mathbf{X_i}] \quad \quad \quad h(\mathbf{X_i}) = \mathbb{E}[Z_i \mid \mathbf{X_i}]\)

Rearranging yields the moment condition of a Neyman-orthogonal score with the estimand as solution:\[ \begin{align} \mathbb{E} [ ( Y_i - \mu(\mathbf{X_i}) - \tau (T_i - e(\mathbf{X_i})) ) (Z_i - h(\mathbf{X_i})) ] &= 0 \\ \mathbb{E} \left[ (Y_i - \mu(\mathbf{X_i}))(Z_i - h(\mathbf{X_i})) - \tau (T_i - e(\mathbf{X_i}))(Z_i - h(\mathbf{X_i})) \right] &= 0 \\ \tau \mathbb{E} [ \underbrace{(-1)(T_i - e(\mathbf{X_i}))(Z_i - h(\mathbf{X_i}))}_{\psi_a} ] + \mathbb{E} [ \underbrace{(Y_i - \mu(\mathbf{X_i}))(Z_i - h(\mathbf{X_i}))}_{\psi_b} ] &= 0 \end{align} \]
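A sketch of estimating this partially linear IV model with DoubleML on the same data (this block is not from the lecture; the argument names ml_l, ml_m, ml_r for \(\mu(\mathbf{X_i})\), \(h(\mathbf{X_i})\), and \(e(\mathbf{X_i})\) follow recent DoubleML releases and should be checked against the installed version):
library(DoubleML) # double machine learning
library(mlr3) # ML framework
library(mlr3learners) # learners
library(data.table)
lgr::get_logger("mlr3")$set_threshold("warn") # suppress messages
data = fetch_401k(return_type = "data.table", instrument = TRUE)
features_base = c("age", "inc", "educ", "fsize", "marr", "twoearn", "db", "pira", "hown")
data_dml = DoubleMLData$new(data, y_col = "net_tfa", d_cols = "p401",
                            x_cols = features_base, z_cols = "e401")
set.seed(123)
dml_pliv = DoubleMLPLIV$new(data_dml,
                            ml_l = lrn("regr.ranger"), # mu(X) = E[Y | X]
                            ml_m = lrn("regr.ranger"), # h(X) = E[Z | X]
                            ml_r = lrn("regr.ranger"), # e(X) = E[T | X]
                            n_folds = 3)
dml_pliv$fit()
dml_pliv$summary()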
In the interactive IV model, we do not require additive separability of \(T_i\) and only need conditional unconfoundedness of \(Z_i\): \[\begin{align}\begin{aligned}Y_i = g(T_i, \mathbf{X_i}) + \epsilon_{Y_i}, & &\mathbb{E}(\epsilon_{Y_i} | Z_i,\mathbf{X_i}) = 0 \\Z_i = h(\mathbf{X_i}) + \epsilon_{Z_i}, & &\mathbb{E}(\epsilon_{Z_i} | \mathbf{X_i}) = 0\end{aligned}\end{align}\]

The LATE estimand:\[\tau_{\text{LATE}} = \frac{\mathbb{E}\left[\mu(1, \mathbf{X_i}) - \mu(0, \mathbf{X_i}) + \frac{Z_i(Y_i - \mu(1, \mathbf{X_i}))}{h(\mathbf{X_i})} - \frac{(1-Z_i)(Y_i - \mu(0, \mathbf{X_i}))}{1-h(\mathbf{X_i})} \right]}{\mathbb{E}\left[e(1, \mathbf{X_i}) - e(0, \mathbf{X_i}) + \frac{Z_i(T_i - e(1, \mathbf{X_i}))}{h(\mathbf{X_i})} - \frac{(1-Z_i)(T_i - e(0, \mathbf{X_i}))}{1-h(\mathbf{X_i})} \right]}\]
Nuisance parameters:
\(\quad \mu(Z_i, \mathbf{X_i}) = \mathbb{E}[Y_i \mid Z_i, \mathbf{X_i}] \quad \quad \quad e(Z_i, \mathbf{X_i}) = P[T_i = 1 \mid Z_i, \mathbf{X_i}] \quad \quad \quad h(\mathbf{X_i}) = P[Z_i = 1 \mid \mathbf{X_i}]\)

Rearranging yields the moment condition of a Neyman-orthogonal score with the estimand as solution:\[ \begin{align} \mathbb{E}[\psi_b] + \mathbb{E}[\psi_a] \cdot \tau_{\text{LATE}} = &\mathbb{E}\bigg[\mu(1, \mathbf{X_i}) - \mu(0, \mathbf{X_i}) + \frac{Z_i(Y_i - \mu(1, \mathbf{X_i}))}{h(\mathbf{X_i})} - \frac{(1-Z_i)(Y_i - \mu(0, \mathbf{X_i}))}{1-h(\mathbf{X_i})} \bigg] \\ &+ \mathbb{E}\bigg[(-1)\left[e(1, \mathbf{X_i}) - e(0, \mathbf{X_i}) + \frac{Z_i(T_i - e(1, \mathbf{X_i}))}{h(\mathbf{X_i})} - \frac{(1-Z_i)(T_i - e(0, \mathbf{X_i}))}{1-h(\mathbf{X_i})}\right] \bigg] \cdot \tau_{\text{LATE}} = 0 \end{align} \]
# Load required packages
library(DoubleML)
library(mlr3)
library(mlr3learners)
library(data.table)
# suppress messages during fitting
lgr::get_logger("mlr3")$set_threshold("warn")
# load data as a data.table
data = fetch_401k(return_type = "data.table", instrument = TRUE)
# Set up basic model: Specify variables for data-backend
features_base = c("age", "inc", "educ", "fsize","marr", "twoearn", "db", "pira", "hown")
# Initialize DoubleMLData (data-backend of DoubleML)
data_dml_base = DoubleMLData$new(data,
y_col = "net_tfa", # outcome variable
d_cols = "p401", # treatment variable
x_cols = features_base, # covariates
z_cols = "e401") # instrument
# Initialize Random Forest learners
randomForest = lrn("regr.ranger")
randomForest_class = lrn("classif.ranger")
# Random Forest
set.seed(123)
dml_iivm_forest = DoubleMLIIVM$new(data_dml_base,
ml_g = randomForest,
ml_m = randomForest_class,
ml_r = randomForest_class,
n_folds = 3,
score = "LATE", # only choice for Interactive IV models
trimming_threshold = 0.01,
subgroups = list(always_takers = FALSE, # not in sample: no participation w/o eligibility.
never_takers = TRUE))
# Set nuisance-part specific parameters
dml_iivm_forest$set_ml_nuisance_params(
"ml_g0", "p401",
list(max.depth = 6, mtry = 4, min.node.size = 7)) # mu(Z=0,X) = E[Y | Z=0, X]
dml_iivm_forest$set_ml_nuisance_params(
"ml_g1", "p401",
list(max.depth = 6, mtry = 3, min.node.size = 5)) # mu(Z=1,X) = E[Y | Z=1, X]
dml_iivm_forest$set_ml_nuisance_params(
"ml_m", "p401",
list(max.depth = 6, mtry = 3, min.node.size = 6)) # h(X) = P(Z=1 | X)
dml_iivm_forest$set_ml_nuisance_params(
"ml_r1", "p401",
list(max.depth = 4, mtry = 7, min.node.size = 6)) # e(Z,X) = P(T=1 | Z, X)
dml_iivm_forest$fit()
dml_iivm_forest$summary()
Estimates and significance testing of the effect of target variables
Estimate. Std. Error t value Pr(>|t|)
p401 11694 1603 7.294 3e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Thank you for your attention!
Causal Data Science: (7) Unobserved Confounding and Instrumental Variables