Commit 2162f73

committed
add simple did multi simulation
1 parent 8083798 commit 2162f73

File tree

3 files changed: +172 -0 lines changed
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Learner g,Learner m,Score,In-sample-norm.,DGP,level,Coverage,CI Length,Bias,Uniform Coverage,Uniform CI Length,repetition
Linear,Linear,observational,False,1,0.9,0.9029166666666666,0.3186627096956461,0.07711696371410508,0.901,0.49537746791429166,1000
Linear,Linear,observational,False,1,0.95,0.95125,0.3797100386232941,0.07711696371410508,0.95,0.5448416509948917,1000
Linear,Linear,observational,True,1,0.9,0.9003333333333333,0.3167085369567106,0.07700388113671185,0.901,0.49287577992610865,1000
Linear,Linear,observational,True,1,0.95,0.9498333333333334,0.37738149818350913,0.07700388113671185,0.948,0.541465330653648,1000
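
Note: as a quick sanity check, here is a minimal sketch for comparing the empirical coverage above against the nominal level, assuming the results table has been saved locally as did_pa_multi_coverage.csv (hypothetical filename):

import pandas as pd

# hypothetical local copy of the results table shown above
df = pd.read_csv("did_pa_multi_coverage.csv")

# gap between empirical and nominal coverage, pointwise and uniform
df["Coverage gap"] = df["Coverage"] - df["level"]
df["Uniform coverage gap"] = df["Uniform Coverage"] - df["level"]
print(df[["level", "Coverage", "Coverage gap", "Uniform Coverage", "Uniform coverage gap"]])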
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
DoubleML Version,Script,Date,Total Runtime (seconds),Python Version
0.10.dev0,did_pa_multi_coverage.py,2025-03-12 16:23:08,1368.6782059669495,3.11.9
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
import numpy as np
import pandas as pd
from datetime import datetime
import time
import sys

from sklearn.linear_model import LinearRegression, LogisticRegression

import doubleml as dml
from doubleml.did.datasets import make_did_CS2021

# Number of repetitions
n_rep = 1000
max_runtime = 5.5 * 3600  # 5.5 hours in seconds

# DGP parameters
dgp_dict = {}

# Oracle ATT(g, t) values: average individual treatment effects by group and period from a large sample
df_oracle = make_did_CS2021(n_obs=int(1e+6), dgp_type=1)
df_oracle["ite"] = df_oracle["y1"] - df_oracle["y0"]
df_oracle_thetas = df_oracle.groupby(["d", "t"])["ite"].mean().reset_index()
print(df_oracle_thetas)

n_obs = 2000

# To get the best possible comparison between different learners (and settings), we first simulate all datasets
np.random.seed(42)

dgp_types = [1]
n_dgps = len(dgp_types)
datasets = []
for dgp_type in dgp_types:
    datasets_dgp = []
    for i in range(n_rep):
        data = make_did_CS2021(n_obs=n_obs, dgp_type=dgp_type)
        datasets_dgp.append(data)
    datasets.append(datasets_dgp)


# set up hyperparameters
hyperparam_dict = {
    "DGP": dgp_types,
    "score": ["observational"],
    "in sample normalization": [True, False],
    "learner_g": [("Linear", LinearRegression()),],
    "learner_m": [("Linear", LogisticRegression()),],
    "level": [0.95, 0.90]
}

# set up the results dataframe
df_results_detailed = pd.DataFrame()

# start simulation
np.random.seed(42)
start_time = time.time()

for i_rep in range(n_rep):
    print(f"Repetition: {i_rep + 1}/{n_rep}", end="\r")

    # check the elapsed time
    elapsed_time = time.time() - start_time
    if elapsed_time > max_runtime:
        print("Maximum runtime exceeded. Stopping the simulation.")
        break

    for i_dgp, dgp_type in enumerate(dgp_types):
        # define the DoubleML data object
        obj_dml_data = dml.data.DoubleMLPanelData(
            datasets[i_dgp][i_rep],
            y_col="y",
            d_cols="d",
            id_col="id",
            t_col="t",
            x_cols=["Z1", "Z2", "Z3", "Z4"],
        )

        for learner_g_idx, (learner_g_name, ml_g) in enumerate(hyperparam_dict["learner_g"]):
            for learner_m_idx, (learner_m_name, ml_m) in enumerate(hyperparam_dict["learner_m"]):
                for score in hyperparam_dict["score"]:
                    for in_sample_normalization in hyperparam_dict["in sample normalization"]:
                        if score == "experimental":
                            # the experimental score does not require the propensity learner ml_m
                            dml_DiD = dml.did.DoubleMLDIDMulti(
                                obj_dml_data=obj_dml_data,
                                ml_g=ml_g,
                                ml_m=None,
                                gt_combinations="standard",
                                score=score,
                                in_sample_normalization=in_sample_normalization)
                        else:
                            assert score == "observational"
                            dml_DiD = dml.did.DoubleMLDIDMulti(
                                obj_dml_data=obj_dml_data,
                                ml_g=ml_g,
                                ml_m=ml_m,
                                gt_combinations="standard",
                                score=score,
                                in_sample_normalization=in_sample_normalization)
                        dml_DiD.fit(n_jobs_cv=5)

                        # oracle values for the estimated (g, t) combinations
                        theta = np.full_like(dml_DiD.coef, np.nan)
                        for i, (g, _, t) in enumerate(dml_DiD.gt_combinations):
                            group_index = df_oracle_thetas["d"] == g
                            time_index = df_oracle_thetas["t"] == t
                            theta[i] = df_oracle_thetas[group_index & time_index]["ite"].iloc[0]

                        for level_idx, level in enumerate(hyperparam_dict["level"]):
                            confint = dml_DiD.confint(level=level)
                            coverage = np.mean((confint.iloc[:, 0] < theta) & (theta < confint.iloc[:, 1]))
                            ci_length = np.mean(confint.iloc[:, 1] - confint.iloc[:, 0])
                            bias = np.mean(abs(dml_DiD.coef - theta))

                            dml_DiD.bootstrap(n_rep_boot=2000)
                            confint_uniform = dml_DiD.confint(level=level, joint=True)

                            coverage_uniform = all((confint_uniform.iloc[:, 0] < theta) & (theta < confint_uniform.iloc[:, 1]))
                            ci_length_uniform = np.mean(confint_uniform.iloc[:, 1] - confint_uniform.iloc[:, 0])

                            df_results_detailed = pd.concat(
                                (df_results_detailed,
                                 pd.DataFrame({
                                    "Coverage": coverage,
                                    "CI Length": ci_length,
                                    "Bias": bias,
                                    "Uniform Coverage": coverage_uniform,
                                    "Uniform CI Length": ci_length_uniform,
                                    "Learner g": learner_g_name,
                                    "Learner m": learner_m_name,
                                    "Score": score,
                                    "In-sample-norm.": in_sample_normalization,
                                    "DGP": dgp_type,
                                    "level": level,
                                    "repetition": i_rep}, index=[0])),
                                ignore_index=True)

df_results = df_results_detailed.groupby(
    ["Learner g", "Learner m", "Score", "In-sample-norm.", "DGP", "level"]).agg(
        {"Coverage": "mean",
         "CI Length": "mean",
         "Bias": "mean",
         "Uniform Coverage": "mean",
         "Uniform CI Length": "mean",
         "repetition": "count"}
    ).reset_index()
print(df_results)

end_time = time.time()
total_runtime = end_time - start_time

# save results
script_name = "did_pa_multi_coverage.py"
path = "results/did/did_pa_multi_coverage"

metadata = pd.DataFrame({
    'DoubleML Version': [dml.__version__],
    'Script': [script_name],
    'Date': [datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
    'Total Runtime (seconds)': [total_runtime],
    'Python Version': [f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"],
})
print(metadata)

df_results.to_csv(f"{path}.csv", index=False)
metadata.to_csv(f"{path}_metadata.csv", index=False)
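
Usage note: the script writes its aggregated results and run metadata under results/did/ (see the path variable above). A minimal sketch for loading them back, assuming the script was run from the repository root:

import pandas as pd

# paths follow the `path` variable defined in the script above
df_results = pd.read_csv("results/did/did_pa_multi_coverage.csv")
metadata = pd.read_csv("results/did/did_pa_multi_coverage_metadata.csv")

print(metadata)
print(df_results)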

0 commit comments
