# Authors: Badr Moufad
#          Mathurin Massias
"""
========================================================
Comparison of lifelines with skglm for survival analysis
========================================================
This example shows that ``skglm`` fits a Cox model with the same objective value as
``lifelines``, but roughly 100 times faster.
"""
# %%
# Data
# ----
#
# Let's first generate synthetic data on which to run the Cox estimator,
# using ``skglm`` data utils.
#
from skglm.utils.data import make_dummy_survival_data
n_samples, n_features = 500, 100
X, y = make_dummy_survival_data(
    n_samples, n_features,
    normalize=True,
    random_state=0,
)
tm, s = y[:, 0], y[:, 1]
# %%
# The synthetic data has the following properties:
#
# * ``X`` is the matrix of predictors, generated from a standard normal distribution with Toeplitz covariance;
# * ``tm`` is the vector of occurrence times, which follows a Weibull(1) distribution;
# * ``s`` indicates whether each observation is censored, and follows a Bernoulli(0.5) distribution.
#
# Let's quickly inspect the data:
import matplotlib.pyplot as plt
fig, axes = plt.subplots(
    1, 3,
    figsize=(6, 2),
    tight_layout=True,
)

dists = (tm, s, X[:, 5])
axes_title = ("times", "censorship", "fifth predictor")

for idx, (dist, name) in enumerate(zip(dists, axes_title)):
    axes[idx].hist(dist, bins="auto")
    axes[idx].set_title(name)

_ = axes[0].set_ylabel("count")
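# %%
# As a quick numerical sanity check (not part of the fit itself), we can verify
# that the censorship rate is close to the expected 0.5 and that all occurrence
# times are positive, as a Weibull(1) sample should be:
print(f"censorship rate: {s.mean():.2f}")  # expected to be around 0.5
print(f"smallest occurrence time: {tm.min():.3f}")  # Weibull(1) draws are positive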
# %%
# Fitting the Cox Estimator
# -------------------------
#
# After generating the synthetic data, we can now fit an L1-regularized Cox estimator.
# To do so, we combine a Cox datafit and an :math:`\ell_1` penalty
# and solve the resulting problem using skglm's Proximal Newton solver ``ProxNewton``.
# We set the intensity of the :math:`\ell_1` regularization to ``alpha=1e-2``.
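#
# Written out, the problem we solve below reads
#
# .. math::
#     \hat w \in \arg\min_w \; -\log \mathrm{PL}(w) + \alpha \Vert w \Vert_1 \,,
#
# where :math:`\mathrm{PL}(w)` denotes the Cox partial likelihood of the
# coefficients :math:`w` (up to the normalization used internally by the datafit).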
from skglm.penalties import L1
from skglm.datafits import Cox
from skglm.solvers import ProxNewton
# regularization intensity
alpha = 1e-2
# skglm internals: init datafit and penalty
datafit = Cox()
penalty = L1(alpha)
datafit.initialize(X, y)
# init solver
solver = ProxNewton(fit_intercept=False, max_iter=50)
# solve the problem (``solve`` returns the coefficients as its first output)
w_sk = solver.solve(X, y, datafit, penalty)[0]
# %%
# For this regularization value, a relatively sparse solution is found:
print(
    "Number of nonzero coefficients in solution: "
    f"{(w_sk != 0).sum()} out of {len(w_sk)}."
)
# %%
# Let's solve the problem with ``lifelines`` through its ``CoxPHFitter``
# estimator and compare the objectives found by the two packages.
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
# format the data for lifelines: column 0 holds the times, column 1 the censoring indicator
stacked_y_X = np.hstack((y, X))
df = pd.DataFrame(stacked_y_X)

# fit lifelines estimator
lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.).fit(
    df,
    duration_col=0,
    event_col=1,
)
w_ll = lifelines_estimator.params_.values
# %%
# Check that both solvers find solutions having the same objective value:
obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk)
obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
print(f"Objective skglm: {obj_sk:.6f}")
print(f"Objective lifelines: {obj_ll:.6f}")
print(f"Difference: {(obj_sk - obj_ll):.2e}")
# %%
# We can do the same to check how close the two solutions are.
print(f"Euclidean distance between solutions: {np.linalg.norm(w_sk - w_ll):.3e}")
# %%
# Timing comparison
# -----------------
#
# Now that we have checked that ``skglm`` and ``lifelines`` yield the same results,
# let's compare their execution times. To track the evolution of the suboptimality
# (objective minus optimal objective), we run both estimators with an increasing
# number of iterations.
import time
import warnings

# lifelines warns when the solver is stopped before convergence; silence these
warnings.filterwarnings('ignore')
# where to save records
records = {
    "skglm": {"times": [], "objs": []},
    "lifelines": {"times": [], "objs": []},
}
# time skglm
max_runs = 20
for n_iter in range(1, max_runs + 1):
    solver.max_iter = n_iter

    start = time.perf_counter()
    w = solver.solve(X, y, datafit, penalty)[0]
    end = time.perf_counter()

    records["skglm"]["objs"].append(
        datafit.value(y, w, X @ w) + penalty.value(w)
    )
    records["skglm"]["times"].append(end - start)
# time lifelines
max_runs = 50
for n_iter in list(range(10)) + list(range(10, max_runs + 1, 5)):
    start = time.perf_counter()
    lifelines_estimator.fit(
        df,
        duration_col=0,
        event_col=1,
        fit_options={"max_steps": n_iter},
    )
    end = time.perf_counter()

    w = lifelines_estimator.params_.values
    records["lifelines"]["objs"].append(
        datafit.value(y, w, X @ w) + penalty.value(w)
    )
    records["lifelines"]["times"].append(end - start)
# cast records as numpy arrays
for label in ("skglm", "lifelines"):
    for metric in ("objs", "times"):
        records[label][metric] = np.asarray(records[label][metric])
# %%
# Results
# -------
fig, ax = plt.subplots(1, 1, tight_layout=True, figsize=(6, 3))

solvers = ("skglm", "lifelines")
optimal_obj = min(records[solver]["objs"].min() for solver in solvers)

# plot evolution of suboptimality
for solver in solvers:
    ax.semilogy(
        records[solver]["times"],
        records[solver]["objs"] - optimal_obj,
        label=solver,
        marker='o',
    )

ax.legend()
ax.set_title("Time to fit a Cox model")
ax.set_ylabel("objective suboptimality")
_ = ax.set_xlabel("time in seconds")
# %%
# As the plot shows, ``skglm`` reaches the same result as ``lifelines``
# more than 100 times faster, as confirmed by the speed-up ratio printed below:
speed_up = records["lifelines"]["times"][-1] / records["skglm"]["times"][-1]
print(f"speed up ratio: {speed_up:.0f}")
# %%
# Efron estimate
# --------------
#
# The previous results, namely the closeness of the solutions and the timings,
# carry over to the case of tied observations, handled with the Efron estimate.
#
# Let's start by generating data with tied observations. This can be achieved
# by passing ``with_ties=True`` to the ``make_dummy_survival_data`` function.
X, y = make_dummy_survival_data(
    n_samples, n_features,
    normalize=True,
    with_ties=True,
    random_state=0,
)
tm, s = y[:, 0], y[:, 1]

# check that the data has tied observations
print(f"Number of unique times: {len(np.unique(tm))} out of {n_samples}")
# %%
# It is straightforward to fit an :math:`\ell_1` Cox estimator with the Efron
# estimate: we only need to pass ``use_efron=True`` to the ``Cox`` datafit.

# use the Efron estimate to handle tied observations
datafit = Cox(use_efron=True)
datafit.initialize(X, y)
# solve the problem
solver = ProxNewton(fit_intercept=False, max_iter=50)
w_sk = solver.solve(X, y, datafit, penalty)[0]
# %%
# Again, a relatively sparse solution is found:
print(
    "Number of nonzero coefficients in solution: "
    f"{(w_sk != 0).sum()} out of {len(w_sk)}."
)
# %%
# Let's do the same with ``lifelines`` and compare the results.

# format the data for lifelines: column 0 holds the times, column 1 the censoring indicator
stacked_tm_s_X = np.hstack((tm[:, None], s[:, None], X))
df = pd.DataFrame(stacked_tm_s_X)

# fit lifelines estimator on the new data
lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.).fit(
    df,
    duration_col=0,
    event_col=1,
)
w_ll = lifelines_estimator.params_.values
# Check that both solvers find solutions with the same objective value
obj_sk = datafit.value(y, w_sk, X @ w_sk) + penalty.value(w_sk)
obj_ll = datafit.value(y, w_ll, X @ w_ll) + penalty.value(w_ll)
print(f"Objective skglm: {obj_sk:.6f}")
print(f"Objective lifelines: {obj_ll:.6f}")
print(f"Difference: {(obj_sk - obj_ll):.2e}")
# Check that both solutions are close
print(f"Euclidean distance between solutions: {np.linalg.norm(w_sk - w_ll):.3e}")
# %%
# Finally, let's compare the timings of both solvers.

# time skglm
start = time.perf_counter()
solver.solve(X, y, datafit, penalty)
end = time.perf_counter()
total_time_skglm = end - start
# time lifelines
lifelines_estimator = CoxPHFitter(penalizer=alpha, l1_ratio=1.)
start = time.perf_counter()
lifelines_estimator.fit(
    df,
    duration_col=0,
    event_col=1,
)
end = time.perf_counter()
total_time_lifelines = end - start
# deduce speed up ratio
speed_up = total_time_lifelines / total_time_skglm
print(f"speed up ratio: {speed_up:.0f}")
# %%
# As the last print shows, the roughly 100x speed-up is preserved
# even with the Efron estimate.