# Copyright 2018 The TensorFlow Probability Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""GLM fitter with Hessian/proximal gradient descent based optimization.
This optimizer uses proximal gradient descent and a step size dependent on the
Hessian to efficiently minimize a convex loss function with L1 and L2
regularization. For GLMs, we approximate the Hessian with the Fisher
information matrix.
"""
import numpy as np
import tensorflow.compat.v2 as tf
from tensorflow_probability.python.glm import fisher_scoring
from tensorflow_probability.python.internal import dtype_util
from tensorflow_probability.python.math.linalg import sparse_or_dense_matvecmul
from tensorflow_probability.python.optimizer import proximal_hessian_sparse

__all__ = [
    'fit_sparse',
    'fit_sparse_one_step',
]


def _grad_neg_log_likelihood_and_fim(model_matrix, linear_response, response,
                                     model):
  """Computes the neg-log-likelihood gradient and Fisher information for a GLM.

Note that Fisher information is related to the Hessian of the log-likelihood
by the equation
```none
FisherInfo = E[Hessian with respect to model_coefficients of -LogLikelihood(
Y | model_matrix, model_coefficients)]
```
where `LogLikelihood` is the log-likelihood of a generalized linear model
parameterized by `model_matrix` and `model_coefficients`, and the expectation
is taken over Y, distributed according to the same GLM with the same parameter
values.

Args:
model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
where each row represents a sample's features. Has shape `[N, n]` where
`N` is the number of data samples and `n` is the number of features per
sample.
linear_response: (Batch of) vector-shaped `Tensor` with the same dtype as
`model_matrix`, equal to `model_matrix @ model_coefficients` where
`model_coefficients` are the coefficients of the linear component of the
GLM.
response: (Batch of) vector-shaped `Tensor` with the same dtype as
`model_matrix` where each element represents a sample's observed response
(to the corresponding row of features).
model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
function and distribution of the GLM, and thus characterizes the negative
log-likelihood. Must have sufficient statistic equal to the response, that
is, `T(y) = y`.

Returns:
grad_neg_log_likelihood: (Batch of) vector-shaped `Tensor` with the same
shape and dtype as a single row of `model_matrix`, representing the
gradient of the negative log likelihood of `response` given linear
response `linear_response`.
fim_middle: (Batch of) vector-shaped `Tensor` with the same shape and dtype
as a single column of `model_matrix`, satisfying the equation
`Fisher information =
Transpose(model_matrix)
@ diag(fim_middle)
@ model_matrix`.
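
For intuition, the dense Fisher information can be reconstructed from this
factored form. A minimal NumPy sketch (here `x` and `fim_middle` stand in
for a dense `model_matrix` and the value returned above):

```python
import numpy as np

def dense_fisher_info(x, fim_middle):
  # Computes Transpose(x) @ diag(fim_middle) @ x without materializing the
  # [N, N] diagonal matrix; `x` has shape [N, n], `fim_middle` shape [N].
  return np.matmul(x.T * fim_middle, x)
```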
"""
  # TODO(b/111926503): Determine whether there are some practical cases where
  # it is computationally favorable to compute the full FIM.
  mean, variance, grad_mean = model(linear_response)
  is_valid = (
      tf.math.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.)
      & tf.math.is_finite(variance) & (variance > 0.))

  def _mask_if_invalid(x, mask):
    # Where the model outputs are degenerate (non-finite or zero gradient,
    # non-finite or non-positive variance), substitute a benign constant so
    # that the sample contributes nothing to the gradient or the FIM.
    return tf.where(
        is_valid, x, np.array(mask, dtype_util.as_numpy_dtype(x.dtype)))

  # TODO(b/111923449): Link to derivation once it's available.
  v = (response - mean) * _mask_if_invalid(grad_mean, 1) / _mask_if_invalid(
      variance, np.inf)
  grad_log_likelihood = sparse_or_dense_matvecmul(
      model_matrix, v, adjoint_a=True)
  fim_middle = _mask_if_invalid(grad_mean, 0.)**2 / _mask_if_invalid(
      variance, np.inf)
  return -grad_log_likelihood, fim_middle


def fit_sparse_one_step(model_matrix,
                        response,
                        model,
                        model_coefficients_start,
                        tolerance,
                        l1_regularizer,
                        l2_regularizer=None,
                        maximum_full_sweeps=None,
                        learning_rate=None,
                        name=None):
"""One step of (the outer loop of) the GLM fitting algorithm.
This function returns a new value of `model_coefficients`, equal to
`model_coefficients_start + model_coefficients_update`. The increment
`model_coefficients_update in R^n` is computed by a coordinate descent method,
that is, by a loop in which each iteration updates exactly one coordinate of
`model_coefficients_update`. (Some updates may leave the value of the
coordinate unchanged.)

The particular update method used is to apply an L1-based proximity operator,
the "soft threshold", whose fixed point `model_coefficients_update^*` is the
desired minimum
```none
model_coefficients_update^* = argmin{
-LogLikelihood(model_coefficients_start + model_coefficients_update')
+ l1_regularizer *
||model_coefficients_start + model_coefficients_update'||_1
+ l2_regularizer *
||model_coefficients_start + model_coefficients_update'||_2**2
: model_coefficients_update' }
```
where in each iteration `model_coefficients_update'` has at most one nonzero
coordinate.

This update method preserves sparsity, i.e., tends to find sparse solutions
if `model_coefficients_start` is sparse. Additionally, the choice of step
size is based on curvature (the Fisher information matrix), which
significantly speeds up convergence.
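
For reference, here is a minimal sketch of the soft-threshold operator itself
(the helper name is illustrative; TFP also ships a comparable
`tfp.math.soft_threshold`):

```python
import tensorflow.compat.v2 as tf

def soft_threshold(x, threshold):
  # Proximity operator of `threshold * ||.||_1`: shrink `x` toward zero and
  # map anything within `threshold` of zero exactly to zero.
  return tf.sign(x) * tf.maximum(tf.abs(x) - threshold, 0.)
```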
Args:
model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
where each row represents a sample's features. Has shape `[N, n]` where
`N` is the number of data samples and `n` is the number of features per
sample.
response: (Batch of) vector-shaped `Tensor` with the same dtype as
`model_matrix` where each element represents a sample's observed response
(to the corresponding row of features).
model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
function and distribution of the GLM, and thus characterizes the negative
log-likelihood which will be minimized. Must have sufficient statistic
equal to the response, that is, `T(y) = y`.
model_coefficients_start: (Batch of) vector-shaped, `float` `Tensor` with
the same dtype as `model_matrix`, representing the initial values of the
coefficients for the GLM regression. Has shape `[n]` where `model_matrix`
has shape `[N, n]`.
tolerance: scalar, `float` `Tensor` representing the convergence threshold.
The optimization step will terminate early, returning its current value of
`model_coefficients_start + model_coefficients_update`, once the following
condition is met:
`||model_coefficients_update_end - model_coefficients_update_start||_2
/ (1 + ||model_coefficients_start||_2)
< sqrt(tolerance)`,
where `model_coefficients_update_end` is the value of
`model_coefficients_update` at the end of a sweep and
`model_coefficients_update_start` is the value of
`model_coefficients_update` at the beginning of that sweep.
l1_regularizer: scalar, `float` `Tensor` representing the weight of the L1
regularization term (see equation above).
l2_regularizer: scalar, `float` `Tensor` representing the weight of the L2
regularization term (see equation above).
Default value: `None` (i.e., no L2 regularization).
maximum_full_sweeps: Python integer specifying maximum number of sweeps to
run. A "sweep" consists of an iteration of coordinate descent on each
coordinate. After this many sweeps, the algorithm will terminate even if
convergence has not been reached.
Default value: `None` (i.e., `1`).
learning_rate: scalar, `float` `Tensor` representing a multiplicative factor
used to dampen the proximal gradient descent steps.
Default value: `None` (i.e., factor is conceptually `1`).
name: Python string representing the name of the TensorFlow operation. The
default name is `"fit_sparse_one_step"`.

Returns:
model_coefficients: (Batch of) `Tensor` having the same shape and dtype as
`model_coefficients_start`, representing the updated value of
`model_coefficients`, that is, `model_coefficients_start +
model_coefficients_update`.
is_converged: scalar, `bool` `Tensor` indicating whether convergence
occurred across all batches within the specified number of sweeps.
iter: scalar, `int` `Tensor` representing the actual number of coordinate
updates made (before achieving convergence). Since each sweep consists of
`tf.size(model_coefficients_start)` iterations, the maximum number of
updates is `maximum_full_sweeps * tf.size(model_coefficients_start)`.
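
A hedged usage sketch (eager TF2, with random stand-in data):

```python
import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

x = tf.random.normal([100, 5])
y = tf.cast(tf.random.normal([100]) > 0., tf.float32)
coeffs, converged, num_updates = tfp.glm.fit_sparse_one_step(
    model_matrix=x,
    response=y,
    model=tfp.glm.Bernoulli(),
    model_coefficients_start=tf.zeros([5]),
    tolerance=1e-6,
    l1_regularizer=10.)
```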
"""
  with tf.name_scope(name or 'fit_sparse_one_step'):
    predicted_linear_response = (
        fisher_scoring.compute_predicted_linear_response(
            model_matrix, model_coefficients_start))
    g, h_middle = _grad_neg_log_likelihood_and_fim(
        model_matrix, predicted_linear_response, response, model)

    return proximal_hessian_sparse.minimize_one_step(
        gradient_unregularized_loss=g,
        hessian_unregularized_loss_outer=model_matrix,
        hessian_unregularized_loss_middle=h_middle,
        x_start=model_coefficients_start,
        l1_regularizer=l1_regularizer,
        l2_regularizer=l2_regularizer,
        maximum_full_sweeps=maximum_full_sweeps,
        tolerance=tolerance,
        learning_rate=learning_rate,
        name=name)


def fit_sparse(model_matrix,
               response,
               model,
               model_coefficients_start,
               tolerance,
               l1_regularizer,
               l2_regularizer=None,
               maximum_iterations=None,
               maximum_full_sweeps_per_iteration=1,
               learning_rate=None,
               name=None):
r"""Fits a GLM using coordinate-wise FIM-informed proximal gradient descent.
This function uses a L1- and L2-regularized, second-order quasi-Newton method
to find maximum-likelihood parameters for the given model and observed data.
The second-order approximations use negative Fisher information in place of
the Hessian, that is,
```none
FisherInfo = E_Y[Hessian with respect to model_coefficients of -LogLikelihood(
Y | model_matrix, current value of model_coefficients)]
```
For large, sparse data sets, `model_matrix` should be supplied as a
`SparseTensor`.
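A minimal sketch of that (the indices and values here are illustrative):

```python
model_matrix = tf.SparseTensor(
    indices=[[0, 1], [1, 0], [2, 3]],  # [row, col] of each nonzero entry
    values=[1., -2., 0.5],
    dense_shape=[3, 4])
```
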
Args:
model_matrix: (Batch of) matrix-shaped, `float` `Tensor` or `SparseTensor`
where each row represents a sample's features. Has shape `[N, n]` where
`N` is the number of data samples and `n` is the number of features per
sample.
response: (Batch of) vector-shaped `Tensor` with the same dtype as
`model_matrix` where each element represents a sample's observed response
(to the corresponding row of features).
model: `tfp.glm.ExponentialFamily`-like instance, which specifies the link
function and distribution of the GLM, and thus characterizes the negative
log-likelihood which will be minimized. Must have sufficient statistic
equal to the response, that is, `T(y) = y`.
model_coefficients_start: (Batch of) vector-shaped, `float` `Tensor` with
the same dtype as `model_matrix`, representing the initial values of the
coefficients for the GLM regression. Has shape `[n]` where `model_matrix`
has shape `[N, n]`.
tolerance: scalar, `float` `Tensor` representing the tolerance for each
optimization step; see the `tolerance` argument of `fit_sparse_one_step`.
l1_regularizer: scalar, `float` `Tensor` representing the weight of the L1
regularization term.
l2_regularizer: scalar, `float` `Tensor` representing the weight of the L2
regularization term.
Default value: `None` (i.e., no L2 regularization).
maximum_iterations: Python integer specifying maximum number of iterations
of the outer loop of the optimizer (i.e., maximum number of calls to
`fit_sparse_one_step`). After this many iterations of the outer loop, the
algorithm will terminate even if the return value `model_coefficients` has
not converged.
Default value: `None` (i.e., `1`).
maximum_full_sweeps_per_iteration: Python integer specifying the maximum
number of coordinate descent sweeps allowed in each iteration.
Default value: `1`.
learning_rate: scalar, `float` `Tensor` representing a multiplicative factor
used to dampen the proximal gradient descent steps.
Default value: `None` (i.e., factor is conceptually `1`).
name: Python string representing the name of the TensorFlow operation.
The default name is `"fit_sparse"`.

Returns:
model_coefficients: (Batch of) `Tensor` of the same shape and dtype as
`model_coefficients_start`, representing the computed model coefficients
which minimize the regularized negative log-likelihood.
is_converged: scalar, `bool` `Tensor` indicating whether the minimization
procedure converged across all batches within the specified number of
iterations. Here convergence means that an iteration of the inner loop
(`fit_sparse_one_step`) returns `True` for its `is_converged` output
value.
iter: scalar, `int` `Tensor` indicating the actual number of iterations of
the outer loop of the optimizer completed (i.e., number of calls to
`fit_sparse_one_step` before achieving convergence).

#### Example

```python
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

def make_dataset(n, d, link, scale=1., dtype=np.float32):
  model_coefficients = tfd.Uniform(
      low=np.array(-1, dtype), high=np.array(1, dtype)).sample(
          d, seed=42)
  radius = np.sqrt(2.)
  model_coefficients *= radius / tf.linalg.norm(model_coefficients)
  mask = tf.random.shuffle(tf.range(d)) < tf.cast(
      0.5 * tf.cast(d, tf.float32), tf.int32)
  model_coefficients = tf.where(mask, model_coefficients,
                                tf.zeros_like(model_coefficients))
  model_matrix = tfd.Normal(
      loc=np.array(0, dtype), scale=np.array(1, dtype)).sample(
          [n, d], seed=43)
  scale = tf.convert_to_tensor(scale, dtype)
  linear_response = tf.matmul(model_matrix,
                              model_coefficients[..., tf.newaxis])[..., 0]
  if link == 'linear':
    response = tfd.Normal(loc=linear_response, scale=scale).sample(seed=44)
  elif link == 'probit':
    response = tf.cast(
        tfd.Normal(loc=linear_response, scale=scale).sample(seed=44) > 0,
        dtype)
  elif link == 'logit':
    response = tfd.Bernoulli(logits=linear_response).sample(seed=44)
  else:
    raise ValueError('unrecognized true link: {}'.format(link))
  return model_matrix, response, model_coefficients, mask

with tf.Session() as sess:
  x_, y_, model_coefficients_true_, _ = sess.run(make_dataset(
      n=int(1e5), d=100, link='probit'))

  model = tfp.glm.Bernoulli()
  model_coefficients_start = tf.zeros(x_.shape[-1], np.float32)
  model_coefficients, is_converged, num_iter = tfp.glm.fit_sparse(
      model_matrix=tf.convert_to_tensor(x_),
      response=tf.convert_to_tensor(y_),
      model=model,
      model_coefficients_start=model_coefficients_start,
      l1_regularizer=800.,
      l2_regularizer=None,
      maximum_iterations=10,
      maximum_full_sweeps_per_iteration=10,
      tolerance=1e-6,
      learning_rate=None)

  model_coefficients_, is_converged_, num_iter_ = sess.run([
      model_coefficients, is_converged, num_iter])

print("is_converged:", is_converged_)
print("    num_iter:", num_iter_)
print("\nLearned / True")
print(np.concatenate(
    [[model_coefficients_], [model_coefficients_true_]], axis=0).T)
# ==>
# is_converged: True
# num_iter: 1
#
# Learned / True
# [[ 0. 0. ]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0.11195257 0.12484948]
# [ 0. 0. ]
# [ 0.05191106 0.06394956]
# [-0.15090358 -0.15325639]
# [-0.18187316 -0.18825999]
# [-0.06140942 -0.07994166]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0.14474444 0.15810856]
# [ 0. 0. ]
# [-0.25249591 -0.24260855]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.03888761 -0.06755984]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.0192222 -0.04169233]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0.01434913 0.03568212]
# [-0.11336883 -0.12873614]
# [ 0. 0. ]
# [-0.24496339 -0.24048163]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0.04088281 0.06565224]
# [-0.12784363 -0.13359821]
# [ 0.05618424 0.07396613]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0. 0. ]
# [ 0. -0.01719233]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.00076072 -0.03607186]
# [ 0.21801499 0.21146794]
# [-0.02161094 -0.04031265]
# [ 0.0918689 0.10487888]
# [ 0.0106154 0.03233612]
# [-0.07817317 -0.09725142]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.23725343 -0.24194022]
# [ 0. 0. ]
# [-0.08725718 -0.1048776 ]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.02114314 -0.04145789]
# [ 0. 0. ]
# [ 0. 0. ]
# [-0.02710908 -0.04590397]
# [ 0.15293184 0.15415154]
# [ 0.2114463 0.2088728 ]
# [-0.10969634 -0.12368613]
# [ 0. -0.01505797]
# [-0.01140458 -0.03234904]
# [ 0.16051085 0.1680062 ]
# [ 0.09816848 0.11094204]
```
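
The example above uses TF1 graph-and-session style. Under TF2's eager
execution the `Session` plumbing can be dropped; a hedged sketch of the
equivalent call:

```python
x, y, model_coefficients_true, _ = make_dataset(n=int(1e5), d=100,
                                                link='probit')
model_coefficients, is_converged, num_iter = tfp.glm.fit_sparse(
    model_matrix=x,
    response=y,
    model=tfp.glm.Bernoulli(),
    model_coefficients_start=tf.zeros(x.shape[-1], np.float32),
    l1_regularizer=800.,
    maximum_iterations=10,
    maximum_full_sweeps_per_iteration=10,
    tolerance=1e-6)
```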

#### References

[1]: Jerome Friedman, Trevor Hastie and Rob Tibshirani. Regularization Paths
     for Generalized Linear Models via Coordinate Descent. _Journal of
     Statistical Software_, 33(1), 2010.
     https://www.jstatsoft.org/article/view/v033i01/v33i01.pdf

[2]: Guo-Xun Yuan, Chia-Hua Ho and Chih-Jen Lin. An Improved GLMNET for
     L1-regularized Logistic Regression. _Journal of Machine Learning
     Research_, 13, 2012.
     http://www.jmlr.org/papers/volume13/yuan12a/yuan12a.pdf
"""
  with tf.name_scope(name or 'fit_sparse'):
    # TODO(b/111922388): Include dispersion and offset parameters.
    def _grad_neg_log_likelihood_and_fim_fn(x):
      predicted_linear_response = (
          fisher_scoring.compute_predicted_linear_response(model_matrix, x))
      g, h_middle = _grad_neg_log_likelihood_and_fim(
          model_matrix, predicted_linear_response, response, model)
      return g, model_matrix, h_middle

    return proximal_hessian_sparse.minimize(
        _grad_neg_log_likelihood_and_fim_fn,
        x_start=model_coefficients_start,
        l1_regularizer=l1_regularizer,
        l2_regularizer=l2_regularizer,
        maximum_iterations=maximum_iterations,
        maximum_full_sweeps_per_iteration=maximum_full_sweeps_per_iteration,
        learning_rate=learning_rate,
        tolerance=tolerance,
        name=name)


def _fit_sparse_exact_hessian(  # pylint: disable=missing-docstring
    model_matrix,
    response,
    model,
    model_coefficients_start,
    tolerance,
    l1_regularizer,
    l2_regularizer=None,
    maximum_iterations=1,
    maximum_full_sweeps_per_iteration=1,
    learning_rate=None,
    name=None):
  with tf.name_scope(name or 'fit_sparse_exact_hessian'):
    # TODO(b/111922388): Include dispersion and offset parameters.
    def _neg_log_likelihood(x):
      predicted_linear_response = (
          fisher_scoring.compute_predicted_linear_response(model_matrix, x))
      log_probs = model.log_prob(response, predicted_linear_response)
      return -log_probs

    def _grad_and_hessian_loss_fn(x):
      # Note: `tf.gradients` and `tf.hessians` work in graph mode only; under
      # eager execution, `tf.GradientTape` would be needed instead.
      loss = _neg_log_likelihood(x)
      grad_loss = tf.gradients(ys=loss, xs=[x])[0]
      hessian_loss = tf.hessians(ys=loss, xs=[x])[0]
      # Pass the Hessian to the solver in its factored form,
      # `Transpose(outer) @ diag(middle) @ outer`, using the Cholesky factor
      # as the outer term and a middle of all ones.
      hessian_chol = tf.linalg.cholesky(hessian_loss)
      return grad_loss, hessian_chol, tf.ones_like(grad_loss)

    return proximal_hessian_sparse.minimize(
        _grad_and_hessian_loss_fn,
        x_start=model_coefficients_start,
        l1_regularizer=l1_regularizer,
        l2_regularizer=l2_regularizer,
        maximum_iterations=maximum_iterations,
        maximum_full_sweeps_per_iteration=maximum_full_sweeps_per_iteration,
        learning_rate=learning_rate,
        tolerance=tolerance,
        name=name)