# Source code for TransportMaps.LaplaceApproximationRoutines

#
# This file is part of TransportMaps.
#
# TransportMaps is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# TransportMaps is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with TransportMaps.  If not, see <http://www.gnu.org/licenses/>.
#
# Transport Maps Library
# Copyright (C) 2015-2018 Massachusetts Institute of Technology
# Uncertainty Quantification group
# Department of Aeronautics and Astronautics
#
# Authors: Transport Map Team
# Website: transportmaps.mit.edu
# Support: transportmaps.mit.edu/qa/
#
from typing import Union

import numpy as np
import numpy.linalg as npla
import scipy.optimize as sciopt

from .Misc import logger
from .DerivativesChecks import fd
from .Distributions.FrozenDistributions import \
    StandardNormalDistribution, NormalDistribution
from .Distributions.Deprecated import GaussianDistribution
from .Distributions.Inference.InferenceBase import \
    BayesPosteriorDistribution
from .Likelihoods.LikelihoodBase import \
    AdditiveLogLikelihood, IndependentLogLikelihood
from .RandomizedLinearAlgebra.RandomizedLinearAlgebra import \
    randomized_direct_eig, randomized_direct_svd

__all__ = [
    'laplace_approximation','laplace_approximation_withBounds',
]

nax = np.newaxis


def is_normal(pi):
    # Whether the prior of ``pi`` is Gaussian, under any of the supported classes
    return issubclass(type(pi.prior), NormalDistribution) or \
        issubclass(type(pi.prior), GaussianDistribution) or \
        issubclass(type(pi.prior), StandardNormalDistribution)


def laplace_approximation(
        pi,
        params: Union[dict, None] = None,
        x0=None,
        tol=1e-5,
        ders=2,
        fungrad=False,
        hessact=False,
        hess_approx='low-rank',
        hess_fd_eps=1e-6,
        low_rank_rnd_eps=1e-5,
        low_rank_rnd_ovsamp=10,
        low_rank_rnd_pow_n=0
):
    r""" Compute the Laplace approximation of the distribution :math:`\pi`.

    Args:
      pi (Distribution): distribution :math:`\pi`
      params (dict): parameters for distribution :math:`\pi`
      x0 (:class:`ndarray<numpy.ndarray>`): initial guess for the
        maximization problem. If ``None``, a sample from the prior is used
        for Bayesian posteriors, and zero is used otherwise.
      tol (float): tolerance to be used to solve the maximization problem.
      ders (int): order of derivatives available for the solution of the
        optimization problem. 0 -> derivative free, 1 -> gradient,
        2 -> Hessian.
      fungrad (bool): whether the distribution :math:`\pi` provides the
        method :func:`Distribution.tuple_grad_x_log_pdf`, computing the
        evaluation and the gradient in one step.
        This is used only for ``ders>=1``.
      hessact (bool): whether the distribution :math:`\pi` provides the
        method :func:`Distribution.action_hess_x_log_pdf`, computing the
        action of the Hessian on a vector.
        This is used only for ``ders==2``.
      hess_approx (str): whether to compute a finite difference Hessian
        (``fd``) or a low-rank approximation of it (``low-rank``).
        This is used only if ``ders==1``.
      hess_fd_eps (float): tolerance for the finite difference Hessian
      low_rank_rnd_eps (float): tolerance to be used in the pursuit of a
        randomized low-rank approximation of the prior-preconditioned
        Hessian of the log-likelihood
      low_rank_rnd_ovsamp (int): oversampling to be used in the pursuit of
        a randomized low-rank approximation of the prior-preconditioned
        Hessian of the log-likelihood
      low_rank_rnd_pow_n (int): number of power iterations to be used in
        the pursuit of a randomized low-rank approximation of the
        prior-preconditioned Hessian of the log-likelihood

    Returns:
      (:class:`NormalDistribution`) -- Laplace approximation
    """
    # Minimize :math:`-\log \pi`
    if params is None:
        params = {}
    params['nobj'] = 0
    params['gx_nobj'] = 0
    params['hx_nobj'] = 0

    def objective(x, params, cache=None):
        reset_cache(cache)
        params['nobj'] += 1
        out = - pi.log_pdf(x[nax,:], cache=cache)[0]
        logger.info(
            "Lap. Obj. Eval. %d - f val. = %.10e" % (params['nobj'], out))
        return out

    def dx_objective(x, params, cache=None):
        params['gx_nobj'] += 1
        out = - pi.grad_x_log_pdf(x[nax,:], cache=cache)[0,:]
        logger.info(
            "Lap. Grad. Obj. Eval. %d " % params['gx_nobj'] + \
            "- ||grad f|| = %.10e" % npla.norm(out))
        return out

    def tuple_dx_objective(x, params, cache=None):
        reset_cache(cache)
        params['nobj'] += 1
        params['gx_nobj'] += 1
        ev, gx = pi.tuple_grad_x_log_pdf(x[nax,:], cache=cache)
        ev = -ev[0]
        gx = -gx[0,:]
        logger.info(
            "Lap. Obj. Eval. %d - f val. = %.10e" % (params['nobj'], ev))
        logger.info(
            "Lap. Grad. Obj. Eval. %d " % params['gx_nobj'] + \
            "- ||grad f|| = %.10e" % npla.norm(gx))
        return ev, gx

    def dx2_objective(x, params, cache=None):
        params['hx_nobj'] += 1
        out = - pi.hess_x_log_pdf(x[nax,:], cache=cache)[0,:,:]
        logger.info("Lap. Hess. Obj. Eval. %d " % params['hx_nobj'])
        return out

    def action_dx2_objective(x, dx, params, cache=None):
        params['hx_nobj'] += 1
        out = - pi.action_hess_x_log_pdf(
            x[nax,:], dx[nax,:], cache=cache)[0,:]
        logger.info("Lap. Action Hess. Obj. Eval. %d " % params['hx_nobj'])
        return out

    def reset_cache(cache):
        if cache is not None:
            tot_size = cache['tot_size']
            cache.clear()
            cache['tot_size'] = tot_size

    # Select the initial point (user input, a prior sample, or zero)
    if x0 is None:
        if issubclass(type(pi), BayesPosteriorDistribution):
            try:
                x0 = pi.prior.rvs(1).flatten()
            except NotImplementedError:
                logger.warning(
                    "laplace_approximation(): " + \
                    "Sampling from the prior is not implemented. " + \
                    "Initial conditions set to zero.")
                x0 = np.zeros(pi.dim)
        else:
            x0 = np.zeros(pi.dim)

    options = {'maxiter': 10000, 'disp': False}
    cache = {'tot_size': 1}
    args = (params, cache)
    if ders >= 1:
        if fungrad:
            fun = tuple_dx_objective
            jac = True
        else:
            fun = objective
            jac = dx_objective
    if ders == 0:
        res = sciopt.minimize(
            objective, args=args, x0=x0, method='BFGS',
            tol=tol, options=options)
    elif ders == 1:
        res = sciopt.minimize(
            fun, args=args, x0=x0, jac=jac, method='BFGS',
            tol=tol, options=options)
    elif ders == 2:
        if hessact:
            res = sciopt.minimize(
                fun, args=args, x0=x0, jac=jac,
                hessp=action_dx2_objective, method='newton-cg',
                tol=tol, options=options)
        else:
            res = sciopt.minimize(
                fun, args=args, x0=x0, jac=jac,
                hess=dx2_objective, method='newton-cg',
                tol=tol, options=options)
    else:
        raise ValueError("ders parameter not valid. Choose between 0, 1, 2.")

    # Log the optimization results
    if res['success']:
        logger.info("Optimization terminated successfully")
    else:
        logger.info("Optimization failed.")
        logger.info("Message: %s" % res['message'])
    logger.info("  Function value:          %6f" % res['fun'])
    if ders >= 1:
        logger.info(
            "  Jacobian: " + \
            "2-norm: %e " % npla.norm(res['jac'], 2) + \
            "inf-norm: %e" % npla.norm(res['jac'], np.inf))
    logger.info("  Number of iterations:    %6d" % res['nit'])
    logger.info("  N. function evaluations: %6d" % res['nfev'])
    if ders >= 1:
        logger.info("  N. Jacobian evaluations: %6d" % res['njev'])
    if ders >= 2:
        logger.info("  N. Hessian evaluations:  %6d" % res['nhev'])

    # Set the MAP point
    x_map = res['x']
    # Compute the Hessian at the maximizer
    if ders > 0:
        if ders == 2:
            if not hessact:
                hess_map = - pi.hess_x_log_pdf( x_map[nax,:] )[0,:,:]
                d_laplace = NormalDistribution(x_map, precision=hess_map)
            elif issubclass(type(pi), BayesPosteriorDistribution) and is_normal(pi):
                logger.info(
                    "Building low-rank approximation of the Hessian " + \
                    "from action of the Hessian")
                # Low-rank Hessian from the action of the Hessian
                sqrt = bayes_low_rank_sqrt_approximation_hessact(
                    x_map[nax,:], pi, low_rank_rnd_eps, low_rank_rnd_ovsamp,
                    power_n=low_rank_rnd_pow_n)
                d_laplace = NormalDistribution(
                    x_map, square_root_covariance=sqrt)
            else:
                raise NotImplementedError(
                    "The low-rank Hessian from the action of the Hessian " + \
                    "is available only for Bayesian posteriors with " + \
                    "Gaussian prior.")
        elif ders == 1 and hess_approx == 'low-rank' and \
             issubclass(type(pi), BayesPosteriorDistribution) and is_normal(pi) and \
             ( ( issubclass(type(pi.logL), AdditiveLogLikelihood) and is_normal(pi) ) or \
               ( issubclass(type(pi.logL), IndependentLogLikelihood) and \
                 all( ( issubclass(type(ll), AdditiveLogLikelihood) and is_normal(pi) )
                      for ll, _ in pi.logL.factors ) ) ):
            logger.info(
                "Building low-rank approximation of the Hessian from gradients")
            # Construct the low-rank approximation
            sqrt = bayes_low_rank_sqrt_approximation_outer_grad(
                x_map[nax,:], pi, low_rank_rnd_eps, low_rank_rnd_ovsamp,
                power_n=low_rank_rnd_pow_n)
            d_laplace = NormalDistribution(x_map, square_root_covariance=sqrt)
        else:
            # Finite difference
            logger.info(
                "Building finite difference approximation of the Hessian")
            hess_map = - fd(pi.grad_x_log_pdf, x_map[nax,:], hess_fd_eps)[0,:,:]
            d_laplace = NormalDistribution(x_map, precision=hess_map)
    else:
        raise NotImplementedError(
            "The finite difference Hessian is not implemented for ders==0.")
    return d_laplace

def bayes_low_rank_sqrt_approximation_outer_grad(x, pi, eps, r, power_n):

    def action(dx, gx_ll_list, factors, prior, dim, dim_obs):
        Y = np.zeros((dim, dx.shape[1]))
        start = 0
        for gx_ll, (ll, xidxs) in zip(gx_ll_list, factors):
            stop = start + ll.y.shape[0]
            # Compute (\Gamma_{obs}^{-1/2} dx) for the current factor
            G = ll.pi.solve_square_root_covariance_transposed(dx[start:stop,:])
            # Compute ((\nabla_x G(x))^T \Gamma_{obs}^{-1/2} dx) for the current factor
            Y[xidxs,:] += np.einsum('ij,i...->j...', gx_ll, G)
            # Y[xidxs,:] += np.einsum(
            #     '...ij,i...->j...',
            #     ll.T.grad_x(np.tile(x,(dx.shape[1],1))), G )
            start = stop
        # Compute (\Gamma_{pr}^{\top/2} (\nabla_x G(x))^\top \Gamma_{obs}^{-\top/2} dx)
        Y = prior.square_root_covariance.T.dot(Y)
        return Y

    def action_transpose(dx, gx_ll_list, factors, prior, dim, dim_obs):
        # Compute (\Gamma_{pr}^{1/2} dx)
        P = prior.square_root_covariance.dot(dx)
        Y = np.zeros((dim_obs, dx.shape[1]))
        start = 0
        for gx_ll, (ll, xidxs) in zip(gx_ll_list, factors):
            stop = start + ll.y.shape[0]
            # Compute ((\nabla_x G(x)) \Gamma_{pr}^{1/2} dx)
            GP = np.einsum('ij,j...->i...', gx_ll, P)
            # GP = np.einsum(
            #     '...ij,j...->i...',
            #     ll.T.grad_x(np.tile(x,(dx.shape[1],1))), P)
            # Compute (\Gamma_{obs}^{-1/2} (\nabla_x G(x)) \Gamma_{pr}^{1/2} dx)
            Y[start:stop,:] = ll.pi.solve_square_root_covariance(GP)
            start = stop
        return Y

    if issubclass(type(pi.logL), AdditiveLogLikelihood):
        factors = [(pi.logL, list(range(pi.logL.dim_in)))]
    else:
        factors = pi.logL.factors
    dim_obs = sum( ll.y.shape[0] for ll, _ in factors )
    dim = pi.logL.dim_in
    # Precompute all the gradients of the log-likelihoods at x
    gx_ll_list = []
    for ll, xidxs in factors:
        gx_ll_list.append( ll.T.grad_x(x[:,xidxs])[0,:,:] )
    kwargs = {'gx_ll_list': gx_ll_list, 'factors': factors, 'prior': pi.prior,
              'dim': dim, 'dim_obs': dim_obs}
    K, S, Vt = randomized_direct_svd(
        action, action_transpose, dim_obs, dim, eps, r,
        power_n=power_n, kwargs=kwargs)
    D = S**2
    sqrt = np.dot(K * (1/np.sqrt(1+D) - 1)[np.newaxis,:], K.T) + \
        np.eye(pi.prior.dim)
    sqrt = pi.prior.square_root_covariance.dot( sqrt )
    return sqrt


def bayes_low_rank_sqrt_approximation_hessact(x, pi, eps, r, power_n):

    def action(dx, x, logL, prior):
        Y = prior.square_root_covariance.dot(dx)
        Y = - logL.action_hess_x(np.tile(x, (dx.shape[1],1)), Y.T)[:,0,:].T
        Y = prior.square_root_covariance.T.dot(Y)
        return Y

    kwargs = {'x': x, 'logL': pi.logL, 'prior': pi.prior}
    D, K = randomized_direct_eig(
        action, pi.dim, eps, r, power_n=power_n, kwargs=kwargs)
    sqrt = np.dot(K * (1/np.sqrt(1+D) - 1)[np.newaxis,:], K.T) + \
        np.eye(pi.prior.dim)
    sqrt = pi.prior.square_root_covariance.dot( sqrt )
    return sqrt
    # K = np.dot(pi.prior.square_root, K)
    # Gpos = pi.prior.sigma - np.dot(K * (D/(1+D))[np.newaxis,:], K.T)
    # return npla.cholesky(Gpos)
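

# Both helpers above use the same identity: with prior covariance
# :math:`\Gamma_{pr} = L L^\top` and prior-preconditioned data-misfit
# Hessian :math:`H_{pp} = K \mathrm{diag}(D) K^\top` (with :math:`K`
# orthonormal), the posterior covariance is
# :math:`\Gamma_{pos} = L (H_{pp} + I)^{-1} L^\top`, whose square root is
# :math:`L (I + K (\mathrm{diag}(1/\sqrt{1+D}) - I) K^\top)`.
# The following self-contained numpy check (illustration only) verifies the
# identity, substituting an exact eigendecomposition for the randomized one:
def _check_low_rank_sqrt_identity(dim=5, seed=0):
    rng = np.random.RandomState(seed)
    L = np.tril(rng.randn(dim, dim)) + dim * np.eye(dim)  # Gamma_pr = L L^T
    A = rng.randn(dim, 2)
    H = A.dot(A.T)                       # rank-2 Hessian of the data misfit
    Hpp = L.T.dot(H).dot(L)              # prior-preconditioned Hessian
    D, K = npla.eigh(Hpp)                # exact stand-in for randomized_direct_eig
    sqrt = L.dot(np.dot(K * (1/np.sqrt(1+D) - 1)[nax,:], K.T) + np.eye(dim))
    Gpos = npla.inv(npla.inv(L.dot(L.T)) + H)  # posterior covariance
    assert np.allclose(sqrt.dot(sqrt.T), Gpos)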

def laplace_approximation_withBounds(pi, params=None, tol=1e-5, ders=2,
                                     disp=True, bounds=None):
    r""" Compute the Laplace approximation of the distribution :math:`\pi`
    subject to bound constraints.

    Args:
      pi (Distribution): distribution :math:`\pi`
      params (dict): parameters for distribution :math:`\pi`
      tol (float): tolerance to be used to solve the maximization problem.
      ders (int): order of derivatives available for the solution of the
        optimization problem. 0 -> derivative free, 1 -> gradient,
        2 -> Hessian.
      disp (bool): whether to display output from the optimizer.
      bounds (list): sequence of ``(min, max)`` bounds for each coordinate,
        as accepted by :func:`scipy.optimize.minimize`.

    Returns:
      (:class:`NormalDistribution`) -- Laplace approximation
    """
    # Minimize :math:`-\log \pi`
    def objective(x, params):
        return - pi.log_pdf(x[nax,:], params)[0]

    def dx_objective(x, params):
        return - pi.grad_x_log_pdf(x[nax,:], params)[0,:]

    def dx2_objective(x, params):
        return - pi.hess_x_log_pdf(x[nax,:], params)[0,:,:]

    x0 = np.zeros(pi.dim)  # Random or zero starting point? Or input argument?
    options = {'maxiter': 10000, 'disp': disp}
    if ders == 0:
        res = sciopt.minimize(
            objective, args=params, x0=x0, method='L-BFGS-B',
            tol=tol, options=options, bounds=bounds)
    elif ders == 1:
        res = sciopt.minimize(
            objective, args=params, x0=x0, jac=dx_objective,
            method='L-BFGS-B', tol=tol, options=options, bounds=bounds)
    elif ders == 2:
        # TNC does not use second-order information, so only the gradient is
        # passed to the optimizer; the Hessian is evaluated once at the
        # optimum below.
        res = sciopt.minimize(
            objective, args=params, x0=x0, jac=dx_objective,
            method='TNC', tol=tol, options=options, bounds=bounds)
    else:
        raise ValueError("ders parameter not valid. Choose between 0, 1, 2.")
    x_map = res['x']
    # Compute the Hessian at the maximizer
    hess_map = dx2_objective(x_map, params)
    # Define the Gaussian distribution/Laplace approximation
    d_laplace = NormalDistribution(x_map, precision=hess_map)
    return d_laplace
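

# A minimal usage sketch for the bounded variant (illustration only): the
# MAP search is restricted to the positive orthant, so for this Gaussian
# target the constrained MAP point is the projection of the mean onto the
# feasible set, i.e. (0, 2).
def _example_laplace_with_bounds():
    pi_target = NormalDistribution(np.array([-1.0, 2.0]), precision=np.eye(2))
    bounds = [(0., None), (0., None)]  # constrain each coordinate to x_i >= 0
    d_lap = laplace_approximation_withBounds(
        pi_target, ders=1, disp=False, bounds=bounds)
    return d_lap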