Different Instances of IterGP

IterGP defines a whole family of GP approximation methods, whose approximate posterior mean may converge at different speeds. However, its uncertainty always exactly quantifies the relative distance to the latent function, analogous to an exact Gaussian process. Here, we will investigate different instances of IterGP and how they compare.

[1]:
from IPython.display import HTML
from matplotlib import animation
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='svg'
[2]:
import sys

sys.path.insert(
    0, "../../../../experiments"
)  # Needed to import other GP approximations

import warnings

warnings.filterwarnings("ignore", category=RuntimeWarning)

import models
import probnum
from probnum import backend, linops, randvars
from probnum.linalg.solvers import ProbabilisticLinearSolver, information_ops
from probnum.randprocs import kernels, mean_fns
import pykeops

from itergp import GaussianProcess, datasets, methods
from itergp.methods import belief_updates, policies, stopping_criteria

pykeops.verbose = False

Uncertainty Decomposition

First, we’ll illustrate how the reduction in computational uncertainty varies between different IterGP instances.

[3]:
# Generate data
rng_state = backend.random.rng_state(42)

num_data = 10
input_shape = ()
output_shape = ()

rng_state, rng_state_data = backend.random.split(rng_state, num=2)
data = datasets.SyntheticDataset(
    rng_state=rng_state,
    size=(num_data, num_data),
    input_shape=input_shape,
    output_shape=output_shape,
    noise_var=0.1,
)
X = data.train.X
y = data.train.y
[4]:
# Model
mean_fn = mean_fns.Zero(input_shape=input_shape, output_shape=output_shape)
kernel = kernels.Matern(input_shape=input_shape, nu=1.5, lengthscale=0.15)
gp = GaussianProcess(mean_fn, kernel)

# Likelihood
sigma_sq = 0.1
noise = randvars.Normal(
    mean=backend.zeros(y.shape),
    cov=linops.Scaling(sigma_sq, shape=(num_data, num_data)),
)
[5]:
# Prediction
Xnew = backend.linspace(-1.5, 1.5, 1000)
gp_post_true = gp.condition_on_data(X, y, b=noise, approx_method=methods.Cholesky())
[6]:
# Approximation methods setup
pseudo_inputs = backend.asarray(
    [-1.0, -0.5, 0.25, 1.0, 0.0, 0.75, -0.75, -1.25, -0.25, 0.5]
)


class AdverserialSolver(ProbabilisticLinearSolver):
    def __init__(
        self,
        atol: float = 1e-5,
        rtol: float = 1e-5,
        maxiter: int = None,
    ):
        super().__init__(
            policy=policies.AdverserialPolicy(policies.UnitVectorPolicy()),
            information_op=information_ops.ProjectedResidualInformationOp(),
            belief_update=belief_updates.ProjectedResidualBeliefUpdate(),
            stopping_criterion=stopping_criteria.MaxIterationsStoppingCriterion(
                maxiter=maxiter,
            )
            | probnum.linalg.solvers.stopping_criteria.ResidualNormStoppingCriterion(
                atol=atol, rtol=rtol
            ),
        )


# Plot setup
fig, axs = plt.subplots(
    nrows=6,
    ncols=2,
    figsize=(14, 14),
    sharex=True,
    squeeze=False,
    gridspec_kw={"height_ratios": [2, 1, 2, 1, 2, 1]},
)
fig.patch.set_alpha(0.0)  # set figure background opacity to 0
plt.close()


def animate(i):

    for ax in axs.flatten():
        ax.cla()

    # Cholesky
    gp_post = gp.condition_on_data(
        X, y, b=noise, approx_method=methods.Cholesky(maxrank=i)
    )
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[0, 0]
    )
    axs[0, 0].set(ylim=[-2.75, 2.75], ylabel="$y$", title="Cholesky")

    axs[1, 0].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[1, 0].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[1, 0].set(xlabel="$x$", ylabel="Variance")
    axs[1, 0].set_ylim(bottom=0)

    # Pseudo-Input
    pseudo_input_method = methods.PseudoInput(pseudo_inputs=pseudo_inputs[0:i])
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=pseudo_input_method)
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[0, 1]
    )
    axs[0, 1].set(ylabel="$y$", title="Pseudo-Input")

    data_range = X.max() - X.min()
    data_width = 0.015 * data_range
    for j in [0, 1]:
        for z in pseudo_input_method.pseudo_inputs:
            axs[j, 1].axvspan(
                xmin=z - 0.5 * data_width,
                xmax=z + 0.5 * data_width,
                color="black",
                alpha=0.2,
                zorder=-10,
                lw=0.0,
            )

    axs[1, 1].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[1, 1].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[1, 1].set(xlabel="$x$", ylabel="Variance")
    axs[1, 1].set_ylim(bottom=0)

    # Conjugate Gradient
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=methods.CG(maxiter=i))
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[2, 0]
    )
    axs[2, 0].set(ylim=[-2.75, 2.75], ylabel="$y$", title="CG")

    axs[3, 0].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[3, 0].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[3, 0].set(xlabel="$x$", ylabel="Variance")
    axs[3, 0].set_ylim(bottom=0)

    # Spectral (descending)
    gp_post = gp.condition_on_data(
        X, y, b=noise, approx_method=methods.PBR(maxiter=i, descending=True)
    )
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[2, 1]
    )
    axs[2, 1].set(ylim=[-2.75, 2.75], ylabel="$y$", title="Spectral (Descending)")

    axs[3, 1].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[3, 1].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[3, 1].set(xlabel="$x$", ylabel="Variance")
    axs[3, 1].set_ylim(bottom=0)

    # Spectral (ascending)
    gp_post = gp.condition_on_data(
        X, y, b=noise, approx_method=methods.PBR(maxiter=i, descending=False)
    )
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[4, 1]
    )
    axs[4, 1].set(ylim=[-2.75, 2.75], ylabel="$y$", title="Spectral (Ascending)")

    axs[5, 1].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[5, 1].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[5, 1].set(xlabel="$x$", ylabel="Variance")
    axs[5, 1].set_ylim(bottom=0)

    # Spectral (ascending)
    gp_post = gp.condition_on_data(
        X, y, b=noise, approx_method=AdverserialSolver(maxiter=i)
    )
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[4, 0]
    )
    axs[4, 0].set(ylim=[-2.75, 2.75], ylabel="$y$", title="Adverserial")

    axs[5, 0].fill_between(
        x=Xnew,
        y1=backend.zeros_like(Xnew),
        y2=gp_post_true.var(Xnew),
        lw=0.0,
        alpha=0.4,
        color="C0",
    )
    axs[5, 0].fill_between(
        x=Xnew,
        y1=gp_post_true.var(Xnew),
        y2=gp_post.var(Xnew),
        lw=0.25,
        alpha=0.4,
        color="C2",
    )
    axs[5, 0].set(xlabel="$x$", ylabel="Variance")
    axs[5, 0].set_ylim(bottom=0)

    # True GP Posterior
    for m in range(axs.shape[0]):
        if m % 2 == 0:
            for j in range(axs.shape[1]):
                axs[m, j].plot(
                    Xnew,
                    gp_post_true.mean(Xnew),
                    lw=1,
                    color="black",
                    zorder=-1,
                    label="True GP mean",
                )
                axs[m, j].plot(
                    Xnew,
                    gp_post_true.mean(Xnew) + 2 * gp_post_true.std(Xnew),
                    linestyle="--",
                    lw=1,
                    color="black",
                    zorder=-1,
                )
                axs[m, j].plot(
                    Xnew,
                    gp_post_true.mean(Xnew) - 2 * gp_post_true.std(Xnew),
                    linestyle="--",
                    lw=1,
                    color="black",
                    zorder=-1,
                )

    fig.suptitle(f"Iteration {i}")
    fig.tight_layout()


from IPython.display import HTML
from matplotlib import animation

# Create animation
anim = animation.FuncAnimation(
    fig, func=animate, frames=num_data + 1, interval=1250, repeat_delay=4000, blit=False
)

# Create interactive plot
HTML(anim.to_jshtml())
[6]:

Comparison of Different Approximation Methods

We can define a (quadratically expensive) IterGP policy that is very similar in its posterior mean to SVGP and Nyström. To better understand the difference (for a fixed set of inducing points), let’s compare to these classic inducing point methods.

[7]:
# Plot setup
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(14, 14), sharex=True, sharey=True)
fig.patch.set_alpha(0.0)  # set figure background opacity to 0
plt.close()


def animate(i):

    for ax in axs.flatten():
        ax.cla()

    # Cholesky
    gp_post = gp.condition_on_data(
        X, y, b=noise, approx_method=methods.Cholesky(maxrank=i)
    )
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[0, 0]
    )
    axs[0, 0].set(ylim=[-2.75, 2.75], ylabel="$y$", title="Cholesky")

    # CG
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=methods.CG(maxiter=i))
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[0, 1]
    )
    axs[0, 1].set(ylabel="$y$", title="CG")

    # Pseudo-Input
    pseudo_input_method = methods.PseudoInput(pseudo_inputs=pseudo_inputs[0:i])
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=pseudo_input_method)
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[1, 0]
    )
    axs[1, 0].set(ylabel="$y$", title="Pseudo-Input")

    data_range = X.max() - X.min()
    data_width = 0.015 * data_range
    for z in pseudo_input_method.pseudo_inputs:
        axs[1, 0].axvspan(
            xmin=z - 0.5 * data_width,
            xmax=z + 0.5 * data_width,
            color="black",
            alpha=0.2,
            zorder=-10,
            lw=0.0,
        )

    # Projected Bayes regressor
    pbr_method = methods.PBR(maxiter=i, descending=True)
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=pbr_method)

    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[1, 1]
    )
    axs[1, 1].set(ylabel="$y$", title="Projected Bayes Regressor")

    # SVGP
    svgp = models.SVGP(prior=gp, X=X, Y=y, b=noise, inducing_points=pseudo_inputs[0:i])
    svgp.plot(X=Xnew, data=(X, y), ax=axs[2, 0])
    axs[2, 0].set(ylabel="$y$", title="SVGP")

    for z in pseudo_input_method.pseudo_inputs:
        axs[2, 0].axvspan(
            xmin=z - 0.5 * data_width,
            xmax=z + 0.5 * data_width,
            color="black",
            alpha=0.2,
            zorder=-10,
            lw=0.0,
        )

    # Nyström
    nygp = models.NystroemGP(
        prior=gp, X=X, Y=y, b=noise, inducing_points=pseudo_inputs[0:i]
    )
    nygp.plot(X=Xnew, data=(X, y), ax=axs[2, 1])
    axs[2, 1].set(ylabel="$y$", title="Nyström / SoR")

    for z in pseudo_input_method.pseudo_inputs:
        axs[2, 1].axvspan(
            xmin=z - 0.5 * data_width,
            xmax=z + 0.5 * data_width,
            color="black",
            alpha=0.2,
            zorder=-10,
            lw=0.0,
        )

    # Mixed strategy (PseudoInput + CG)
    mixed_method = methods.MixedStrategy(
        base_policies=(
            policies.PseudoInputPolicy(pseudo_inputs=pseudo_inputs[0:i]),
            policies.GradientPolicy(),
        ),
        iters=(4,),
        maxiter=i,
    )
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=mixed_method)
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[3, 0]
    )
    axs[3, 0].set(ylabel="$y$", title="Mixed (PseudoInput + CG)")

    for z in pseudo_input_method.pseudo_inputs:
        axs[3, 0].axvspan(
            xmin=z - 0.5 * data_width,
            xmax=z + 0.5 * data_width,
            color="black",
            alpha=0.2,
            zorder=-10,
            lw=0.0,
        )

    # Mixed strategy (Cholesky + CG)
    mixed_method = methods.MixedStrategy(
        base_policies=(policies.UnitVectorPolicy(), policies.GradientPolicy()),
        iters=(4,),
        maxiter=i,
    )
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=mixed_method)
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[3, 1]
    )
    axs[3, 1].set(ylabel="$y$", title="Mixed (Cholesky + CG)")

    # Mixed strategy (CG + PBR)
    mixed_method = methods.MixedStrategy(
        base_policies=(policies.GradientPolicy(), policies.EigenvectorPolicy()),
        iters=(4,),
        maxiter=i,
    )
    gp_post = gp.condition_on_data(X, y, b=noise, approx_method=mixed_method)
    gp_post.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=True, ax=axs[4, 0]
    )
    axs[4, 0].set(xlabel="$X$", ylabel="$y$", title="Mixed (CG + PBR)")

    # Exact
    gp_post_true.plot(
        X=Xnew, data=(X, y), samples=0, computational_predictive=False, ax=axs[4, 1]
    )
    axs[4, 1].set(xlabel="$X$", ylabel="$y$", title="Mathematical Posterior")

    # True GP Posterior
    for m in range(axs.shape[0]):
        for j in range(axs.shape[1]):
            axs[m, j].plot(
                Xnew,
                gp_post_true.mean(Xnew),
                lw=1,
                color="black",
                zorder=-1,
                label="True GP mean",
            )
            axs[m, j].plot(
                Xnew,
                gp_post_true.mean(Xnew) + 2 * gp_post_true.std(Xnew),
                linestyle="--",
                lw=1,
                color="black",
                zorder=-1,
            )
            axs[m, j].plot(
                Xnew,
                gp_post_true.mean(Xnew) - 2 * gp_post_true.std(Xnew),
                linestyle="--",
                lw=1,
                color="black",
                zorder=-1,
            )

    fig.suptitle(f"Iteration {i}")
    fig.tight_layout()


from IPython.display import HTML
from matplotlib import animation

# Create animation
anim = animation.FuncAnimation(
    fig, func=animate, frames=num_data + 1, interval=1250, repeat_delay=4000, blit=False
)

# Create interactive plot
HTML(anim.to_jshtml())
[7]:
[ ]: