DOC clarify the kernel gradient for GaussianProcesses (scikit-learn#18115)

rauwuckl · web-flow · commit eb7b15817e23 · 2020-08-16T09:54:18.000+10:00
diff --git a/doc/modules/gaussian_process.rst b/doc/modules/gaussian_process.rst
@@ -385,12 +385,14 @@ equivalent call to ``__call__``: ``np.diag(k(X, X)) == k.diag(X)``
 Kernels are parameterized by a vector :math:`\theta` of hyperparameters. These
 hyperparameters can for instance control length-scales or periodicity of a
 kernel (see below). All kernels support computing analytic gradients 
-of the kernel's auto-covariance with respect to :math:`\theta` via setting
-``eval_gradient=True`` in the ``__call__`` method. This gradient is used by the
-Gaussian process (both regressor and classifier) in computing the gradient
-of the log-marginal-likelihood, which in turn is used to determine the
-value of :math:`\theta`, which maximizes the log-marginal-likelihood,  via
-gradient ascent. For each hyperparameter, the initial value and the
+of the kernel's auto-covariance with respect to :math:`\log(\theta)` via setting
+``eval_gradient=True`` in the ``__call__`` method.
+That is, a ``(len(X), len(X), len(theta))`` array is returned where the entry
+``[i, j, l]`` contains :math:`\frac{\partial k_\theta(x_i, x_j)}{\partial \log(\theta_l)}`.
+This gradient is used by the Gaussian process (both regressor and classifier)
+in computing the gradient of the log-marginal-likelihood, which in turn is used
+to determine the value of :math:`\theta` that maximizes the log-marginal-likelihood,
+via gradient ascent. For each hyperparameter, the initial value and the
 bounds need to be specified when creating an instance of the kernel. The
 current value of :math:`\theta` can be get and set via the property
 ``theta`` of the kernel object. Moreover, the bounds of the hyperparameters can be
diff --git a/sklearn/gaussian_process/kernels.py b/sklearn/gaussian_process/kernels.py
@@ -572,8 +572,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            Determines whether the gradient with respect to the log of the
+            kernel hyperparameter is computed.
 
         Returns
         -------
@@ -582,7 +582,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape \
                 (n_samples_X, n_samples_X, n_dims, n_kernels), optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -796,8 +796,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
 
         Returns
         -------
@@ -806,7 +806,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -894,8 +894,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
 
         Returns
         -------
@@ -904,7 +904,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -1072,8 +1072,8 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
 
         Returns
         -------
@@ -1082,7 +1082,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -1200,8 +1200,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -1210,7 +1211,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
             optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1319,8 +1320,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             is evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -1329,7 +1331,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
             optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1466,8 +1468,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -1476,7 +1479,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -1620,8 +1623,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -1630,7 +1634,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -1809,16 +1813,17 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
         K : ndarray of shape (n_samples_X, n_samples_Y)
             Kernel k(X, Y)
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims)
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when eval_gradient
             is True.
         """
@@ -1954,8 +1959,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -1964,7 +1970,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims), \
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -2086,8 +2092,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -2096,7 +2103,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """
@@ -2240,8 +2247,9 @@ def __call__(self, X, Y=None, eval_gradient=False):
             if evaluated instead.
 
         eval_gradient : bool, default=False
-            Determines whether the gradient with respect to the kernel
-            hyperparameter is determined. Only supported when Y is None.
+            Determines whether the gradient with respect to the log of
+            the kernel hyperparameter is computed.
+            Only supported when Y is None.
 
         Returns
         -------
@@ -2250,7 +2258,7 @@ def __call__(self, X, Y=None, eval_gradient=False):
 
         K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\
                 optional
-            The gradient of the kernel k(X, X) with respect to the
+            The gradient of the kernel k(X, X) with respect to the log of the
             hyperparameter of the kernel. Only returned when `eval_gradient`
             is True.
         """