Source code for kdelearn.bandwidth_selection

from typing import Optional

import numpy as np
from numpy import ndarray

from .cutils import isdd  # , compute_unbiased_kde

# from scipy.optimize import Bounds, minimize, newton


kernel_properties = {
    "gaussian": (1 / (2 * np.sqrt(np.pi)), 1),
    "uniform": (0.5, 1 / 3),
    "epanechnikov": (0.6, 0.2),
    "cauchy": (5 / (4 * np.pi), 1),
}


[docs]def normal_reference(
    x_train: ndarray,
    weights_train: Optional[ndarray] = None,
    kernel_name: str = "gaussian",
) -> ndarray:
    """AMISE-optimal bandwidth for the (assuming) gaussian density.

    See paragraph (3.2.1) in [1].

    Parameters
    ----------
    x_train : ndarray of shape (m_train, n)
        Data points containing data with float type.
    weights_train : ndarray of shape (m_train,), optional
        Weights of data points. If None, all data points are equally weighted.
    kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian'
        Name of kernel function.

    Returns
    -------
    bandwidth : ndarray of shape (n,)
        Smoothing parameter for scaling the estimator.

    Examples
    --------
    >>> x_train = np.random.normal(0, 1, size=(100, 1))
    >>> bandwidth = normal_reference(x_train, kernel_name="gaussian")

    References
    ----------
    [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995.
    """
    if x_train.ndim != 2:
        raise ValueError("invalid shape of 'x_train' - should be 2d")
    m_train = x_train.shape[0]

    if weights_train is None:
        weights_train = np.full(m_train, 1 / m_train)
    else:
        if weights_train.ndim != 1:
            raise ValueError("invalid shape of 'weights_train' - should be 1d")
        if weights_train.shape[0] != x_train.shape[0]:
            raise ValueError("invalid size of 'weights_train'")
        if not (weights_train >= 0).all():
            raise ValueError("'weights_train' should be non negative")
        weights_train = weights_train / weights_train.sum()

    if kernel_name not in kernel_properties:
        available_kernels = list(kernel_properties.keys())
        raise ValueError(f"invalid 'kernel_name' - choose one of {available_kernels}")

    m_train = x_train.shape[0]
    # Unbiased weighted standard deviation
    x_mean = np.average(x_train, weights=weights_train, axis=0)
    x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0)
    weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var)

    wk, uk = kernel_properties[kernel_name]
    zf = 3 / (8 * np.sqrt(np.pi) * weighted_std_x**5)

    bandwidth = (wk / (uk**2 * zf * m_train)) ** 0.2
    return bandwidth


[docs]def direct_plugin(
    x_train: ndarray,
    weights_train: Optional[ndarray] = None,
    kernel_name: str = "gaussian",
    stage: int = 2,
):
    """Direct plug-in method with gaussian kernel used in estimation of integrated
    squared density derivatives limited to maximum value of `stage` equal to 3.

    See paragraph (3.6.1) in [1].

    Parameters
    ----------
    x_train : ndarray of shape (m_train, n)
        Data points containing data with float type.
    weights_train : ndarray of shape (m_train,), optional
        Weights of data points. If None, all data points are equally weighted.
    kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian'
        Name of kernel function.
    stage : int, default=2
        Depth of plugging-in (max 3).

    Returns
    -------
    bandwidth : ndarray of shape (n,)
        Smoothing parameter for scaling the estimator.

    Examples
    --------
    >>> x_train = np.random.normal(0, 1, size=(100, 1))
    >>> bandwidth = direct_plugin(x_train, kernel_name="gaussian", stage=2)

    References
    ----------
    [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995.
    """
    if x_train.ndim != 2:
        raise ValueError("invalid shape of 'x_train' - should be 2d")
    m_train = x_train.shape[0]

    if weights_train is None:
        weights_train = np.full(m_train, 1 / m_train)
    else:
        if weights_train.ndim != 1:
            raise ValueError("invalid shape of 'weights_train' - should be 1d")
        if weights_train.shape[0] != x_train.shape[0]:
            raise ValueError("invalid size of 'weights_train'")
        if not (weights_train >= 0).all():
            raise ValueError("'weights_train' should be non negative")
        weights_train = weights_train / weights_train.sum()

    if kernel_name not in kernel_properties:
        available_kernels = list(kernel_properties.keys())
        raise ValueError(f"invalid 'kernel_name' - choose one of {available_kernels}")

    if not isinstance(stage, int):
        raise ValueError("invalid type of 'stage' - should be of an int type")

    if stage < 0 or stage > 3:
        raise ValueError("invalid 'stage' - should be greater than 0 and less than 4")

    m_train = x_train.shape[0]

    # Unbiased weighted standard deviation
    x_mean = np.average(x_train, weights=weights_train, axis=0)
    x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0)
    weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var)
    wk, uk = kernel_properties[kernel_name]

    def _psi(r):
        n = (-1) ** (0.5 * r) * np.math.factorial(r)
        tmp = np.math.factorial(int(0.5 * r)) * np.sqrt(np.pi)
        d = (2 * weighted_std_x) ** (r + 1) * tmp
        return n / d

    def _bw(gd, zf, b):
        # There is hidden uk variable in denominator (equal to 1 for gaussian kernel)
        return (-2 * gd / (zf * m_train)) ** (1 / (b + 1))

    # Gaussian derivatives of order 4, 6 and 8 at zero arg
    gds = {"gd8": 41.888939, "gd6": -5.984134, "gd4": 1.196827}

    r = 2 * stage + 4
    zf = _psi(r)
    while r != 4:
        r -= 2
        gd_at_zero = gds[f"gd{r}"]
        bw = _bw(gd_at_zero, zf, r + 2)
        zf = isdd(x_train, bw, r)

    bandwidth = (wk / (uk**2 * zf * m_train)) ** 0.2
    return bandwidth


# def ste_plugin(
#     x_train: ndarray,
#     weights_train: Optional[ndarray] = None,
#     kernel_name: str = "gaussian",
# ):
#     """Solve-the-equation plug-in method.
#
#     See paragraph (3.6.2) in [1].
#
#     Parameters
#     ----------
#     x_train : ndarray of shape (m_train, n)
#         Data points as an array containing data with float type.
#     weights_train : ndarray of shape (m_train,), optional
#         Weights of data points. If None, all points are equally weighted.
#     kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian'  # noqa
#         Name of kernel function.
#
#     Returns
#     -------
#     bandwidth : ndarray of shape (n,)
#         Smoothing parameter.
#
#     Examples
#     --------
#     >>> x_train = np.random.normal(0, 1, size=(100, 1))
#     >>> bandwidth = ste_plugin(x_train, kernel_name="gaussian")
#
#     References
#     ----------
#     [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995.
#     """
#     if x_train.ndim != 2:
#         raise ValueError("invalid shape of 'x_train' - should be 2d")
#
#     if kernel_name not in kernel_properties:
#         available_kernels = list(kernel_properties.keys())
#         raise ValueError(f"invalid 'kernel_name' - try one of {available_kernels}")
#
#     m_train = x_train.shape[0]
#     if weights_train is None:
#         weights_train = np.full(m_train, 1 / m_train)
#
#     # Unbiased weighted standard deviation
#     x_mean = np.average(x_train, weights=weights_train, axis=0)
#     x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0)
#     weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var)
#     wk, uk = kernel_properties[kernel_name]
#
#     def eq(h):
#         a = 0.920 * weighted_std_x * m_train ** (-1 / 7)
#         b = 0.912 * weighted_std_x * m_train ** (-1 / 9)
#         sda = isdd(x_train, weights_train, a, 4)
#         tdb = -isdd(x_train, weights_train, b, 6)
#
#         alpha2 = 1.357 * (sda / tdb) ** (1 / 7) * h ** (5 / 7)
#         sdalpha2 = isdd(x_train, weights_train, alpha2, 4)
#         return (wk / (uk**2 * sdalpha2 * m_train)) ** 0.2 - h
#
#     # Solve the equation using secant method
#     bandwidth0 = normal_reference(x_train, weights_train, kernel_name)
#     bandwidth = newton(eq, bandwidth0)
#     return bandwidth
#
#
# def ml_cv(
#     x_train: ndarray,
#     kernel_name: str = "gaussian",
#     weights_train: Optional[ndarray] = None,
# ):
#     """Likelihood cross-validation.
#
#     See paragraph (3.4.4) in [1].
#
#     Parameters
#     ----------
#     x_train : ndarray of shape (m_train, n)
#         Data points as an array containing data with float type.
#     kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian'  # noqa
#         Name of kernel function.
#     weights_train : ndarray of shape (m_train,), optional
#         Weights of data points. If None, all points are equally weighted.
#
#     Returns
#     -------
#     bandwidth : ndarray of shape (n,)
#         Smoothing parameter.
#
#     Examples
#     --------
#     >>> x_train = np.random.normal(0, 1, size=(100, 1))
#     >>> m_train = x_train.shape[0]
#     >>> weights_train = np.full(m_train, 1 / m_train)
#     >>> bandwidth = ml_cv(x_train, "gaussian", weights_train)
#
#     References
#     ----------
#     [1] Silverman, B. W. Density Estimation for Statistics and Data Analysis.
#     Chapman and Hall, 1986.
#     """
#     if x_train.ndim != 2:
#         raise ValueError("invalid shape of 'x_train' - should be 2d")
#
#     if kernel_name not in kernel_properties:
#         available_kernels = list(kernel_properties.keys())
#         raise ValueError(f"invalid 'kernel_name' - try one of {available_kernels}")
#
#     if weights_train is not None:
#         if len(weights_train.shape) != 1:
#             raise ValueError("invalid shape of 'weights_train' - should be 1d")
#         if not (weights_train > 0).all():
#             raise ValueError("'weights_train' must be positive")
#         weights_train = weights_train / weights_train.sum()
#     else:
#         m_train = x_train.shape[0]
#         weights_train = np.full(m_train, 1 / m_train)
#
#     # Minimize the equation with nelder-mead method
#     bandwidth0 = normal_reference(x_train, weights_train, kernel_name)
#     smallest_pos_num = np.nextafter(0, 1)
#
#     def eq(h):
#         scores = compute_unbiased_kde(x_train, weights_train, h, kernel_name)
#         return -np.mean(np.log(scores + smallest_pos_num))
#
#     bounds = Bounds(smallest_pos_num, np.inf)
#     res = minimize(eq, bandwidth0, method="nelder-mead", bounds=bounds)
#     bandwidth = res.x
#     return bandwidth