Source code for kdelearn.bandwidth_selection

from typing import Optional

import numpy as np
from numpy import ndarray

from .cutils import isdd  # , compute_unbiased_kde

# from scipy.optimize import Bounds, minimize, newton


kernel_properties = {
    "gaussian": (1 / (2 * np.sqrt(np.pi)), 1),
    "uniform": (0.5, 1 / 3),
    "epanechnikov": (0.6, 0.2),
    "cauchy": (5 / (4 * np.pi), 1),
}


[docs]def normal_reference( x_train: ndarray, weights_train: Optional[ndarray] = None, kernel_name: str = "gaussian", ) -> ndarray: """AMISE-optimal bandwidth for the (assuming) gaussian density. See paragraph (3.2.1) in [1]. Parameters ---------- x_train : ndarray of shape (m_train, n) Data points containing data with float type. weights_train : ndarray of shape (m_train,), optional Weights of data points. If None, all data points are equally weighted. kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian' Name of kernel function. Returns ------- bandwidth : ndarray of shape (n,) Smoothing parameter for scaling the estimator. Examples -------- >>> x_train = np.random.normal(0, 1, size=(100, 1)) >>> bandwidth = normal_reference(x_train, kernel_name="gaussian") References ---------- [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995. """ if x_train.ndim != 2: raise ValueError("invalid shape of 'x_train' - should be 2d") m_train = x_train.shape[0] if weights_train is None: weights_train = np.full(m_train, 1 / m_train) else: if weights_train.ndim != 1: raise ValueError("invalid shape of 'weights_train' - should be 1d") if weights_train.shape[0] != x_train.shape[0]: raise ValueError("invalid size of 'weights_train'") if not (weights_train >= 0).all(): raise ValueError("'weights_train' should be non negative") weights_train = weights_train / weights_train.sum() if kernel_name not in kernel_properties: available_kernels = list(kernel_properties.keys()) raise ValueError(f"invalid 'kernel_name' - choose one of {available_kernels}") m_train = x_train.shape[0] # Unbiased weighted standard deviation x_mean = np.average(x_train, weights=weights_train, axis=0) x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0) weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var) wk, uk = kernel_properties[kernel_name] zf = 3 / (8 * np.sqrt(np.pi) * weighted_std_x**5) bandwidth = (wk / (uk**2 * zf * m_train)) ** 0.2 return bandwidth
[docs]def direct_plugin( x_train: ndarray, weights_train: Optional[ndarray] = None, kernel_name: str = "gaussian", stage: int = 2, ): """Direct plug-in method with gaussian kernel used in estimation of integrated squared density derivatives limited to maximum value of `stage` equal to 3. See paragraph (3.6.1) in [1]. Parameters ---------- x_train : ndarray of shape (m_train, n) Data points containing data with float type. weights_train : ndarray of shape (m_train,), optional Weights of data points. If None, all data points are equally weighted. kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian' Name of kernel function. stage : int, default=2 Depth of plugging-in (max 3). Returns ------- bandwidth : ndarray of shape (n,) Smoothing parameter for scaling the estimator. Examples -------- >>> x_train = np.random.normal(0, 1, size=(100, 1)) >>> bandwidth = direct_plugin(x_train, kernel_name="gaussian", stage=2) References ---------- [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995. """ if x_train.ndim != 2: raise ValueError("invalid shape of 'x_train' - should be 2d") m_train = x_train.shape[0] if weights_train is None: weights_train = np.full(m_train, 1 / m_train) else: if weights_train.ndim != 1: raise ValueError("invalid shape of 'weights_train' - should be 1d") if weights_train.shape[0] != x_train.shape[0]: raise ValueError("invalid size of 'weights_train'") if not (weights_train >= 0).all(): raise ValueError("'weights_train' should be non negative") weights_train = weights_train / weights_train.sum() if kernel_name not in kernel_properties: available_kernels = list(kernel_properties.keys()) raise ValueError(f"invalid 'kernel_name' - choose one of {available_kernels}") if not isinstance(stage, int): raise ValueError("invalid type of 'stage' - should be of an int type") if stage < 0 or stage > 3: raise ValueError("invalid 'stage' - should be greater than 0 and less than 4") m_train = x_train.shape[0] # Unbiased weighted standard deviation x_mean = np.average(x_train, weights=weights_train, axis=0) x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0) weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var) wk, uk = kernel_properties[kernel_name] def _psi(r): n = (-1) ** (0.5 * r) * np.math.factorial(r) tmp = np.math.factorial(int(0.5 * r)) * np.sqrt(np.pi) d = (2 * weighted_std_x) ** (r + 1) * tmp return n / d def _bw(gd, zf, b): # There is hidden uk variable in denominator (equal to 1 for gaussian kernel) return (-2 * gd / (zf * m_train)) ** (1 / (b + 1)) # Gaussian derivatives of order 4, 6 and 8 at zero arg gds = {"gd8": 41.888939, "gd6": -5.984134, "gd4": 1.196827} r = 2 * stage + 4 zf = _psi(r) while r != 4: r -= 2 gd_at_zero = gds[f"gd{r}"] bw = _bw(gd_at_zero, zf, r + 2) zf = isdd(x_train, bw, r) bandwidth = (wk / (uk**2 * zf * m_train)) ** 0.2 return bandwidth
# def ste_plugin( # x_train: ndarray, # weights_train: Optional[ndarray] = None, # kernel_name: str = "gaussian", # ): # """Solve-the-equation plug-in method. # # See paragraph (3.6.2) in [1]. # # Parameters # ---------- # x_train : ndarray of shape (m_train, n) # Data points as an array containing data with float type. # weights_train : ndarray of shape (m_train,), optional # Weights of data points. If None, all points are equally weighted. # kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian' # noqa # Name of kernel function. # # Returns # ------- # bandwidth : ndarray of shape (n,) # Smoothing parameter. # # Examples # -------- # >>> x_train = np.random.normal(0, 1, size=(100, 1)) # >>> bandwidth = ste_plugin(x_train, kernel_name="gaussian") # # References # ---------- # [1] Wand, M. P. and Jones, M. C. Kernel Smoothing. Chapman and Hall, 1995. # """ # if x_train.ndim != 2: # raise ValueError("invalid shape of 'x_train' - should be 2d") # # if kernel_name not in kernel_properties: # available_kernels = list(kernel_properties.keys()) # raise ValueError(f"invalid 'kernel_name' - try one of {available_kernels}") # # m_train = x_train.shape[0] # if weights_train is None: # weights_train = np.full(m_train, 1 / m_train) # # # Unbiased weighted standard deviation # x_mean = np.average(x_train, weights=weights_train, axis=0) # x_var = np.average((x_train - x_mean) ** 2, weights=weights_train, axis=0) # weighted_std_x = np.sqrt(m_train / (m_train - 1) * x_var) # wk, uk = kernel_properties[kernel_name] # # def eq(h): # a = 0.920 * weighted_std_x * m_train ** (-1 / 7) # b = 0.912 * weighted_std_x * m_train ** (-1 / 9) # sda = isdd(x_train, weights_train, a, 4) # tdb = -isdd(x_train, weights_train, b, 6) # # alpha2 = 1.357 * (sda / tdb) ** (1 / 7) * h ** (5 / 7) # sdalpha2 = isdd(x_train, weights_train, alpha2, 4) # return (wk / (uk**2 * sdalpha2 * m_train)) ** 0.2 - h # # # Solve the equation using secant method # bandwidth0 = normal_reference(x_train, weights_train, kernel_name) # bandwidth = newton(eq, bandwidth0) # return bandwidth # # # def ml_cv( # x_train: ndarray, # kernel_name: str = "gaussian", # weights_train: Optional[ndarray] = None, # ): # """Likelihood cross-validation. # # See paragraph (3.4.4) in [1]. # # Parameters # ---------- # x_train : ndarray of shape (m_train, n) # Data points as an array containing data with float type. # kernel_name : {'gaussian', 'uniform', 'epanechnikov', 'cauchy'}, default='gaussian' # noqa # Name of kernel function. # weights_train : ndarray of shape (m_train,), optional # Weights of data points. If None, all points are equally weighted. # # Returns # ------- # bandwidth : ndarray of shape (n,) # Smoothing parameter. # # Examples # -------- # >>> x_train = np.random.normal(0, 1, size=(100, 1)) # >>> m_train = x_train.shape[0] # >>> weights_train = np.full(m_train, 1 / m_train) # >>> bandwidth = ml_cv(x_train, "gaussian", weights_train) # # References # ---------- # [1] Silverman, B. W. Density Estimation for Statistics and Data Analysis. # Chapman and Hall, 1986. # """ # if x_train.ndim != 2: # raise ValueError("invalid shape of 'x_train' - should be 2d") # # if kernel_name not in kernel_properties: # available_kernels = list(kernel_properties.keys()) # raise ValueError(f"invalid 'kernel_name' - try one of {available_kernels}") # # if weights_train is not None: # if len(weights_train.shape) != 1: # raise ValueError("invalid shape of 'weights_train' - should be 1d") # if not (weights_train > 0).all(): # raise ValueError("'weights_train' must be positive") # weights_train = weights_train / weights_train.sum() # else: # m_train = x_train.shape[0] # weights_train = np.full(m_train, 1 / m_train) # # # Minimize the equation with nelder-mead method # bandwidth0 = normal_reference(x_train, weights_train, kernel_name) # smallest_pos_num = np.nextafter(0, 1) # # def eq(h): # scores = compute_unbiased_kde(x_train, weights_train, h, kernel_name) # return -np.mean(np.log(scores + smallest_pos_num)) # # bounds = Bounds(smallest_pos_num, np.inf) # res = minimize(eq, bandwidth0, method="nelder-mead", bounds=bounds) # bandwidth = res.x # return bandwidth