"""
This module contains loss classes suitable for fitting.

It is not part of the public API.
Specific losses are used for regression, binary classification or multiclass
classification.
"""
import numbers

import numpy as np
from scipy.special import xlogy

from ..utils import check_scalar
from ..utils.stats import _weighted_percentile
from ._loss import (
    CyAbsoluteError,
    CyExponentialLoss,
    CyHalfBinomialLoss,
    CyHalfGammaLoss,
    CyHalfMultinomialLoss,
    CyHalfPoissonLoss,
    CyHalfSquaredError,
    CyHalfTweedieLoss,
    CyHalfTweedieLossIdentity,
    CyHuberLoss,
    CyPinballLoss,
)
from .link import (
    HalfLogitLink,
    IdentityLink,
    Interval,
    LogitLink,
    LogLink,
    MultinomialLogit,
)


class BaseLoss:
    """Base class for a loss function of 1-dimensional targets.

    Conventions:

        - y_true.shape = sample_weight.shape = (n_samples,)
        - y_pred.shape = raw_prediction.shape = (n_samples,)
        - If is_multiclass is true (multiclass classification), then
          y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
          Note that this corresponds to the return value of decision_function.

    y_true, y_pred, sample_weight and raw_prediction must either be all float64
    or all float32.
    gradient and hessian must be either both float64 or both float32.

    Note that y_pred = link.inverse(raw_prediction).

    Specific loss classes can inherit specific link classes to satisfy
    BaseLink's abstractmethods.

    Parameters
    ----------
    sample_weight : {None, ndarray}
        If sample_weight is None, the hessian might be constant.
    n_classes : {None, int}
        The number of classes for classification, else None.

    Attributes
    ----------
    closs : CyLossFunction
    link : BaseLink
    interval_y_true : Interval
        Valid interval for y_true
    interval_y_pred : Interval
        Valid interval for y_pred
    differentiable : bool
        Indicates whether or not the loss function is differentiable in
        raw_prediction everywhere.
    need_update_leaves_values : bool
        Indicates whether decision trees in gradient boosting need to update
        leaf values after having been fit to the (negative) gradients.
    approx_hessian : bool
        Indicates whether the hessian is approximated or exact. If
        approximated, it should be larger than or equal to the exact one.
    constant_hessian : bool
        Indicates whether the hessian is one for this loss.
    is_multiclass : bool
        Indicates whether n_classes > 2 is allowed.
    """

    differentiable = True
    need_update_leaves_values = False
    is_multiclass = False

    def __init__(self, closs, link, n_classes=None):
        self.closs = closs
        self.link = link
        self.approx_hessian = False
        self.constant_hessian = False
        self.n_classes = n_classes
        self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        self.interval_y_pred = self.link.interval_y_pred

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y)

    def in_y_pred_range(self, y):
        """Return True if y is in the valid range of y_pred.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_pred.includes(y)

    def loss(
        self, y_true, raw_prediction, sample_weight=None, loss_out=None, n_threads=1
    ):
        """Compute the pointwise loss value for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.
        """
        if loss_out is None:
            loss_out = np.empty_like(y_true)
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)

        self.closs.loss(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            n_threads=n_threads,
        )
        return loss_out

    def loss_gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        loss_out=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute loss and gradient w.r.t. raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        loss_out : None or C-contiguous array of shape (n_samples,)
            A location into which the loss is stored. If None, a new array
            might be created.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : array of shape (n_samples,)
            Element-wise loss function.

        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if loss_out is None:
            if gradient_out is None:
                loss_out = np.empty_like(y_true)
                gradient_out = np.empty_like(raw_prediction)
            else:
                loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
        elif gradient_out is None:
            gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        self.closs.loss_gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            loss_out=loss_out,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )
        return loss_out, gradient_out

    def gradient(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        n_threads=1,
    ):
        """Compute gradient of loss w.r.t raw_prediction for each input.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the result is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.
        """
        if gradient_out is None:
            gradient_out = np.empty_like(raw_prediction)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)

        self.closs.gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            n_threads=n_threads,
        )
        return gradient_out

    def gradient_hessian(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        hessian_out=None,
        n_threads=1,
    ):
        """Compute gradient and hessian of loss w.r.t raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or C-contiguous array of shape (n_samples,) or array \
            of shape (n_samples, n_classes)
            A location into which the hessian is stored. If None, a new array
            might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise gradients.

        hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
            Element-wise hessians.
        """
        if gradient_out is None:
            if hessian_out is None:
                gradient_out = np.empty_like(raw_prediction)
                hessian_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(hessian_out)
        elif hessian_out is None:
            hessian_out = np.empty_like(gradient_out)

        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
            gradient_out = gradient_out.squeeze(1)
        if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
            hessian_out = hessian_out.squeeze(1)

        self.closs.gradient_hessian(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            hessian_out=hessian_out,
            n_threads=n_threads,
        )
        return gradient_out, hessian_out

    def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
        """Compute the weighted average loss.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        loss : float
            Mean or averaged loss function.
        """
        return np.average(
            self.loss(
                y_true=y_true,
                raw_prediction=raw_prediction,
                sample_weight=None,
                loss_out=None,
                n_threads=n_threads,
            ),
            weights=sample_weight,
        )

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This can be used as initial estimates of predictions, i.e. before the
        first iteration in fit.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or array of shape (n_samples,)
            Sample weights.

        Returns
        -------
        raw_prediction : numpy scalar or array of shape (n_classes,)
            Raw predictions of an intercept-only model.
        """
        # As default, take the weighted average of the target over the samples
        # axis=0 and then transform into link-scale (raw_prediction), clipping
        # to the valid interval of y_pred if necessary.
        y_pred = np.average(y_true, weights=sample_weight, axis=0)
        eps = 10 * np.finfo(y_pred.dtype).eps

        if self.interval_y_pred.low == -np.inf:
            a_min = None
        elif self.interval_y_pred.low_inclusive:
            a_min = self.interval_y_pred.low
        else:
            a_min = self.interval_y_pred.low + eps

        if self.interval_y_pred.high == np.inf:
            a_max = None
        elif self.interval_y_pred.high_inclusive:
            a_max = self.interval_y_pred.high
        else:
            a_max = self.interval_y_pred.high - eps

        if a_min is None and a_max is None:
            return self.link.link(y_pred)
        else:
            return self.link.link(np.clip(y_pred, a_min, a_max))

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        """Calculate term dropped in loss.

        With this term added, the loss of perfect predictions is zero.
        """
        return np.zeros_like(y_true)

    def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
        """Initialize arrays for gradients and hessians.

        Unless hessians are constant, arrays are initialized with undefined values.

        Parameters
        ----------
        n_samples : int
            The number of samples, usually passed to `fit()`.
        dtype : {np.float64, np.float32}, default=np.float64
            The dtype of the arrays gradient and hessian.
        order : {'C', 'F'}, default='F'
            Order of the arrays gradient and hessian. The default 'F' makes the arrays
            contiguous along samples.

        Returns
        -------
        gradient : C-contiguous array of shape (n_samples,) or array of shape \
            (n_samples, n_classes)
            Empty array (allocated but not initialized) to be used as argument
            gradient_out.
        hessian : C-contiguous array of shape (n_samples,), array of shape
            (n_samples, n_classes) or shape (1,)
            Empty (allocated but not initialized) array to be used as argument
            hessian_out.
            If constant_hessian is True (e.g. `HalfSquaredError`), the array is
            initialized to ``1``.
        """
        if dtype not in (np.float32, np.float64):
            raise ValueError(
                "Valid options for 'dtype' are np.float32 and np.float64. "
                f"Got dtype={dtype} instead."
            )

        if self.is_multiclass:
            shape = (n_samples, self.n_classes)
        else:
            shape = (n_samples,)
        gradient = np.empty(shape=shape, dtype=dtype, order=order)

        if self.constant_hessian:
            # If the hessians are constant, we consider them equal to 1.
            # - This is correct for HalfSquaredError.
            # - For AbsoluteError, the hessians are actually 0, but they are
            #   always ignored anyway.
            hessian = np.ones(shape=(1,), dtype=dtype)
        else:
            hessian = np.empty(shape=shape, dtype=dtype, order=order)

        return gradient, hessian


class HalfSquaredError(BaseLoss):
    """Half squared error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half squared error is defined as::

        loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2

    The factor of 0.5 simplifies the computation of gradients and results in a
    unit hessian (and is consistent with what is done in LightGBM). It is also
    half the Normal distribution deviance.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
        self.constant_hessian = sample_weight is None


class AbsoluteError(BaseLoss):
    """Absolute error with identity link, for regression.

    Domain:
    y_true and y_pred all real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the absolute error is defined as::

        loss(x_i) = |y_true_i - raw_prediction_i|

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is None:
            return np.median(y_true, axis=0)
        else:
            return _weighted_percentile(y_true, sample_weight, 50)


class PinballLoss(BaseLoss):
    """Quantile loss aka pinball loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the pinball loss is defined as::

        loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)

        rho_{quantile}(u) = u * (quantile - 1_{u<0})
                          = -u * (1 - quantile)  if u < 0
                             u * quantile        if u >= 0

    Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().

    Note that the exact hessian = 0 almost everywhere (except at one point, therefore
    differentiable = False). Optimization routines like in HGBT, however, need a
    hessian > 0. Therefore, we assign 1.

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level of the quantile to be estimated. Must be in range (0, 1).
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.5):
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        super().__init__(
            closs=CyPinballLoss(quantile=float(quantile)),
            link=IdentityLink(),
        )
        self.approx_hessian = True
        self.constant_hessian = sample_weight is None

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        if sample_weight is None:
            return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
        else:
            return _weighted_percentile(
                y_true, sample_weight, 100 * self.closs.quantile
            )


class HuberLoss(BaseLoss):
    """Huber loss, for regression.

    Domain:
    y_true and y_pred all real numbers
    quantile in (0, 1)

    Link:
    y_pred = raw_prediction

    For a given sample x_i, the Huber loss is defined as::

        loss(x_i) = 1/2 * abserr**2            if abserr <= delta
                    delta * (abserr - delta/2) if abserr > delta

        abserr = |y_true_i - raw_prediction_i|
        delta = quantile(abserr, self.quantile)

    Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
    equals delta * (AbsoluteError() - delta/2).

    Additional Attributes
    ---------------------
    quantile : float
        The quantile level which defines the breaking point `delta` to distinguish
        between absolute error and squared error. Must be in range (0, 1).

    Reference
    ---------
    .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
      boosting machine <10.1214/aos/1013203451>`.
      Annals of Statistics, 29, 1189-1232.
    """

    differentiable = False
    need_update_leaves_values = True

    def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
        check_scalar(
            quantile,
            "quantile",
            target_type=numbers.Real,
            min_val=0,
            max_val=1,
            include_boundaries="neither",
        )
        self.quantile = quantile  # The quantile level used to estimate delta.
        super().__init__(
            closs=CyHuberLoss(delta=float(delta)),
            link=IdentityLink(),
        )
        self.approx_hessian = True
        self.constant_hessian = False

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the weighted median of the target, i.e. over the samples
        axis=0.
        """
        # See formula before algo 4 in Friedman (2001), but we apply it to
        # y_true and not to the residual y_true - raw_prediction.
        if sample_weight is None:
            median = np.percentile(y_true, 50, axis=0)
        else:
            median = _weighted_percentile(y_true, sample_weight, 50)
        diff = y_true - median
        term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff))
        return median + np.average(term, weights=sample_weight)


class HalfPoissonLoss(BaseLoss):
    """Half Poisson deviance loss with log-link, for regression.

    Domain:
    y_true in non-negative real numbers
    y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half the Poisson deviance is defined as::

        loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
                    - y_true_i + exp(raw_prediction_i)

    Half the Poisson deviance is actually the negative log-likelihood up to
    constant terms (not involving raw_prediction) and simplifies the
    computation of the gradients.
    We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, True, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = xlogy(y_true, y_true) - y_true
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfGammaLoss(BaseLoss):
    """Half Gamma deviance loss with log-link, for regression.

    Domain:
    y_true and y_pred in positive real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Gamma deviance loss is defined as::

        loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
                    + y_true/exp(raw_prediction_i) - 1

    Half the Gamma deviance is actually proportional to the negative log-
    likelihood up to constant terms (not involving raw_prediction) and
    simplifies the computation of the gradients.
    We also skip the constant term `-log(y_true_i) - 1`.
    """

    def __init__(self, sample_weight=None):
        super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
        self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        term = -np.log(y_true) - 1
        if sample_weight is not None:
            term *= sample_weight
        return term


class HalfTweedieLoss(BaseLoss):
    """Half Tweedie deviance loss with log-link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers
    power in real numbers

    Link:
    y_pred = exp(raw_prediction)

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
                    + exp(raw_prediction_i)**(2-p) / (2-p)

    Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
    HalfPoissonLoss and HalfGammaLoss.

    We also skip constant terms, but those are different for p=0, 1, 2.
    Therefore, the loss is not continuous in `power`.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLoss(power=float(power)),
            link=LogLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        if self.closs.power == 0:
            return HalfSquaredError().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 1:
            return HalfPoissonLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        elif self.closs.power == 2:
            return HalfGammaLoss().constant_to_optimal_zero(
                y_true=y_true, sample_weight=sample_weight
            )
        else:
            p = self.closs.power
            term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
            if sample_weight is not None:
                term *= sample_weight
            return term


class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half Tweedie deviance loss with p=power is defined
    as::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    Note that the minimum value of this loss is 0.

    Note furthermore that although no Tweedie distribution exists for
    0 < power < 1, it still gives a strictly consistent scoring function for
    the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLossIdentity(power=float(power)),
            link=IdentityLink(),
        )
        if self.closs.power <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif self.closs.power < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        if self.closs.power == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)


class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half Binomial deviance is defined as the negative
    log-likelihood of the Binomial/Bernoulli distribution and can be expressed
    as::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    Note that the formulation works for classification, y = {0, 1}, as well as
    logistic regression, y = [0, 1].
    If you add `constant_to_optimal_zero` to the loss, you get half the
    Bernoulli/binomial deviance.

    More details: Inserting the predicted probability y_pred = expit(raw_prediction)
    in the loss gives the well known::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyHalfBinomialLoss(),
            link=LogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # This is non-zero only if y_true is neither 0 nor 1.
        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        if sample_weight is not None:
            term *= sample_weight
        return term

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = self.link.inverse(raw_prediction)
        proba[:, 0] = 1 - proba[:, 1]
        return proba


class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: We assume y_true to be already label encoded. The inverse link is
    softmax. But the full link function is the symmetric multinomial logit
    function.

    For a given sample x_i, the categorical cross-entropy loss is defined as
    the negative log-likelihood of the multinomial distribution; it
    generalizes the binary cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    Note that for the hessian, we calculate only the diagonal part in the
    classes: If the full hessian for classes k and l and sample i is H_i_k_l,
    we calculate H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        super().__init__(
            closs=CyHalfMultinomialLoss(),
            link=MultinomialLogit(),
            n_classes=n_classes,
        )
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the softmax of the weighted average of the target, i.e. over
        the samples axis=0.
        """
        out = np.zeros(self.n_classes, dtype=y_true.dtype)
        eps = np.finfo(y_true.dtype).eps
        for k in range(self.n_classes):
            out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
            out[k] = np.clip(out[k], eps, 1 - eps)
        return self.link.link(out[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        return self.link.inverse(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        if gradient_out is None:
            if proba_out is None:
                gradient_out = np.empty_like(raw_prediction)
                proba_out = np.empty_like(raw_prediction)
            else:
                gradient_out = np.empty_like(proba_out)
        elif proba_out is None:
            proba_out = np.empty_like(gradient_out)

        self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )
        return gradient_out, proba_out


class ExponentialLoss(BaseLoss):
    """Exponential loss with (half) logit link, for binary classification.

    This is also known as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    Note that the formulation works for classification, y = {0, 1}, as well as
    "exponential logistic" regression, y = [0, 1].
    Note that this is a proper scoring rule, but without its canonical link.

    More details: Inserting the predicted probability
    y_pred = expit(2 * raw_prediction) in the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
            + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyExponentialLoss(),
            link=HalfLogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # This is non-zero only if y_true is neither 0 nor 1.
        term = -2 * np.sqrt(y_true * (1 - y_true))
        if sample_weight is not None:
            term *= sample_weight
        return term

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Be graceful to shape (n_samples, 1) -> (n_samples,)
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = self.link.inverse(raw_prediction)
        proba[:, 0] = 1 - proba[:, 1]
        return proba


_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "huber_loss": HuberLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
    "exponential_loss": ExponentialLoss,
}
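

# --------------------------------------------------------------------------- #
# Illustrative usage sketch (a hedged example, not part of the module proper).
# It assumes the compiled Cython extension `sklearn._loss._loss` is built so
# that the classes above can be instantiated, and it shows the typical driver
# pattern of an estimator: evaluate the weighted mean loss, fill preallocated
# gradient buffers, and compute the intercept-only baseline prediction.
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    y_true = rng.normal(size=8)
    raw_prediction = np.zeros_like(y_true)

    sq_loss = HalfSquaredError()
    # Weighted average loss of the all-zero prediction.
    print("mean loss:", sq_loss(y_true=y_true, raw_prediction=raw_prediction))
    # Preallocated buffers, as used by the gradient boosting and GLM solvers.
    gradient, hessian = sq_loss.init_gradient_and_hessian(n_samples=y_true.shape[0])
    sq_loss.gradient(
        y_true=y_true, raw_prediction=raw_prediction, gradient_out=gradient
    )
    # d/d(raw) of 0.5 * (y - raw)**2 evaluated at raw=0 equals -y.
    print("gradient at raw_prediction=0:", gradient)
    # Raw prediction of an intercept-only model; for the identity link this is
    # simply the (weighted) mean of y_true.
    print("intercept-only raw_prediction:", sq_loss.fit_intercept_only(y_true))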
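

# --------------------------------------------------------------------------- #
# A second hedged sketch for the multiclass case (illustrative only).  It
# assumes label-encoded targets in {0, ..., n_classes - 1} stored as float64,
# which is what the Cython kernels expect.  predict_proba applies the softmax
# inverse link; fit_intercept_only returns the symmetric multinomial-logit
# baseline whose softmax recovers the (weighted) class frequencies.
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    y_true = np.array([0.0, 1.0, 2.0, 2.0])
    ce_loss = HalfMultinomialLoss(n_classes=3)
    raw_prediction = np.zeros((y_true.shape[0], ce_loss.n_classes))
    # Equal raw predictions give uniform class probabilities.
    print(ce_loss.predict_proba(raw_prediction))
    baseline = ce_loss.fit_intercept_only(y_true)
    # The softmax of the baseline approximately recovers [0.25, 0.25, 0.5].
    print(ce_loss.predict_proba(baseline[None, :]))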