Source code for vinecopulas.marginals

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 22 17:13:20 2024


"""


import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import rankdata
from scipy.optimize import newton
from scipy.optimize import minimize_scalar
from scipy.special import gammaln
from scipy.linalg import cholesky
from itertools import product
import sys

# %% best fit discrete distribution


[docs]
def best_fit_distributiondiscrete(data, bound=False, criterion="BIC"):
    """
    Fits the best discrete distribution to data.

    Arguments:
        *data* : The data which has to be fit as a 1-d numpy array.

        *bounds* : whether the data is bounded

        *criterion* : Either BIC, AIC or ML is used to choose the best distribution

    Returns:
     *bestdist* : the best distribution and its parameters.
    """

    # distributions
    distributions = {
        "Betabinomial": st.betabinom,
        "Binomial": st.binom,
        "Poisson": st.poisson,
        "Geometric": st.geom,
        "Hypergeometric": st.hypergeom,
        "Negative Binomial": st.nbinom,
        "Negative Hypergeometric": st.nhypergeom,
        "Zipfian": st.zipfian,
        "Log-Series": st.logser,
        "Laplacian": st.dlaplace,

    }
    if bound == True:
        distributions = {
            "Betabinomial": st.betabinom,
            "Binomial": st.binom,
            "Poisson": st.poisson,
            "Geometric": st.geom,
            "Hypergeometric": st.hypergeom,
            "Negative Binomial": st.nbinom,
        }

    # Best holders
    best_distributions = []

    # Estimate distribution parameters from data
    for name, distribution in distributions.items():

        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if name == "Betabinomial":
                    bounds = (
                        (0, len(np.unique(data))),
                        (0, len(np.unique(data))),
                        (0, len(np.unique(data))),
                    )
                elif name == "Hypergeometric" or name == "Negative Hypergeometric":
                    bounds = (
                        (0, len((data))),
                        (0, len(np.unique(data))),
                        (0, len((data))),
                    )
                elif (
                    name == "Binomial"
                    or name == "Negative Binomial"
                    or name == "Zipfian"
                    or name == "Zipf"
                    or name == "Boltzmann"
                    or name == "Planck"
                ):
                    bounds = ((0, len(data)), (0, len(data)))
                elif (
                    name == "Poisson"
                    or name == "Geom"
                    or name == "Log-Series"
                    or name == "Laplacian"
                    or name == "Yule-Simon"
                ):
                    bounds = ((0, max(data)), (0, max(data)))
                elif name == "Fisher" or name == "Wallenius":
                    bounds = (
                        (0, len(data)),
                        (0, len(data)),
                        (0, len(data)),
                        (0, len(data)),
                    )
                elif name == "Bernoulli":
                    bounds = None
                # fit dist to data
                params = st.fit(distribution, data, bounds).params[:]

                # Separate parts of parameters

                # Calculate fitted PDF and error with fit in distribution

                n = len(data)
                k = len(params)

                log_likelihood = distribution(*params).logpmf(data).sum()

                if criterion == "BIC":
                    criterion_value = -2 * log_likelihood + np.log(n) * k
                elif criterion == "AIC":
                    criterion_value = 2 * k - 2 * log_likelihood
                elif criterion == "ML":
                    criterion_value = log_likelihood

                # identify if this distribution is better
                best_distributions.append((distribution, params, criterion_value))

        except Exception:
            pass

        bestdist = sorted(best_distributions, key=lambda x: x[2])[0]
    return bestdist



# %% best fit distribution

[docs]
def best_fit_distribution(data, criterion="BIC", dists = []):
    """
    Fits the best continuous distribution to data.

    Arguments:
        *data* : The data which has to be fit as a 1-d numpy array.

        *criterion* : Either BIC, AIC or ML is used to choose the best distribution
        
        *dists* : Specify specific distributions if only specific distributions need to be tested, provided as a list.

    Returns:
     *bestdist* : the best distribution and its parameters.
    """
    # distributions
    distributions = {
        "Beta": st.beta,
        "Birnbaum-Saunders": st.burr,
        "Exponential": st.expon,
        "Extreme value": st.genextreme,
        "Gamma": st.gamma,
        "Generalized extreme value": st.genextreme,
        "Generalized Pareto": st.genpareto,
        "Inverse Gaussian": st.invgauss,
        "Logistic": st.logistic,
        "Log-logistic": st.fisk,
        "Lognormal": st.lognorm,
        "Nakagami": st.nakagami,
        "Normal": st.norm,
        "Rayleigh": st.rayleigh,
        "Rician": st.rice,
        "t location-scale": st.t,
        "Weibull": st.weibull_min,
    }
    
    if len(dists)> 0:
        keys_list = list(distributions.keys())
    
        keys_list2 = []
        for i in dists:
            keys_list2.append(keys_list[i])
            
        distributions = dict((k, distributions[k]) for k in keys_list2
               if k in distributions)
    
   
        
    
    
    

    # Best holders
    best_distributions = []

    # Estimate distribution parameters from data
    for name, distribution in distributions.items():

        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution

                n = len(data)
                k = len(params)

                log_likelihood = (
                    distribution(loc=loc, scale=scale, *arg).logpdf(data).sum()
                )

                if criterion == "BIC":
                    criterion_value = -2 * log_likelihood + np.log(n) * k
                elif criterion == "AIC":
                    criterion_value = 2 * k - 2 * log_likelihood
                elif criterion == "ML":
                    criterion_value = log_likelihood

                # identify if this distribution is better
                best_distributions.append((distribution, params, criterion_value))

        except Exception:
            pass

    return sorted(best_distributions, key=lambda x: x[2])[0]



# %% pseudodata

[docs]
def pseudodata(data):
    """
    Compute the pseudo-observations for the given data (transforms data to standard uniform margins)

    Arguments:
        *data* : The data which has to be converted into pseudo data, provided as a numpy array where each column contains a separate variable (eg. x1,x2,...,xn)


    Returns:
     *u* : Pseudo data, provided as a numpy array where each column contains a separate variable (eg. u1,u2,...,un)
    """

    ranks = np.apply_along_axis(rankdata, axis=0, arr=data)
    n = data.shape[0]
    u = (ranks - 1) / (n - 1)

    return u



# %%

[docs]
def pseudodiscr(xcdf, xpmf):
    """
    Compute the pseudo-observations for the given variable that is discrete.

    Arguments:
        *xcdf* : The cumulative distribution function of the variable, calculated based on the best fit discrete distribution, provided as a 1-d numpy array.

        *xpmf* : The probability mass function of the variable, calculated based on the best fit discrete distribution, provided as a 1-d numpy array.

    Returns:
     *ui* : Pseudo data of a given variable  provided as a 1-d numpy array.
    """

    # Reference: Mitskopoulos et al. 2022
    u113 = xcdf - xpmf
    ru = np.random.uniform(0, 1, len(xcdf))
    ui = u113 + ru * (xcdf - u113)
    return ui