Source code for vinecopulas.marginals

# -*- coding: utf-8 -*-
"""
Created on Thu Feb 22 17:13:20 2024


"""


import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scipy.stats as st
from scipy.stats import rankdata
from scipy.optimize import newton
from scipy.optimize import minimize_scalar
from scipy.special import gammaln
from scipy.linalg import cholesky
from itertools import product
import sys

# %% best fit discrete distribution

[docs] def best_fit_distributiondiscrete(data, bound=False, criterion="BIC"): """ Fits the best discrete distribution to data. Arguments: *data* : The data which has to be fit as a 1-d numpy array. *bounds* : whether the data is bounded *criterion* : Either BIC, AIC or ML is used to choose the best distribution Returns: *bestdist* : the best distribution and its parameters. """ # distributions distributions = { "Betabinomial": st.betabinom, "Binomial": st.binom, "Poisson": st.poisson, "Geometric": st.geom, "Hypergeometric": st.hypergeom, "Negative Binomial": st.nbinom, "Negative Hypergeometric": st.nhypergeom, "Zipfian": st.zipfian, "Log-Series": st.logser, "Laplacian": st.dlaplace, } if bound == True: distributions = { "Betabinomial": st.betabinom, "Binomial": st.binom, "Poisson": st.poisson, "Geometric": st.geom, "Hypergeometric": st.hypergeom, "Negative Binomial": st.nbinom, } # Best holders best_distributions = [] # Estimate distribution parameters from data for name, distribution in distributions.items(): try: # Ignore warnings from data that can't be fit with warnings.catch_warnings(): warnings.filterwarnings("ignore") if name == "Betabinomial": bounds = ( (0, len(np.unique(data))), (0, len(np.unique(data))), (0, len(np.unique(data))), ) elif name == "Hypergeometric" or name == "Negative Hypergeometric": bounds = ( (0, len((data))), (0, len(np.unique(data))), (0, len((data))), ) elif ( name == "Binomial" or name == "Negative Binomial" or name == "Zipfian" or name == "Zipf" or name == "Boltzmann" or name == "Planck" ): bounds = ((0, len(data)), (0, len(data))) elif ( name == "Poisson" or name == "Geom" or name == "Log-Series" or name == "Laplacian" or name == "Yule-Simon" ): bounds = ((0, max(data)), (0, max(data))) elif name == "Fisher" or name == "Wallenius": bounds = ( (0, len(data)), (0, len(data)), (0, len(data)), (0, len(data)), ) elif name == "Bernoulli": bounds = None # fit dist to data params = st.fit(distribution, data, bounds).params[:] # Separate parts of parameters # Calculate fitted PDF and error with fit in distribution n = len(data) k = len(params) log_likelihood = distribution(*params).logpmf(data).sum() if criterion == "BIC": criterion_value = -2 * log_likelihood + np.log(n) * k elif criterion == "AIC": criterion_value = 2 * k - 2 * log_likelihood elif criterion == "ML": criterion_value = log_likelihood # identify if this distribution is better best_distributions.append((distribution, params, criterion_value)) except Exception: pass bestdist = sorted(best_distributions, key=lambda x: x[2])[0] return bestdist
# %% best fit distribution
[docs] def best_fit_distribution(data, criterion="BIC", dists = []): """ Fits the best continuous distribution to data. Arguments: *data* : The data which has to be fit as a 1-d numpy array. *criterion* : Either BIC, AIC or ML is used to choose the best distribution *dists* : Specify specific distributions if only specific distributions need to be tested, provided as a list. Returns: *bestdist* : the best distribution and its parameters. """ # distributions distributions = { "Beta": st.beta, "Birnbaum-Saunders": st.burr, "Exponential": st.expon, "Extreme value": st.genextreme, "Gamma": st.gamma, "Generalized extreme value": st.genextreme, "Generalized Pareto": st.genpareto, "Inverse Gaussian": st.invgauss, "Logistic": st.logistic, "Log-logistic": st.fisk, "Lognormal": st.lognorm, "Nakagami": st.nakagami, "Normal": st.norm, "Rayleigh": st.rayleigh, "Rician": st.rice, "t location-scale": st.t, "Weibull": st.weibull_min, } if len(dists)> 0: keys_list = list(distributions.keys()) keys_list2 = [] for i in dists: keys_list2.append(keys_list[i]) distributions = dict((k, distributions[k]) for k in keys_list2 if k in distributions) # Best holders best_distributions = [] # Estimate distribution parameters from data for name, distribution in distributions.items(): try: # Ignore warnings from data that can't be fit with warnings.catch_warnings(): warnings.filterwarnings("ignore") # fit dist to data params = distribution.fit(data) # Separate parts of parameters arg = params[:-2] loc = params[-2] scale = params[-1] # Calculate fitted PDF and error with fit in distribution n = len(data) k = len(params) log_likelihood = ( distribution(loc=loc, scale=scale, *arg).logpdf(data).sum() ) if criterion == "BIC": criterion_value = -2 * log_likelihood + np.log(n) * k elif criterion == "AIC": criterion_value = 2 * k - 2 * log_likelihood elif criterion == "ML": criterion_value = log_likelihood # identify if this distribution is better best_distributions.append((distribution, params, criterion_value)) except Exception: pass return sorted(best_distributions, key=lambda x: x[2])[0]
# %% pseudodata
[docs] def pseudodata(data): """ Compute the pseudo-observations for the given data (transforms data to standard uniform margins) Arguments: *data* : The data which has to be converted into pseudo data, provided as a numpy array where each column contains a separate variable (eg. x1,x2,...,xn) Returns: *u* : Pseudo data, provided as a numpy array where each column contains a separate variable (eg. u1,u2,...,un) """ ranks = np.apply_along_axis(rankdata, axis=0, arr=data) n = data.shape[0] u = (ranks - 1) / (n - 1) return u
# %%
[docs] def pseudodiscr(xcdf, xpmf): """ Compute the pseudo-observations for the given variable that is discrete. Arguments: *xcdf* : The cumulative distribution function of the variable, calculated based on the best fit discrete distribution, provided as a 1-d numpy array. *xpmf* : The probability mass function of the variable, calculated based on the best fit discrete distribution, provided as a 1-d numpy array. Returns: *ui* : Pseudo data of a given variable provided as a 1-d numpy array. """ # Reference: Mitskopoulos et al. 2022 u113 = xcdf - xpmf ru = np.random.uniform(0, 1, len(xcdf)) ui = u113 + ru * (xcdf - u113) return ui