# -*- coding: utf-8 -*-
"""
Created on Thu Feb 22 16:40:09 2024
"""
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scipy.stats as st
from itertools import product
import networkx as nx
import matplotlib.pyplot as plt
from vinecopulas.bivariate import *
# %% Copulas
[docs]
copulas = {
1: "Gaussian",
2: "Gumbel0",
3: "Gumbel90",
4: "Gumbel180",
5: "Gumbel270",
6: "Clayton0",
7: "Clayton90",
8: "Clayton180",
9: "Clayton270",
10: "Frank",
11: "Joe0",
12: "Joe90",
13: "Joe180",
14: "Joe270",
15: "Student",
}
# %% fitting vinecopula
[docs]
def fit_vinecop(u1, copsi, vine="R", printing=True):
"""
Fit a regular vine copula to data.
Arguments:
*u1* : the data, provided as a numpy array where each column contains a separate variable (eg. u1,u2,...,un), which have already been transferred to standard uniform margins (0<= u <= 1)
*copsi* : A list of integers referring to the copulae of interest for which the fit has to be evaluated in the vine copula. eg. a list of [1, 10] refers to the Gaussian and Frank copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*vine* : The type of vine copula that needs to be fit, either 'R', 'D', or 'C'
*printing*: True if the fitted copula should be printed and False if not
Returns:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
"""
# Reference: Dißmann et al. 2013
v1 = [] # list for variable 1
v2 = [] # list for variable 2
tauabs = [] # list for the absolute kendal tau between v1 and v2
dimen = u1.shape[1] # number of variables (number of columns)
for i in range(dimen - 1):
for j in range(i + 1, dimen):
v1.append(int(i)) # add variable to v1
v2.append(int(j)) # add variable to v2
tauabs.append(
abs(st.kendalltau(u1[:, i], u1[:, j])[0])
) # calculate the absolute kendall tau between v1 and v2 and add it to the taubs list
order1 = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": tauabs}
) # put the v1, v2, and tauabs list into a dataframe for the first tree
order1 = order1.sort_values(by="tauabs", ascending=False).reset_index(
drop=True
) # sort this dataframe from highest to lowest tauabs
# R vine
if vine == "R":
inde = [] # list to put used rows of order1 in
for i in range(
len(order1)
): # loop through all the rows to include those pairs with the highest ktau first
if i == 0:
order2 = order1.head(
1
) # create a new dataframe where the row with the highest ktau is the first row
else:
if (
order1.v1[i] in list(order1.v2[:i])
or order1.v1[i] in list(order1.v1[:i])
) and (
order1.v2[i] in list(order1.v2[:i])
or order1.v2[i] in list(order1.v1[:i])
): # see if both v1i and v2i are already in order2, first rows with unique variables need to be added to ensure all variables are included
continue
else:
inde.append(i) # add used rows to inde
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T], ignore_index=True
) # add row to dataframe order2
if len(order2) < (
dimen - 1
): # the first tree will have the number of variables - 1 number of rows.
for i in range(
1, len(order1)
): # loop through all the combinations again and add based on the highest ktauabs
if i in inde: # check if row has not been added before
continue
lst = (
list(order2.v2[order2.v1 == order1.v2[i]])
+ list(order2.v1[order2.v2 == order1.v2[i]])
+ list(order2.v2[order2.v1 == order1.v1[i]])
+ list(order2.v1[order2.v2 == order1.v1[i]])
)
l1 = list(order2.v2[order2.v1 == order1.v2[i]]) + list(
order2.v1[order2.v2 == order1.v2[i]]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v2[i])
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(order2.v2[order2.v1 == order1.v1[i]]) + list(
order2.v1[order2.v2 == order1.v1[i]]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v1[i])
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1): # ensure that there are no groups of variables that all have a connection with one another
# continue
if len(lst) == len(
set(lst)
): # check that there are no 'triangles', e.g. three variables that are alll connected with eachother
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T], ignore_index=True
) # add row if conditions are met
if (
len(order2) == dimen - 1
): # end loop when desired number of rows (number of variables - 1) has been met
break
# D vine
elif vine == "D":
inde = [] # lsit to put used variables in
for j in range(dimen - 1):
if j == 0:
order2 = order1.head(
1
) # loop through all the rows to include those pairs with the highest ktau first
vi = order2.v1[0] # select the first variable included in this row
vj = order2.v2[0] # select the second variable included in this row
else:
for i in range(len(order1)):
if (
order1.v1[i] == vj
or order1.v2[i] == vi
or order1.v1[i] == vi
or order1.v2[i] == vj
): # find the rows which include at least one of the two variables from the previous edge
if (
order1.v1[i] in inde or order1.v2[i] in inde
): # check if variables have already been connected to other variables twice
continue
if (order1.v1[i] == vi and order1.v2[i] == vj) or (
order1.v1[i] == vj and order1.v2[i] == vi
): # skip rows that have already been used
continue
else:
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T], ignore_index=True
) # if conditions are met, add this to the dataframe of the first tree
if (
order1.v1[i] == vj or order1.v2[i] == vj
): # check which value has been used twice already
inde.append(vj) # add this value to inde
if order1.v1[i] == vj:
vj = order1.v2[
i
] # select the new edge to connect to
else:
vj = order1.v1[
i
] # select the new edge to connect to
elif (
order1.v1[i] == vi or order1.v2[i] == vi
): # check which value has been used twice already
inde.append(vi) # add this value to inde
if order1.v1[i] == vi:
vi = order1.v2[
i
] # select the new edge to connect to
else:
vi = order1.v1[
i
] # select the new edge to connect to
elif vine == "C":
taus = [] # list to put the sum of all tauabs in for a specific variable
for i in range(dimen):
taus.append(
sum(order1[(order1.v1 == i) | (order1.v2 == i)].tauabs)
) # calculate the sum of all taus for a specific variable
i = np.where(np.array(taus) == max(taus))[0][
0
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
order2 = order1[(order1.v1 == i) | (order1.v2 == i)].reset_index(
drop=True
) # select the rows that include this variable
order1 = order2 # make order1 == the first tree
del order2
rhos = [] # list for the rhos
node = [] # list for the nodes
cops = [] # list for the copulas
v1_1 = [] # list for the 1st nodes
v2_1 = [] # list for the 2nd nodes
aics = [] # list for the AIC's
for i in range(len(order1)):
v1i = int(order1.v1[i]) # first node
v2i = int(order1.v2[i]) # second node
u3 = np.vstack(
(u1[:, v1i], u1[:, v2i])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
node.append([v1i, v2i]) # create final node
v1_1.append(u1[:, v1i]) # add array of first node
v2_1.append(u1[:, v2i]) # add array of second node
v1_1 = np.array(v1_1).T # v1
v2_1 = np.array(v2_1).T # v2
# add information to dataframe of the first tree
order1["rhos"] = rhos
order1["node"] = node
order1["tree"] = 0
order1["cop"] = cops
order1["AIC"] = aics
# set up variables for the second tree
v1 = []
v2 = []
ktau = []
rhos = []
node = []
cops = []
v1_k = []
v2_k = []
aics = []
for i in range(
len(order1) - 1
): # loop through the first tree to identify all possible combination of nodes
v1i = int(order1.v1[i]) # parent node 1 from nodei in tree
v2i = int(order1.v2[i]) # parent node 2 from nodei in tree
copi = int(order1.cop[i]) # copula of nodei
pari = order1.rhos[i] # parameters of nodei
for j in (
np.where(
np.array([item == v1i for item in list(order1.v1[i + 1 :])])
| np.array([item == v1i for item in list(order1.v2[i + 1 :])])
| np.array([item == v2i for item in list(order1.v1[i + 1 :])])
| np.array([item == v2i for item in list(order1.v2[i + 1 :])])
)[0]
+ i
+ 1
): # see if possible connection between nodei and nodej
v1j = int(order1.v1[j]) # parent node 1 from nodej in tree
v2j = int(order1.v2[j]) # parent node 2 from nodej in tree
copj = int(order1.cop[j]) # copula of nodei
parj = order1.rhos[j] # parameters of nodei
v1.append(order1.node[i]) # parent node for next tree
v2.append(order1.node[j]) # parent node for next tree
lst = order1.node[i] + order1.node[j] # list of all variables in node
s = max(
set(lst), key=lst.count
) # variable that is common in both parent nodes
# parent node values
ui1 = v1_1[:, i]
ui2 = v2_1[:, i]
uj1 = v1_1[:, j]
uj2 = v2_1[:, j]
# defining which parent node the conditional CDF needs to be based on
if v1i == s:
uni = 1
vi1 = v2i
else:
uni = 2
vi1 = v1i
if v1j == s:
unj = 1
vj1 = v2j
else:
unj = 2
vj1 = v1j
# calculate the conditional CDF
v1igs = hfunc(copi, ui1, ui2, pari, un=uni)
v2jgs = hfunc(copj, uj1, uj2, parj, un=unj)
ktau.append(
abs(st.kendalltau(v1igs, v2jgs)[0])
) # calculate the absolute kendall tau between v1igs and v2jgs, add it to the ktau list
# add the parent node values
v1_k.append(v1igs)
v2_k.append(v2jgs)
# format the final node
node.append([vi1, vj1, "g", s])
k = 2 # second tree
orderk = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": ktau, "node": node}
) # put the v1, v2, tauabs, and the node in list into a dataframe for tree k
if vine == "R" or vine == "D":
if (
len(orderk) > dimen - k
): # the tree will have the number of variables - k number of rows, if this is exceeded in orderk, a selection has to be made based on kendal tau
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
indexes = list(
orderk.index
) # create a list the original order of the dataframe prior to sorting
orderk = orderk.reset_index(drop=True) # reset the index
inde = (
[]
) # list to put used rows of orderk in based on their oroginal position in the dataframe
inde2 = [] # list to put used rows of orderk i
for i in range(len(orderk)):
if i == 0:
order = orderk.head(
1
) # loop through all the rows to include those pairs with the highest ktau first
inde.append(
indexes[i]
) # add used rows to inde (original row value)
else:
if (
orderk.v1[i] in list(orderk.v2[:i])
or orderk.v1[i] in list(orderk.v1[:i])
) and (
orderk.v2[i] in list(orderk.v2[:i])
or orderk.v2[i] in list(orderk.v1[:i])
):
continue
else:
inde.append(
indexes[i]
) # add used rows to inde (original row value)
inde2.append(i) # add used rows to inde
order = pd.concat(
[order, orderk.loc[i].to_frame().T], ignore_index=True
) # if conditions are met, add this to the dataframe of tree k
if len(order) < (dimen - k):
for i in range(1, len(orderk)):
if i in inde2:
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[i])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[i])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[i])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[i])
]
)
)
l1 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v1[i])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v1[i])]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v1[i]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v2[i])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v2[i])]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v2[i]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1): # ensure that there are no groups of variables that all have a connection with one another
# continue
if len(lst) == len(set(lst)):
inde.append(
indexes[i]
) # add used rows to inde (original row value)
order = pd.concat(
[order, orderk.loc[i].to_frame().T], ignore_index=True
) # if conditions are met, add this to the dataframe of tree k
if (
len(order) == dimen - k
): # end loop when desired number of rows (number of variables - k) has been met
break
orderk = order # maker order = to tree k
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array([v1_k[ind] for ind in inde]).T # sort array of first node
v2_k = np.array([v2_k[ind] for ind in inde]).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
else:
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
if vine == "C":
if len(orderk) > dimen - k:
subnodes = np.unique(
np.stack((np.array(orderk.v1), np.array(orderk.v2)))
) # see all the unique parent nodes
taus = [] # list to put the sum of all tauabs in for a specific nodes
for i in range(len(subnodes)):
orderksub = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
]
taus.append(
sum(orderksub.tauabs)
) # calculate the sum of all taus for a specific variable
i = np.where(np.array(taus) == max(taus))[0][
0
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
orderk = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
] # select rows where this node is included
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
inde = list(orderk.index)
v1_k = np.array([v1_k[ind] for ind in inde]).T # sort array of first node
v2_k = np.array([v2_k[ind] for ind in inde]).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
else:
orderk = orderk.sort_values(by="tauabs", ascending=False)
v1_k = np.array([v1_k[ind] for ind in orderk.index]).T
v2_k = np.array([v2_k[ind] for ind in orderk.index]).T
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
for i in range(len(order2)):
u3 = np.vstack(
(v1_2[:, i], v2_2[:, i])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
order2["rhos"] = rhos
order2["cop"] = cops
order2["AIC"] = aics
# loop through remaining trees if there are more than 3 variables
if dimen > 3:
for k in range(3, dimen):
order = locals()[
"order" + str(k - 1)
].copy() # select the struture of the previous tree
v1s = locals()[
"v1_" + str(k - 1)
].copy() # select the first nodes of the previous tree
v2s = locals()[
"v2_" + str(k - 1)
].copy() # select the second nodes of the previous tree
# create lists for the variables
v1_k = []
v2_k = []
v1 = []
v2 = []
ktau = []
rhos = []
cops = []
node = []
lk = []
aics = []
rk = []
for i in range(len(order) - 1):
v1i = order.v1[i].copy() # parent node 1 from nodei in tree
v2i = order.v2[i].copy() # parent node 2 from nodei in tree
copi = int(order.cop[i]) # copula of nodei
pari = order.rhos[i] # parameters of nodei
for j in (
np.where(
np.array([item == v1i for item in list(order.v1[i + 1 :])])
| np.array([item == v1i for item in list(order.v2[i + 1 :])])
| np.array([item == v2i for item in list(order.v1[i + 1 :])])
| np.array([item == v2i for item in list(order.v2[i + 1 :])])
)[0]
+ i
+ 1
): # see if possible connection between nodei and nodej
v1i = order.v1[i].copy() # parent node 1 from nodei in tree
v2i = order.v2[i].copy() # parent node 2 from nodei in tree
copj = int(order.cop[j]) # copula of nodei
parj = order.rhos[j] # parameters of nodei
nodei = order.node[i] # nodei
nodej = order.node[j] # nodej
v1j = order.v1[j].copy() # parent node 1 from nodej in tree
v2j = order.v2[j].copy() # parent node 2 from nodej in tree
v1.append(nodei) # new parent nodes 1
v2.append(nodej) # new parent nodes 2
n = 2
ri = nodei[n + 1 :] # select the values on the right side of node i
rj = nodej[n + 1 :] # select the values on the right side of node i
if "g" in v1j:
v1j.remove("g")
v2j.remove("g")
v1i.remove("g")
v2i.remove("g")
# define left and right side of the node in tree k
if rj == ri:
if len(v1j) == 2:
lst = nodei[:n] + nodej[:n]
r3 = [max(set(lst), key=lst.count)]
li = [value for value in nodei[:n] if value not in r3]
lj = [value for value in nodej[:n] if value not in r3]
r = list(np.unique(ri + r3))
else:
lst = v1i[:n] + v2i[:n] + v1j[:n] + v2j[:n]
for s in ri:
lst = [x for x in lst if x != s]
r3 = [max(set(lst), key=lst.count)]
li = [value for value in nodei[:n] if value not in r3]
lj = [value for value in nodej[:n] if value not in r3]
r = list(np.unique(ri + r3))
l = list(np.unique(li + lj))
else:
r = list(np.unique(ri + rj))
li = [value for value in nodei[:n] if value not in rj]
lj = [value for value in nodej[:n] if value not in ri]
if li == lj:
lst = v1i[:n] + v2i[:n] + v1j[:n] + v2j[:n]
lst = [x for x in lst if x != li[0]]
l3 = [min(set(lst), key=lst.count)]
l = list(np.unique(li + l3))
else:
l = list(np.unique(li + lj))
# select the parent node values
ui1 = v1s[:, i]
ui2 = v2s[:, i]
uj1 = v1s[:, j]
uj2 = v2s[:, j]
if set(r).issubset(set(v1i)):
uni = 1
elif set(r).issubset(set(v2i)):
uni = 2
elif set(rj).issubset(set(v2i[1:])):
uni = 2
elif set(rj).issubset(set(v1i[1:])):
uni = 1
if set(r).issubset(set(v1j)):
unj = 1
elif set(r).issubset(set(v2j)):
unj = 2
elif set(ri).issubset(set(v2j[1:])):
unj = 2
elif set(ri).issubset(set(v1j[1:])):
unj = 1
# calculate the conditional CDF
v1igs = hfunc(copi, ui1, ui2, pari, un=uni)
v2jgs = hfunc(copj, uj1, uj2, parj, un=unj)
ktau.append(
abs(st.kendalltau(v1igs, v2jgs)[0])
) # calculate the absolute kendall tau between v1igs and v2jgs, add it to the ktau list
# add the parent node values
v1_k.append(v1igs)
v2_k.append(v2jgs)
del uj1, ui1, ui2, uj2
node.append(l + ["g"] + r) # set node name
lk.append(l) # add left side of node
rk.append(r) # add rigth side of node
orderk = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": ktau, "node": node, "l": lk, "r": rk}
) # put information in dataframe for tree k
if vine == "R" or vine == "D":
if (
len(orderk) > dimen - k
): # the tree will have the number of variables - k number of rows, if this is exceeded in orderk, a selection has to be made based on kendal tau
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
indexes = list(
orderk.index
) # create a list the original order of the dataframe prior to sorting
orderk = orderk.reset_index(drop=True) # reset the index
inde = (
[]
) # list to put used rows of orderk in based on their oroginal position in the dataframe
inde2 = [] # list to put used rows of orderk i
for i in range(len(orderk)):
if i == 0:
order = orderk.head(
1
) # loop through all the rows to include those pairs with the highest ktau first
inde.append(
indexes[i]
) # add used rows to inde (original row value)
l = orderk.l[i]
else:
if (
orderk.v1[i] in list(orderk.v2[:i])
or orderk.v1[i] in list(orderk.v1[:i])
) and (
orderk.v2[i] in list(orderk.v2[:i])
or orderk.v2[i] in list(orderk.v1[:i])
):
continue
else:
inde.append(
indexes[i]
) # add used rows to inde (original row value)
inde2.append(i) # add used rows to inde
order = pd.concat(
[order, orderk.loc[i].to_frame().T],
ignore_index=True,
) # if conditions are met, add this to the dataframe of tree k
l = l + orderk.l[i]
if len(order) < (dimen - k):
for i in range(1, len(orderk)):
if i in inde2:
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[i])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[i])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[i])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[i])
]
)
)
l1 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[i])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[i])
]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v1[i]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[i])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[i])
]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v2[i]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1):
# continue
if len(lst) == len(set(lst)):
order = pd.concat(
[order, orderk.loc[i].to_frame().T],
ignore_index=True,
) # if conditions are met, add this to the dataframe of tree k
inde.append(
indexes[i]
) # add used rows to inde (original row value)
if (
len(order) == dimen - k
): # end loop when desired number of rows (number of variables - k) has been met
break
orderk = order
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in inde]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in inde]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
for j in range(len(orderk)):
u3 = np.vstack(
(v1_k[:, j], v2_k[:, j])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["AIC"] = aics
locals()["v1_" + str(k)] = v1_k
locals()["v2_" + str(k)] = v2_k
locals()["order" + str(k)] = orderk
else:
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
for j in range(len(orderk)):
u3 = np.vstack(
(v1_k[:, j], v2_k[:, j])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["AIC"] = aics
locals()["v1_" + str(k)] = v1_k
locals()["v2_" + str(k)] = v2_k
locals()["order" + str(k)] = orderk
if vine == "C":
if len(orderk) > dimen - k:
subnodes = np.unique(
np.stack((np.array(orderk.v1), np.array(orderk.v2)))
) # see all the unique parent nodes
taus = (
[]
) # list to put the sum of all tauabs in for a specific nodes
for i in range(len(subnodes)):
orderksub = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
]
taus.append(
sum(orderksub.tauabs)
) # calculate the sum of all taus for a specific variable
i = np.where(np.array(taus) == max(taus))[0][
0
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
orderk = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
] # select rows where this node is included
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
inde = list(orderk.index)
v1_k = np.array(
[v1_k[ind] for ind in inde]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in inde]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_k = v1_k.copy()
v2_k = v2_k.copy()
for j in range(len(orderk)):
u3 = np.vstack(
(v1_k[:, j], v2_k[:, j])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["AIC"] = aics
locals()["v1_" + str(k)] = v1_k
locals()["v2_" + str(k)] = v2_k
locals()["order" + str(k)] = orderk
else:
orderk = orderk.sort_values(by="tauabs", ascending=False)
v1_k = np.array([v1_k[ind] for ind in orderk.index]).T
v2_k = np.array([v2_k[ind] for ind in orderk.index]).T
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
for j in range(len(orderk)):
u3 = np.vstack(
(v1_k[:, j], v2_k[:, j])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["AIC"] = aics
locals()["v1_" + str(k)] = v1_k
locals()["v2_" + str(k)] = v2_k
locals()["order" + str(k)] = orderk
order = pd.DataFrame(columns=order1.columns) # dataframe to add all trees to
for i in range(1, dimen):
order = pd.concat([order, locals()["order" + str(i)]]).reset_index(
drop=True
) # adding trees in dataframe
# create array for the tree structure and copula matrices
a = np.empty((dimen, dimen))
c = np.empty((dimen, dimen))
a[:] = np.nan
c[:] = np.nan
order["used"] = 0 # set used to 0 for nodes that have not been used
for i in list(range(dimen - 1))[
::-1
]: # create the matix for the tree structure starting with the last tree
k1 = sorted(
np.array(
order[(order.tree == i) & (order["used"] == 0)].node.iloc[0][:2]
).astype(int)
)[::-1]
order.loc[(order["tree"] == i) & (order["used"] == 0), "used"] = 1
t1 = i - 1
ii = dimen - 2 - i
a[i : dimen - ii, ii] = k1
s = k1[-1]
for j in list(range(0, i))[::-1]: # continueing from tree i to the first tree
orde = order[(order.tree == j) & (order["used"] == 0)]
for k in range(len(orde)):
arr = np.array(orde.node.iloc[k][:2]).astype(int)
if np.isin(s, arr) == True:
inde = orde.iloc[k].name
a[j, ii] = arr[arr != s][0]
order["used"][inde] = 1
a[0, dimen - 1] = a[0, dimen - 2] # set first sample in sampling order
orderk = pd.DataFrame(columns=order.columns)
# create array for the copula parameter matrix
p = np.empty((dimen, dimen))
p[:] = np.nan
p = p.astype(object)
# fill array p with the parameters and c with the copulas, corresponding to the structure in c
for i in list(range(dimen - 1)):
orde = order[order.tree == i]
for k in list(range(dimen - 1 - i)):
ak = a[:, k]
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
for j in range(len(orde)):
arr = np.array(orde.node.iloc[j][:2]).astype(int)
if sum(np.isin(akn, arr)) == 2:
orderj = order.loc[[orde.index[j]]]
p[i, k] = orderj.rhos.iloc[0]
c[i, k] = orderj.cop.iloc[0]
if i == 0:
orderj.node.iloc[0] = list(akn)
else:
orderj.node.iloc[0] = (
list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1])
)
orderk = pd.concat([orderk, orderj]).reset_index(drop=True)
# print the copula structure if print == True
if printing == True:
for i in list(range(0, dimen - 1)):
orde = orderk[orderk.tree == i].reset_index(drop=True)
print("** Tree: ", i + 1)
for j in range(len(orde)):
if i != 0:
nodej = ",".join(map(str, orde.node[j])).replace(",|,", "|")
else:
nodej = ",".join(map(str, orde.node[j]))
print(
nodej,
" ---> ",
copulas[int(orde.cop[j])],
": parameters = ",
orde.rhos[j],
)
return a, p, c
[docs]
def density_vinecop(u, M, P, C):
"""
Computes the density function of a vine copula.
Arguments:
*u* : A 2-d numpy array containing the samples for which the PDF will be calculated.
*M* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*P* : Parameters of the bivariate copulae provided as a triangular matrix.
*C* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
Returns:
*F* : A 1-d numpy array containing the probability density function of the vine copula
"""
U = u[:,list(np.diag(M[::-1])[::-1].astype(int))]
a = M.copy()
p = P.copy()
c = C.copy()
s = len(u)
Ms = np.flipud(a) # flip structure matrix
P = np.flipud(p) # flip parameter matrix
C = np.flipud(c) # flip copula matrix
replace = {} # dictionary for relabeling martix
for i in range(int(max(np.unique(Ms)) + 1)):
val = max(np.unique(Ms)) - i
replace.update({Ms[i, i]: val}) # relabel
Ms = np.nan_to_num(Ms, nan=int(max(np.unique(Ms)) + 1))
replace_func = np.vectorize(
lambda x: replace.get(x, x)
) # Create a vectorized function for replacement
M = replace_func(Ms) # relabel
M[M == np.max(Ms)] = np.nan
Mm = M.copy()
# max matrix
for i in range(M.shape[0]):
for k in range(M.shape[0]):
if k == i:
continue
if i == 0:
continue
Mm[i, k] = max(Mm[i:, k])
# Vdirect
Vdir = np.empty((s, M.shape[0], M.shape[0]))
Vdir[:] = np.nan
# Vindirect
Vindir = np.empty((s, M.shape[0], M.shape[0]))
Vindir[:] = np.nan
# Z2
Z2 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
# Z1
Z1 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
Vdir[:, -1, :] = np.flip(U.copy(), 1)
X = np.flip(U.copy(), 1)
n = M.shape[0] - 1
F = 1
for k in list(reversed((range(0,n)))):
for i in range(k+1, n+1)[::-1]:
Z1[:, i, k] = Vdir[:, i, k]
if M[i, k] == Mm[i, k]:
Z2[:, i, k] = Vdir[:, i, int(n - Mm[i, k])]
else:
Z2[:, i, k] = Vindir[:, i, int(n - Mm[i, k])]
F = F * PDF(int(C[i, k]),np.vstack((Z1[:, i, k], Z2[:, i, k])).T,P[i, k])
Vdir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=2
)
Vindir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=1
)
return F
# %% fitting vine copula with sspecific structure
[docs]
def fit_vinecopstructure(u1, copsi, a):
"""
Fit a regular vine copula to data based on a known vine structure matrix.
Arguments:
*u1* : the data, provided as a numpy array where each column contains a seperate variable (eg. u1,u2,...,un), which have already been transferred to standard uniform margins (0<= u <= 1)
*copsi* : A list of integers referring to the copulae of interest for which the fit has to be evauluated in the vine copula. eg. a list of [1, 10] refers to the Gaussian and Frank copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
Returns:
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
"""
dimen = a.shape[0] # number of variables (number of columns)
order = pd.DataFrame(
columns=["node", "l", "r", "tree"]
) # dataframe for vinecopula information
s = 0
for i in list(range(dimen - 1)):
for k in list(range(dimen - 1 - i)):
ak = a[:, k] # Edge
akn = np.array([ak[-1 - k], ak[i]]).astype(int) # Edge
if i == 0:
single_row_values = {
"node": list(akn),
"l": akn[0],
"r": akn[1],
"tree": i,
}
else:
single_row_values = {
"node": list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1]),
"l": list(akn),
"r": list((ak.astype(int)[:i])[::-1]),
"tree": i,
}
order.loc[s] = single_row_values
s = s + 1
for t in list(range(dimen - 1)): # loop through trees
orderk = order[order.tree == t].reset_index(drop=True) # select tre
if t == 0: # first tree
orderk["v1"] = orderk.l
orderk["v2"] = orderk.r
rhos = []
v1_1 = []
v2_1 = []
cops = []
tauabs = []
for j in range(len(orderk)):
v1i = int(orderk.v1[j]) # first node
v2i = int(orderk.v2[j]) # second node
tauabs.append(
abs(st.kendalltau(u1[:, v1i], u1[:, v2i])[0])
)
u3 = np.vstack(
(u1[:, v1i], u1[:, v2i])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
v1_1.append(u1[:, v1i]) # add array of first node
v2_1.append(u1[:, v2i]) # add array of second node
v1_1 = np.array(v1_1).T # v1
v2_1 = np.array(v2_1).T # v2
# add information to dataframe of the first tree
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["tauabs"] = tauabs
orderk = orderk.sort_values(by="tauabs", ascending=False)
v1_1 = v1_1[:, orderk.index] # sort array of first node
v2_1 = v2_1[:, orderk.index] # sort array of second node
orderk = orderk.reset_index(
drop=True
)
else:
v1k = []
v2k = []
tauabs = []
for j in range(len(orderk)):
orderk2 = locals()["order" + str(t)] # Define possible nodes of edge
l = orderk.l[j] + orderk.r[j]
subnodes = []
for k in range(len(orderk2)):
subnodes.append(sum(1 for item in orderk2.node[k] if item in l))
subnodes = np.array(subnodes) == len(l) - 1
orderk2 = orderk2[subnodes].reset_index(drop=True)
v1k.append(orderk2.node[0]) # node 1
v2k.append(orderk2.node[1]) # node 2
orderk["v1"] = v1k
orderk["v2"] = v2k
orderk2 = locals()["order" + str(t)].reset_index(drop=True)
v1s = locals()["v1_" + str(t)].copy()
v2s = locals()["v2_" + str(t)].copy()
v1_k = []
v2_k = []
rhos = []
cops = []
for k in range(len(orderk)): # fitting copulas
r = orderk.r[k]
nodei = orderk.v1[k] # nodei
nodej = orderk.v2[k] # nodej
i = orderk2[orderk2["node"].apply(lambda x: x == nodei)].index[0]
j = orderk2[orderk2["node"].apply(lambda x: x == nodej)].index[0]
copj = int(orderk2.cop[j]) # copula of nodej
parj = orderk2.rhos[j] # parameters of nodej
copi = int(orderk2.cop[i]) # copula of nodei
pari = orderk2.rhos[i] # parameters of nodei
ri = orderk2.r[i]
rj = orderk2.r[j]
v1i = orderk2.v1[i] # parent node 1 from nodei in tree
v2i = orderk2.v2[i]
v1j = orderk2.v1[j] # parent node 1 from nodej in tree
v2j = orderk2.v2[j] # parent node 2 from nodej in tree
ui1 = v1s[:, i]
ui2 = v2s[:, i]
uj1 = v1s[:, j]
uj2 = v2s[:, j]
if t > 1:
if "g" in v1j:
v1j.remove("g")
v2j.remove("g")
v1i.remove("g")
v2i.remove("g")
if set(r).issubset(set(v1i)):
uni = 1
elif set(r).issubset(set(v2i)):
uni = 2
elif set(rj).issubset(set(v2i[1:])):
uni = 2
elif set(rj).issubset(set(v1i[1:])):
uni = 1
if set(r).issubset(set(v1j)):
unj = 1
elif set(r).issubset(set(v2j)):
unj = 2
elif set(ri).issubset(set(v2j[1:])):
unj = 2
elif set(ri).issubset(set(v1j[1:])):
unj = 1
else:
lst = orderk2.node[i] + orderk2.node[j]
s = max(set(lst), key=lst.count)
ui1 = v1_1[:, i]
ui2 = v2_1[:, i]
uj1 = v1_1[:, j]
uj2 = v2_1[:, j]
if v1i == s:
uni = 1
vi1 = v2i
else:
uni = 2
vi1 = v1i
if v1j == s:
unj = 1
vj1 = v2j
else:
unj = 2
vj1 = v1j
# calculate the conditional CDF
v1igs = hfunc(copi, ui1, ui2, pari, un=uni)
v2jgs = hfunc(copj, uj1, uj2, parj, un=unj)
u3 = np.vstack(
(v1igs, v2jgs)
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
v1_k.append(v1igs)
v2_k.append(v2jgs)
orderk["rhos"] = rhos
orderk["cop"] = cops
locals()["v1_" + str(t + 1)] = np.array(v1_k).T
locals()["v2_" + str(t + 1)] = np.array(v2_k).T
locals()["order" + str(t + 1)] = orderk
order = pd.DataFrame(columns=orderk.columns)
for i in range(1, dimen):
order = pd.concat([order, locals()["order" + str(i)]]).reset_index(drop=True)
p = np.empty((dimen, dimen)) # parameter array
p[:] = np.nan
p = p.astype(object)
c = np.empty((dimen, dimen)) # copula array
c[:] = np.nan
for i in list(range(dimen - 1)): # fil the copula and parameter array
orde = order[order.tree == i]
for k in list(range(dimen - 1 - i)):
ak = a[:, k]
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
for j in range(len(orde)):
arr = np.array(orde.node.iloc[j][:2]).astype(int)
if sum(np.isin(akn, arr)) == 2:
orderj = order.loc[[orde.index[j]]]
p[i, k] = orderj.rhos.iloc[0]
c[i, k] = orderj.cop.iloc[0]
return p, c
# %% Sampling vine copula
[docs]
def sample_vinecop(a, p, c, s):
"""
Generate random samples from an R-vine.
Arguments:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*s* : number of samples to generate, provided as a positive scalar integer.
Returns:
*X2* : the randomly sampled data data, provided as a numpy array where each column contains samples of a seperate variable (eg. u1,u2,...,un).
"""
# Reference: Dißmann et al. 2013
Ms = np.flipud(a) # flip structure matrix
P = np.flipud(p) # flip parameter matrix
C = np.flipud(c) # flip copula matrix
replace = {} # dictionary for relabeling martix
for i in range(int(max(np.unique(Ms)) + 1)):
val = max(np.unique(Ms)) - i
# Ms[k,k]
replace.update({Ms[i, i]: val}) # relabel
Ms = np.nan_to_num(Ms, nan=int(max(np.unique(Ms)) + 1))
replace_func = np.vectorize(
lambda x: replace.get(x, x)
) # Create a vectorized function for replacement
M = replace_func(Ms) # relabel
M[M == np.max(Ms)] = np.nan
Mm = M.copy()
# max matrix
for i in range(M.shape[0]):
for k in range(M.shape[0]):
if k == i:
continue
if i == 0:
continue
Mm[i, k] = max(Mm[i:, k])
# Vdirect
Vdir = np.empty((s, M.shape[0], M.shape[0]))
Vdir[:] = np.nan
# Vindirect
Vindir = np.empty((s, M.shape[0], M.shape[0]))
Vindir[:] = np.nan
# Z2
Z2 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
# Z1
Z1 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
U = np.random.uniform(0, 1, (s, M.shape[0])) # random uniform
Vdir[:, -1, :] = U.copy()
X = np.flip(U.copy(), 1)
n = M.shape[0] - 1
# sampling algorithm
for k in range(n)[::-1]:
for i in range(k + 1, n + 1):
if M[i, k] == Mm[i, k]:
Z2[:, i, k] = Vdir[:, i, int(n - Mm[i, k])]
else:
Z2[:, i, k] = Vindir[:, i, int(n - Mm[i, k])]
Vdir[:, n, k] = hfuncinverse(
int(C[i, k]), Z2[:, i, k], Vdir[:, n, k], P[i, k], un=2
)
X[:, int(n - k)] = Vdir[:, n, k]
for i in range(k + 1, n + 1)[::-1]:
Z1[:, i, k] = Vdir[:, i, k]
Vdir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=2
)
Vindir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=1
)
# Put X in the original order of the data
replacedf = pd.DataFrame(list(replace.items()), columns=["Original", "Replacement"])
replacedf = replacedf.sort_values(by="Original")
X2 = np.array([])
for i in replacedf.Replacement:
if len(X2) == 0:
X2 = X[:, int(i)].reshape(len(X), 1)
else:
X2 = np.hstack((X2, X[:, int(i)].reshape(len(X), 1)))
return X2
# %% Sampling conditonal vine copula
[docs]
def sample_vinecopconditional(a, p, c, s, Xc):
"""
Generate conditional samples from an R-vine based on a provided sampling order
Arguments:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*s* : number of samples to generate, provided as a positive scalar integer.
*XC*: the values of the variables on which the conditional sample has to generated, provided as a 1d array that contains the the values ordered in terms of the sampling order.
Returns:
*X2* : the randomly sampled data data, provided as a numpy array where each column contains samples of a seperate variable (eg. u1,u2,...,un).
"""
# Reference: Dißmann et al. 2013
Ms = np.flipud(a) # flip structure matrix
P = np.flipud(p) # flip parameter matrix
C = np.flipud(c) # flip copula matrix
replace = {} # dictionary for relabeling martix
for i in range(int(max(np.unique(Ms)) + 1)):
val = max(np.unique(Ms)) - i
# Ms[k,k]
replace.update({Ms[i, i]: val}) # relabel
Ms = np.nan_to_num(Ms, nan=int(max(np.unique(Ms)) + 1))
replace_func = np.vectorize(
lambda x: replace.get(x, x)
) # Create a vectorized function for replacement
M = replace_func(Ms) # relabel
M[M == np.max(Ms)] = np.nan
Mm = M.copy()
# max matrix
for i in range(M.shape[0]):
for k in range(M.shape[0]):
if k == i:
continue
if i == 0:
continue
Mm[i, k] = max(Mm[i:, k])
# Vdirect
Vdir = np.empty((s, M.shape[0], M.shape[0]))
Vdir[:] = np.nan
# Vindirect
Vindir = np.empty((s, M.shape[0], M.shape[0]))
Vindir[:] = np.nan
# Z2
Z2 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
# Z1
Z1 = np.empty((s, M.shape[0], M.shape[0]))
Z2[:] = np.nan
# combine random uniform data with conditional input
U = np.hstack(
(
np.random.uniform(0.0001, 0.99999, (s, M.shape[0] - len(Xc))),
np.flip(np.tile(Xc, (s, 1)).copy(), 1),
)
)
Vdir[:, -1, :] = U.copy()
X = np.flip(U.copy(), 1)
n = M.shape[0] - 1
# sampling algorithm
for k in range(n)[::-1]:
for i in range(k + 1, n + 1):
i = int(i)
if M[i, k] == Mm[i, k]:
Z2[:, i, k] = Vdir[:, i, int(n - Mm[i, k])]
else:
Z2[:, i, k] = Vindir[:, i, int(n - Mm[i, k])]
if k <= n - len(Xc):
Vdir[:, n, k] = hfuncinverse(
int(C[i, k]), Z2[:, i, k], Vdir[:, n, k], P[i, k], un=2
)
X[:, int(n - k)] = Vdir[:, n, k]
for i in range(k + 1, n + 1)[::-1]:
Z1[:, i, k] = Vdir[:, i, k]
Vdir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=2
)
Vindir[:, int(i - 1), k] = hfunc(
int(C[i, k]), Z1[:, i, k], Z2[:, i, k], P[i, k], un=1
)
# Put X in the original order of the data
replacedf = pd.DataFrame(list(replace.items()), columns=["Original", "Replacement"])
replacedf = replacedf.sort_values(by="Original")
X2 = np.array([])
for i in replacedf.Replacement:
if len(X2) == 0:
X2 = X[:, int(i)].reshape(len(X), 1)
else:
X2 = np.hstack((X2, X[:, int(i)].reshape(len(X), 1)))
return X2
# %%sampling orders
[docs]
def samplingorder(a):
"""
Provides all the different sampling orders that are possible for the fitted vine-copula.
Arguments:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see...refer to where this information would be)
Returns:
*sortingorder* : A list of the different sampling orders available for the fitted vine-copula
"""
dimen = a.shape[0] # dimension of data.
order = pd.DataFrame(
columns=["node", "l", "r", "tree"]
) # dataframe for vine copula structure.
s = 0
for i in list(range(dimen - 1)):
for k in list(range(dimen - 1 - i)):
ak = a[:, k] # edges
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
if i == 0:
single_row_values = {
"node": list(akn),
"l": akn[0],
"r": akn[1],
"tree": i,
}
else:
single_row_values = {
"node": list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1]),
"l": list(akn),
"r": list((ak.astype(int)[:i])[::-1]),
"tree": i,
}
order.loc[s] = single_row_values
s = s + 1
combinations = list(
product([True, False], repeat=dimen - 1)
) # all diffferent sampling routes.
sortingorder = []
for q in combinations:
a = np.empty((dimen, dimen))
a[:] = np.nan
order["used"] = 0
for i in list(range(dimen - 1))[::-1]:
k1 = sorted(
np.array(
order[(order.tree == i) & (order["used"] == 0)].node.iloc[0][:2]
).astype(int),
reverse=q[i],
)
order.loc[(order["tree"] == i) & (order["used"] == 0), "used"] = 1
ii = dimen - 2 - i
a[i : dimen - ii, ii] = k1
s = k1[-1]
for j in list(range(0, i))[::-1]:
orde = order[(order.tree == j) & (order["used"] == 0)]
for k in range(len(orde)):
arr = np.array(orde.node.iloc[k][:2]).astype(int)
if np.isin(s, arr) == True:
inde = orde.iloc[k].name
a[j, ii] = arr[arr != s][0]
order["used"][inde] = 1
a[0, dimen - 1] = a[0, dimen - 2]
sortingorder.append(list(np.diag(a[::-1])[::-1])) # add unique sampling orders
return sortingorder
# %%matrices of specific sampling order
[docs]
def samplingmatrix(a, c, p, sorder):
"""
Provides the triangular matrices for which the samples can be generated based on the specific sampling order.
Arguments:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer rffers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*sorder* : A list containing the specific sampling order of interest
Returns:
*ai* : The vine tree structure, based on the sampling order, provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*pi* : Parameters of the bivariate copulae, based on the sampling order, provided as a triangular matrix.
*ci* : The types of the bivariate copulae, based on the sampling order, provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
"""
dimen = a.shape[0] # dimension of data
order = pd.DataFrame(
columns=["node", "par", "cop", "l", "r", "tree"]
) # dataframe for vine copula structure
s = 0
for i in list(range(dimen - 1)):
for k in list(range(dimen - 1 - i)):
ak = a[:, k]
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
if i == 0:
single_row_values = {
"node": list(akn),
"par": p[i, k],
"cop": int(c[i, k]),
"l": akn[0],
"r": akn[1],
"tree": i,
}
else:
single_row_values = {
"node": list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1]),
"par": p[i, k],
"cop": int(c[i, k]),
"l": list(akn),
"r": list((ak.astype(int)[:i])[::-1]),
"tree": i,
}
order.loc[s] = single_row_values
s = s + 1
ai = np.empty((dimen, dimen)) # array for vine copula structure
ai[:] = np.nan
order["used"] = 0
ci = np.empty((dimen, dimen)) # array for vine copula copulas
ci[:] = np.nan
pi = np.empty((dimen, dimen)) # array for parameters
pi[:] = np.nan
pi = pi.astype(object)
for i in list(range(dimen - 1))[::-1]:
if i == 0:
k1 = order[(order.tree == i) & (order["used"] == 0)].node.iloc[0]
else:
k1 = order[(order.tree == i) & (order["used"] == 0)].l.iloc[0]
if sorder[i + 1] == max(k1):
k1 = sorted(k1, reverse=False)
else:
k1 = sorted(k1, reverse=True)
ii = dimen - 2 - i
ai[i : dimen - ii, ii] = k1
ci[i, ii] = order[(order.tree == i) & (order["used"] == 0)].cop.iloc[0]
pi[i, ii] = order[(order.tree == i) & (order["used"] == 0)].par.iloc[0]
order.loc[(order["tree"] == i) & (order["used"] == 0), "used"] = 1
s = k1[-1]
for j in list(range(0, i))[::-1]:
orde = order[(order.tree == j) & (order["used"] == 0)]
for k in range(len(orde)):
arr = np.array(orde.node.iloc[k][:2]).astype(int)
if np.isin(s, arr) == True:
inde = orde.iloc[k].name
ai[j, ii] = arr[arr != s][0]
ci[j, ii] = order["cop"][inde]
pi[j, ii] = order["par"][inde]
order["used"][inde] = 1
ai[0, dimen - 1] = ai[0, dimen - 2]
return ai, pi, ci
# %%
[docs]
def fit_conditionalvine(u1, vint, copsi, vine="R", condition=1, printing=True):
"""
Fit a regular vine copula which allows for a conditional sample of a variable of interest.
Arguments:
*u1* : the data, provided as a numpy array where each column contains a seperate variable (eg. u1,u2,...,un), which have already been transferred to standard uniform margins (0<= u <= 1)
*vint* : the variables of interest, provided as an integere or list that refers to the variables column numbers in u1, where the first column is 0 and the second column is 1, etc.
*copsi* : A list of integers referring to the copulae of interest for which the fit has to be evauluated in the vine copula. eg. a list of [1, 10] refers to the Gaussian and Frank copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
*vine* : The type of vine copula that needs to be fit, either 'R', 'D', or 'C'
*condition* : condition = 1 indicates that vint needs to be the conditionalized variables (at the end of the sampling order), condition = 2 indicates that vint need to be the conditioning variables (at the start of the sampling order)
*printing*: True if the fitted copula should be printed and False if not
Returns:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*p* : Parameters of the bivariate copulae provided as a triangular matrix.
*c* : The types of the bivariate copulae provided as a triangular matrix, composed of integers referring to the copulae with the best fit. eg. a 1 refers to the gaussian copula (see `Table 1 <https://vinecopulas.readthedocs.io/en/latest/vinecopulas.html#Fitting-a-Vine-Copula>`__).
"""
# Reference: Dißmann et al. 2013 adapted
dimen = u1.shape[1] # number of variables (number of columns)
if condition == 1: # if vint is the conditionalized variable
X = vint
try: # if X is an integer put it in a list
len(X)
except:
X = [X]
Y = list(range(dimen))
for i in X:
Y.remove(i) # Define Y
elif condition == 2: # if vint is the conditioning variable
Y = vint
try: # if Y is an integer put it in a list
len(Y)
except:
Y = [Y]
X = list(range(dimen))
for i in Y:
X.remove(i) # Define X
v1 = [] # list for variable 1
v2 = [] # list for variable 2
tauabs = [] # list for the absolute kendal tau between v1 and v2
for i in range(dimen - 1):
for j in range(i + 1, dimen):
v1.append(int(i)) # add variable to v1
v2.append(int(j)) # add variable to v2
tauabs.append(
abs(st.kendalltau(u1[:, i], u1[:, j])[0])
) # calculate the absolute kendall tau between v1 and v2 and add it to the taubs list
order1 = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": tauabs}
) # put the v1, v2, and tauabs list into a dataframe for the first tree
order1 = order1.sort_values(by="tauabs", ascending=False).reset_index(
drop=True
) # sort this dataframe from highest to lowest tauabs
Y2 = Y.copy() # create copies of Y
X2 = X.copy() # create copies of X
if vine == "R":
# first find the pairs with the highest kendall tau using variables in Y
if len(Y) > 1:
for j in range(len(Y) - 1):
# define all rows in order1 where only variables in Y are included
for z in range(j + 1, len(Y)):
if z == 1:
l = ((order1.v1 == Y[j]) & (order1.v2 == Y[z])) | (
(order1.v1 == Y[z]) & (order1.v2 == Y[j])
)
else:
l = (
l
| ((order1.v1 == Y[j]) & (order1.v2 == Y[z]))
| ((order1.v1 == Y[z]) & (order1.v2 == Y[j]))
)
inde = [] # list to put used rows of order1 in
l = np.where(l == True)[0]
# loop through the rows with variables in Y
for k in l:
if k == l[0]: # add first one to the tree
order2 = order1.loc[k].to_frame().T
inde.append(k) # add used rows to inde
if k in inde: # check if row has not been added before
continue
lst = (
list(order2.v2[order2.v1 == order1.v2[k]])
+ list(order2.v1[order2.v2 == order1.v2[k]])
+ list(order2.v2[order2.v1 == order1.v1[k]])
+ list(order2.v1[order2.v2 == order1.v1[k]])
)
l1 = list(order2.v2[order2.v1 == order1.v2[k]]) + list(
order2.v1[order2.v2 == order1.v2[k]]
)
l2 = list(order2.v2[order2.v1 == order1.v1[k]]) + list(
order2.v1[order2.v2 == order1.v1[k]]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v2[k])
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v1[k])
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
if len(lst) == len(set(lst)):
order2 = pd.concat(
[order2, order1.loc[k].to_frame().T], ignore_index=True
)
inde.append(k)
if (
len(order2) == len(Y) - 1
): # end loop when desired number of rows (number of variables in Y - 1) has been met
break
else: # if there is only one value in Y
for z in X:
# define all rows in order1 where variables in X are connected to variabels in Y
if z == X[0]:
l = ((order1.v1 == Y[0]) & (order1.v2 == z)) | (
(order1.v1 == z) & (order1.v2 == Y[0])
)
else:
l = (
l
| ((order1.v1 == Y[0]) & (order1.v2 == z))
| ((order1.v1 == z) & (order1.v2 == Y[0]))
)
inde = [] # list to put used rows of order1 in
l = np.where(l == True)[0]
order2 = order1.loc[l[0]].to_frame().T # add the first row to order 2
inde.append(l[0]) # add used rows to inde
# remove used variables in X from X and add them to Y
if order2.v1.iloc[-1] in X2:
Y2.append(int(order2.v1.iloc[-1]))
X2.remove(int(order2.v1.iloc[-1]))
else:
Y2.append(int(order2.v2.iloc[-1]))
X2.remove(int(order2.v2.iloc[-1]))
while len(order2) < (dimen - 1):
# Here we ensure that the last remaining X variable will be connected either to another original X variable, or a variable that is connected to an original X value
if len(order2) == dimen - 2 and len(X) > 1:
X3 = X.copy()
X3.remove(X2[0])
Y3 = []
for i in X3:
Y3 = (
Y3
+ list(order2.v1[order2.v2 == i])
+ list(order2.v2[order2.v1 == i])
+ [i]
)
Y3 = list(np.unique(Y3))
for j in Y3:
for z in X2:
if j == Y3[0] and z == X2[0]:
l = ((order1.v1 == j) & (order1.v2 == z)) | (
(order1.v1 == z) & (order1.v2 == j)
)
else:
l = (
l
| ((order1.v1 == j) & (order1.v2 == z))
| ((order1.v1 == z) & (order1.v2 == j))
)
l = np.where(l == True)[0]
else:
# define all rows in order1 where variables in X are connected to variabels in Y
for j in Y2:
for z in X2:
if j == Y2[0] and z == X2[0]:
l = ((order1.v1 == j) & (order1.v2 == z)) | (
(order1.v1 == z) & (order1.v2 == j)
)
else:
l = (
l
| ((order1.v1 == j) & (order1.v2 == z))
| ((order1.v1 == z) & (order1.v2 == j))
)
l = np.where(l == True)[0]
for k in l:
lst = (
list(order2.v2[order2.v1 == order1.v2[k]])
+ list(order2.v1[order2.v2 == order1.v2[k]])
+ list(order2.v2[order2.v1 == order1.v1[k]])
+ list(order2.v1[order2.v2 == order1.v1[k]])
)
l1 = list(order2.v2[order2.v1 == order1.v2[k]]) + list(
order2.v1[order2.v2 == order1.v2[k]]
)
l2 = list(order2.v2[order2.v1 == order1.v1[k]]) + list(
order2.v1[order2.v2 == order1.v1[k]]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v2[k])
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(order2.v2[order2.v1 == j]) + list(
order2.v1[order2.v2 == j]
)
try:
ln.remove(order1.v1[k])
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1):
# continue
if order1.v1[k] in X and order1.v2[k] in X:
skip = False
if order1.v2[k] in Y2:
numapear = sum(
(
(order2["v1"].apply(lambda x: order1.v2[k] == x))
| (order2["v2"].apply(lambda x: order1.v2[k] == x))
)
)
for i in Y:
if numapear == sum(
(
(order2["v1"].apply(lambda x: i == x))
| (order2["v2"].apply(lambda x: i == x))
)
):
skip = True
break
elif order1.v1[k] in Y2:
numapear = sum(
(
(order2["v1"].apply(lambda x: order1.v1[k] == x))
| (order2["v2"].apply(lambda x: order1.v1[k] == x))
)
)
for i in Y:
if numapear == sum(
(
(order2["v1"].apply(lambda x: i == x))
| (order2["v2"].apply(lambda x: i == x))
)
):
skip = True
break
if skip == True:
continue
if len(lst) == len(set(lst)):
order2 = pd.concat(
[order2, order1.loc[k].to_frame().T], ignore_index=True
)
inde.append(k) # add used rows to inde
# remove used variables in X from X and add them to Y
if order2.v1.iloc[-1] in X2:
Y2.append(int(order2.v1.iloc[-1]))
X2.remove(int(order2.v1.iloc[-1]))
else:
Y2.append(int(order2.v2.iloc[-1]))
X2.remove(int(order2.v2.iloc[-1]))
break
elif vine == "D":
if len(Y) > 1:
inde = [] # list to put used rows of order1 in
for j in range(len(Y) - 1):
# define all rows in order1 where only variables in Y are included
for z in range(j + 1, len(Y)):
if z == 1:
l = ((order1.v1 == Y[j]) & (order1.v2 == Y[z])) | (
(order1.v1 == Y[z]) & (order1.v2 == Y[j])
)
else:
l = (
l
| ((order1.v1 == Y[j]) & (order1.v2 == Y[z]))
| ((order1.v1 == Y[z]) & (order1.v2 == Y[j]))
)
l2 = list(order1[~l].index)
l = np.where(l == True)[0]
order2 = order1.loc[l[0]].to_frame().T.reset_index(drop=True)
vi = order2.v1[0] # select the first variable included in this row
vj = order2.v2[0] # select the second variable included in this row
while len(order2) < len(Y) - 1:
for i in l:
if (
order1.v1[i] == vj
or order1.v2[i] == vi
or order1.v1[i] == vi
or order1.v2[i] == vj
): # find the rows which include at least one of the two variables from the previous edge
if (
order1.v1[i] in inde or order1.v2[i] in inde
): # check if variables have already been connected to other variables twice
continue
if (order1.v1[i] == vi and order1.v2[i] == vj) or (
order1.v1[i] == vj and order1.v2[i] == vi
): # skip rows that have already been used
continue
else:
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T], ignore_index=True
) # if conditions are met, add this to the dataframe of the first tree
if (
order1.v1[i] == vj or order1.v2[i] == vj
): # check which value has been used twice already
inde.append(vj) # add this value to inde
if order1.v1[i] == vj:
vj = order1.v2[i] # select the new edge to connect to
else:
vj = order1.v1[i] # select the new edge to connect to
break
elif (
order1.v1[i] == vi or order1.v2[i] == vi
): # check which value has been used twice already
inde.append(vi) # add this value to inde
if order1.v1[i] == vi:
vi = order1.v2[i] # select the new edge to connect to
else:
vi = order1.v1[i] # select the new edge to connect to
break
while len(order2) < u1.shape[1] -1:
for i in l2:
if (
order1.v1[i] == vj
or order1.v2[i] == vi
or order1.v1[i] == vi
or order1.v2[i] == vj
): # find the rows which include at least one of the two variables from the previous edge
if (
order1.v1[i] in inde or order1.v2[i] in inde
): # check if variables have already been connected to other variables twice
continue
if (order1.v1[i] == vi and order1.v2[i] == vj) or (
order1.v1[i] == vj and order1.v2[i] == vi
): # skip rows that have already been used
continue
else:
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T], ignore_index=True
) # if conditions are met, add this to the dataframe of the first tree
if (
order1.v1[i] == vj or order1.v2[i] == vj
): # check which value has been used twice already
inde.append(vj) # add this value to inde
if order1.v1[i] == vj:
vj = order1.v2[i] # select the new edge to connect to
else:
vj = order1.v1[i] # select the new edge to connect to
elif (
order1.v1[i] == vi or order1.v2[i] == vi
): # check which value has been used twice already
inde.append(vi) # add this value to inde
if order1.v1[i] == vi:
vi = order1.v2[i] # select the new edge to connect to
else:
vi = order1.v1[i] # select the new edge to connect to
else:
inde = [] # lsit to put used variables in
for j in range(dimen - 1):
if j == 0:
order2 = order1.head(
1
) # loop through all the rows to include those pairs with the highest ktau first
vi = order2.v1[0] # select the first variable included in this row
vj = order2.v2[0] # select the second variable included in this row
else:
for i in range(len(order1)):
if (
order1.v1[i] == vj
or order1.v2[i] == vi
or order1.v1[i] == vi
or order1.v2[i] == vj
): # find the rows which include at least one of the two variables from the previous edge
if (
order1.v1[i] in inde or order1.v2[i] in inde
): # check if variables have already been connected to other variables twice
continue
if (order1.v1[i] == vi and order1.v2[i] == vj) or (
order1.v1[i] == vj and order1.v2[i] == vi
): # skip rows that have already been used
continue
else:
order2 = pd.concat(
[order2, order1.loc[i].to_frame().T],
ignore_index=True,
) # if conditions are met, add this to the dataframe of the first tree
if (
order1.v1[i] == vj or order1.v2[i] == vj
): # check which value has been used twice already
inde.append(vj) # add this value to inde
if order1.v1[i] == vj:
vj = order1.v2[
i
] # select the new edge to connect to
else:
vj = order1.v1[
i
] # select the new edge to connect to
elif (
order1.v1[i] == vi or order1.v2[i] == vi
): # check which value has been used twice already
inde.append(vi) # add this value to inde
if order1.v1[i] == vi:
vi = order1.v2[
i
] # select the new edge to connect to
else:
vi = order1.v1[
i
] # select the new edge to connect to
elif vine == "C":
taus = [] # list to put the sum of all tauabs in for a specific variable
for i in Y:
taus.append(
sum(order1[(order1.v1 == i) | (order1.v2 == i)].tauabs)
) # calculate the sum of all taus for a specific variable
i = Y[
np.where(np.array(taus) == max(taus))[0][0]
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
order2 = order1[(order1.v1 == i) | (order1.v2 == i)].reset_index(
drop=True
) # select the rows that include this variable
order2["both_in_Y"] = order2.apply(
lambda row: (row["v1"] in Y) and (row["v2"] in Y), axis=1
) # find the columns with values in Y
order2 = order2.sort_values(by="both_in_Y", ascending=False).reset_index(
drop=True
) # Sort the DataFrame by the 'both_in_Y' column
order2 = order2.drop(columns=["both_in_Y"]) # delete this column
order1 = order2 # make order1 == the first tree
del order2
rhos = [] # list for the rhos
node = [] # list for the nodes
cops = [] # list for the copulas
v1_1 = [] # list for the 1st nodes
v2_1 = [] # list for the 2nd nodes
aics = [] # list for the AIC's
for i in range(len(order1)):
v1i = int(order1.v1[i]) # first node
v2i = int(order1.v2[i]) # second node
u3 = np.vstack(
(u1[:, v1i], u1[:, v2i])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
node.append([v1i, v2i]) # create final node
v1_1.append(u1[:, v1i]) # add array of first node
v2_1.append(u1[:, v2i]) # add array of second node
v1_1 = np.array(v1_1).T # v1
v2_1 = np.array(v2_1).T # v2
# add information to dataframe of the first tree
order1["rhos"] = rhos
order1["node"] = node
order1["tree"] = 0
order1["cop"] = cops
order1["AIC"] = aics
# set up variables for the second tree
v1 = []
v2 = []
ktau = []
rhos = []
node = []
cops = []
v1_k = []
v2_k = []
aics = []
for i in range(
len(order1) - 1
): # loop through the first tree to identify all possible combination of nodes
v1i = int(order1.v1[i]) # parent node 1 from nodei in tree
v2i = int(order1.v2[i]) # parent node 2 from nodei in tree
copi = int(order1.cop[i]) # copula of nodei
pari = order1.rhos[i] # parameters of nodei
for j in (
np.where(
np.array([item == v1i for item in list(order1.v1[i + 1 :])])
| np.array([item == v1i for item in list(order1.v2[i + 1 :])])
| np.array([item == v2i for item in list(order1.v1[i + 1 :])])
| np.array([item == v2i for item in list(order1.v2[i + 1 :])])
)[0]
+ i
+ 1
): # see if possible connection between nodei and nodej
v1j = int(order1.v1[j]) # parent node 1 from nodej in tree
v2j = int(order1.v2[j]) # parent node 2 from nodej in tree
copj = int(order1.cop[j]) # copula of nodei
parj = order1.rhos[j] # parameters of nodei
v1.append(order1.node[i]) # parent node for next tree
v2.append(order1.node[j]) # parent node for next tree
lst = order1.node[i] + order1.node[j] # list of all variables in node
s = max(
set(lst), key=lst.count
) # variable that is common in both parent nodes
# parent node values
ui1 = v1_1[:, i]
ui2 = v2_1[:, i]
uj1 = v1_1[:, j]
uj2 = v2_1[:, j]
# defining which parent node the conditional CDF needs to be based on
if v1i == s:
uni = 1
vi1 = v2i
else:
uni = 2
vi1 = v1i
if v1j == s:
unj = 1
vj1 = v2j
else:
unj = 2
vj1 = v1j
# calculate the conditional CDF
v1igs = hfunc(copi, ui1, ui2, pari, un=uni)
v2jgs = hfunc(copj, uj1, uj2, parj, un=unj)
ktau.append(
abs(st.kendalltau(v1igs, v2jgs)[0])
) # calculate the absolute kendall tau between v1igs and v2jgs, add it to the ktau list
# add the parent node values
v1_k.append(v1igs)
v2_k.append(v2jgs)
# format the final node
node.append([vi1, vj1, "g", s])
k = 2 # second tree
orderk = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": ktau, "node": node}
) # put the v1, v2, tauabs, and the node in list into a dataframe for tree k
if vine == "R":
if len(orderk) > dimen - k:
sorder2 = order1.node # get all the unique parent nodes
# divide the unique nodes in Y and X
if len(Y) - 1 > 0:
Y2 = list(sorder2[: len(Y) - 1].reset_index(drop=True))
X2 = list(sorder2[len(Y) - 1 :].reset_index(drop=True))
else:
Y2 = list(sorder2[:1].reset_index(drop=True))
X2 = list(sorder2[1:].reset_index(drop=True))
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
indexi = list(
orderk.index
) # create a list the original order of the dataframe prior to sorting
orderk = orderk.reset_index(drop=True) # reset the index
inde = [] # list to put used rows of orderk i
inde2 = (
[]
) # list to put used rows of orderk in based on their oroginal position in the dataframe
Y3 = Y2.copy()
X3 = X2.copy()
if len(Y2) == 1: # if there is only one node in Y
# define all rows in orderk where the nodes in X are connected to nodes in Y
for j in Y2:
for z in X2:
if j == Y2[0] and z == X2[0]:
l = (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
) | (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
)
| (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
)
l = np.where(l == True)[0]
order = orderk.loc[l[0]].to_frame().T
inde.append(l[0])
inde2.append(indexi[l[0]]) # add used rows to inde (original row value)
# remove used nodes in X from X and add them to Y
if order.v1.iloc[-1] in X3:
X3.remove(order.v1.iloc[-1])
Y3.append(order.v1.iloc[-1])
else:
X3.remove(order.v2.iloc[-1])
Y3.append(order.v2.iloc[-1])
else:
# first find the pairs with the highest kendall tau using nodes in Y
for j in range(len(Y2) - 1):
# define all rows in orderk where only nodes in Y are included
for z in range(j + 1, len(Y2)):
if z == 1:
l = (
(orderk["v1"].apply(lambda x: Y2[j] == x))
& (orderk["v2"].apply(lambda x: Y2[z] == x))
) | (
(orderk["v1"].apply(lambda x: Y2[z] == x))
& (orderk["v2"].apply(lambda x: Y2[j] == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: Y2[j] == x))
& (orderk["v2"].apply(lambda x: Y2[z] == x))
)
| (
(orderk["v1"].apply(lambda x: Y2[z] == x))
& (orderk["v2"].apply(lambda x: Y2[j] == x))
)
)
l = np.where(l == True)[0]
# loop through the rows with nodes in Y
for k in l:
if k == l[0]: # add first one to the tree
order = orderk.loc[k].to_frame().T
inde.append(k)
inde2.append(
indexi[k]
) # add used rows to inde (original row value)
if k in inde: # check if row has not been added before
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[k])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[k])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[k])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[k])
]
)
)
l1 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v1[k])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v1[k])]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v1[k]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v2[k])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v2[k])]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v2[k]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1): # ensure that there are no groups of variables that all have a connection with one another
# continue
if len(lst) == len(set(lst)):
order = pd.concat(
[order, orderk.loc[k].to_frame().T], ignore_index=True
)
inde.append(k)
inde2.append(indexi[k])
if (
len(order) == len(Y2) - 1
): # end loop when desired number of rows (number of variables in Y - 1) has been met
break
while len(order) < (dimen - 2):
# define all rows in orderk where nodes in X are connected to nodes in Y
for j in Y3:
for z in X3:
if j == Y3[0] and z == X3[0]:
l = (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
) | (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
)
| (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
)
l = np.where(l == True)[0]
for k in l:
if k in inde:
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[k])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[k])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[k])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[k])
]
)
)
l1 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v1[k])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v1[k])]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v1[k]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[order.v1.astype(str) == str(orderk.v2[k])]
) + list(
order.v1.astype(str)[order.v2.astype(str) == str(orderk.v2[k])]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(order.v1.astype(str)[order.v2.astype(str) == j])
try:
ln.remove(str(orderk.v2[k]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1):
# continue
if len(lst) == len(set(lst)):
order = pd.concat(
[order, orderk.loc[k].to_frame().T], ignore_index=True
)
inde.append(k)
inde2.append(indexi[k])
# remove used variables in X from X and add them to Y
if order.v1.iloc[-1] in X3:
X3.remove(order.v1.iloc[-1])
Y3.append(order.v1.iloc[-1])
# X2.remove(int(order2.v1.iloc[-1]))
else:
X3.remove(order.v2.iloc[-1])
Y3.append(order.v2.iloc[-1])
break
orderk = order
k = 2
# orderk = orderk.sort_values(by='tauabs', ascending=False)
v1_k = np.array([v1_k[ind] for ind in inde]).T # sort array of first node
v2_k = np.array([v2_k[ind] for ind in inde]).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
else:
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
elif vine == "D":
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
elif vine == "C":
if len(orderk) > dimen - k:
subnodes = order1[
["v1", "v2"]
].values.tolist() # see all the unique parent nodes
taus = [] # list to put the sum of all tauabs in for a specific nodes
for i in range(len(Y) - 1):
orderksub = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
]
taus.append(
sum(orderksub.tauabs)
) # calculate the sum of all taus for a specific variable
i = np.where(np.array(taus) == max(taus))[0][
0
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
orderk = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
] # select rows where this node is included
inde = list(orderk.index)
v1_k = np.array([v1_k[ind] for ind in inde]).T # sort array of first node
v2_k = np.array([v2_k[ind] for ind in inde]).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
else:
orderk = orderk.sort_values(by="tauabs", ascending=False)
v1_k = np.array([v1_k[ind] for ind in orderk.index]).T
v2_k = np.array([v2_k[ind] for ind in orderk.index]).T
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
order2 = orderk.copy()
for i in range(len(order2)):
u3 = np.vstack(
(v1_2[:, i], v2_2[:, i])
).T # stacking the combination to fit copula to
cop, rho, aic = bestcop(copsi, u3) # fit the best copula
aics.append(aic) # add AIC to aics
rhos.append(rho) # add parameters to rhos
cops.append(cop) # add copula to cops
# add information to dataframe of tree k
order2["rhos"] = rhos
order2["cop"] = cops
order2["AIC"] = aics
# loop through remaining trees if there are more than 3 variables
if dimen > 3:
for k in range(3, dimen):
order = locals()[
"order" + str(k - 1)
].copy() # select the struture of the previous tree
v1s = locals()[
"v1_" + str(k - 1)
].copy() # select the first nodes of the previous tree
v2s = locals()[
"v2_" + str(k - 1)
].copy() # select the second nodes of the previous tree
# create lists for the variables
v1_k = []
v2_k = []
v1 = []
v2 = []
ktau = []
rhos = []
cops = []
node = []
lk = []
aics = []
rk = []
for i in range(len(order) - 1):
v1i = order.v1[i].copy() # parent node 1 from nodei in tree
v2i = order.v2[i].copy() # parent node 2 from nodei in tree
copi = int(order.cop[i]) # copula of nodei
pari = order.rhos[i] # parameters of nodei
for j in (
np.where(
np.array([item == v1i for item in list(order.v1[i + 1 :])])
| np.array([item == v1i for item in list(order.v2[i + 1 :])])
| np.array([item == v2i for item in list(order.v1[i + 1 :])])
| np.array([item == v2i for item in list(order.v2[i + 1 :])])
)[0]
+ i
+ 1
): # see if possible connection between nodei and nodej
v1i = order.v1[i].copy() # parent node 1 from nodei in tree
v2i = order.v2[i].copy() # parent node 2 from nodei in tree
copj = int(order.cop[j]) # copula of nodei
parj = order.rhos[j] # parameters of nodei
nodei = order.node[i] # nodei
nodej = order.node[j] # nodej
v1j = order.v1[j].copy() # parent node 1 from nodej in tree
v2j = order.v2[j].copy() # parent node 2 from nodej in tree
v1.append(nodei) # new parent nodes 1
v2.append(nodej) # new parent nodes 2
n = 2
ri = nodei[n + 1 :] # select the values on the right side of node i
rj = nodej[n + 1 :] # select the values on the right side of node i
if "g" in v1j:
v1j.remove("g")
v2j.remove("g")
v1i.remove("g")
v2i.remove("g")
# define left and right side of the node in tree k
if rj == ri:
if len(v1j) == 2:
lst = nodei[:n] + nodej[:n]
r3 = [max(set(lst), key=lst.count)]
li = [value for value in nodei[:n] if value not in r3]
lj = [value for value in nodej[:n] if value not in r3]
r = list(np.unique(ri + r3))
else:
lst = v1i[:n] + v2i[:n] + v1j[:n] + v2j[:n]
for s in ri:
lst = [x for x in lst if x != s]
r3 = [max(set(lst), key=lst.count)]
li = [value for value in nodei[:n] if value not in r3]
lj = [value for value in nodej[:n] if value not in r3]
r = list(np.unique(ri + r3))
l = list(np.unique(li + lj))
else:
r = list(np.unique(ri + rj))
li = [value for value in nodei[:n] if value not in rj]
lj = [value for value in nodej[:n] if value not in ri]
if li == lj:
lst = v1i[:n] + v2i[:n] + v1j[:n] + v2j[:n]
lst = [x for x in lst if x != li[0]]
l3 = [min(set(lst), key=lst.count)]
l = list(np.unique(li + l3))
else:
l = list(np.unique(li + lj))
# select the parent node values
ui1 = v1s[:, i]
ui2 = v2s[:, i]
uj1 = v1s[:, j]
uj2 = v2s[:, j]
if set(r).issubset(set(v1i)):
uni = 1
elif set(r).issubset(set(v2i)):
uni = 2
elif set(rj).issubset(set(v2i[1:])):
uni = 2
elif set(rj).issubset(set(v1i[1:])):
uni = 1
if set(r).issubset(set(v1j)):
unj = 1
elif set(r).issubset(set(v2j)):
unj = 2
elif set(ri).issubset(set(v2j[1:])):
unj = 2
elif set(ri).issubset(set(v1j[1:])):
unj = 1
# calculate the conditional CDF
v1igs = hfunc(copi, ui1, ui2, pari, un=uni)
v2jgs = hfunc(copj, uj1, uj2, parj, un=unj)
ktau.append(
abs(st.kendalltau(v1igs, v2jgs)[0])
) # calculate the absolute kendall tau between v1igs and v2jgs, add it to the ktau list
# add the parent node values
v1_k.append(v1igs)
v2_k.append(v2jgs)
del uj1, ui1, ui2, uj2
node.append(l + ["g"] + r) # set node name
lk.append(l) # add left side of node
rk.append(r) # add rigth side of node
orderk = pd.DataFrame(
{"v1": v1, "v2": v2, "tauabs": ktau, "node": node, "l": lk, "r": rk}
) # put information in dataframe for tree k
if vine == "R":
if len(orderk) > dimen - k:
sorder2 = order.node # get all the unique parent nodes
# divide the unique nodes in Y and X
if (len(Y) + 1 - k) > 1:
Y2 = list(sorder2[: len(Y) - k].reset_index(drop=True))
X2 = list(sorder2[len(Y) - k :].reset_index(drop=True))
else:
Y2 = list(sorder2[:1].reset_index(drop=True))
X2 = list(sorder2[1:].reset_index(drop=True))
Y3 = Y2.copy()
X3 = X2.copy()
orderk = orderk.sort_values(by="tauabs", ascending=False)
indexi = list(orderk.index)
orderk = orderk.reset_index(drop=True)
inde = []
inde2 = []
if len(Y2) > 1:
for j in range(len(Y2) - 1):
for z in range(j + 1, len(Y2)):
if z == 1:
l = (
(orderk["v1"].apply(lambda x: Y2[j] == x))
& (orderk["v2"].apply(lambda x: Y2[z] == x))
) | (
(orderk["v1"].apply(lambda x: Y2[z] == x))
& (orderk["v2"].apply(lambda x: Y2[j] == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: Y2[j] == x))
& (orderk["v2"].apply(lambda x: Y2[z] == x))
)
| (
(orderk["v1"].apply(lambda x: Y2[z] == x))
& (orderk["v2"].apply(lambda x: Y2[j] == x))
)
)
l = np.where(l == True)[0]
for z in l:
if z == l[0]:
order = orderk.loc[z].to_frame().T
inde.append(z)
inde2.append(indexi[z])
if z in inde:
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[z])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[z])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[z])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[z])
]
)
)
l1 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[z])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[z])
]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v1[z]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[z])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[z])
]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v2[z]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1):
# continue
if len(lst) == len(set(lst)):
order = pd.concat(
[order, orderk.loc[z].to_frame().T],
ignore_index=True,
)
inde.append(z)
inde2.append(indexi[z])
break
else:
for j in Y2:
for z in X2:
if j == Y2[0] and z == X2[0]:
l = (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
) | (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
)
| (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
)
l = np.where(l == True)[0]
order = orderk.loc[l[0]].to_frame().T
inde.append(l[0])
inde2.append(indexi[l[0]])
if order.v1.iloc[-1] in X3:
X3.remove(order.v1.iloc[-1])
Y3.append(order.v1.iloc[-1])
# X2.remove(int(order2.v1.iloc[-1]))
else:
X3.remove(order.v2.iloc[-1])
Y3.append(order.v2.iloc[-1])
while len(order) < (dimen - k):
for j in Y3:
for z in X3:
if j == Y3[0] and z == X3[0]:
l = (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
) | (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
else:
l = (
l
| (
(orderk["v1"].apply(lambda x: j == x))
& (orderk["v2"].apply(lambda x: z == x))
)
| (
(orderk["v1"].apply(lambda x: z == x))
& (orderk["v2"].apply(lambda x: j == x))
)
)
l = np.where(l == True)[0]
for z in l:
if z in inde:
continue
lst = (
list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[z])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[z])
]
)
+ list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[z])
]
)
+ list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[z])
]
)
)
l1 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v1[z])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v1[z])
]
)
lk = l1.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v1[z]))
except:
pass
for s in l1:
try:
ln.remove(s)
except:
pass
l1 = l1 + ln
lk = lk + ln
l2 = list(
order.v2.astype(str)[
order.v1.astype(str) == str(orderk.v2[z])
]
) + list(
order.v1.astype(str)[
order.v2.astype(str) == str(orderk.v2[z])
]
)
lk = l2.copy()
while len(lk) > 0:
lk2 = lk.copy()
lk = []
for j in lk2:
ln = list(
order.v2.astype(str)[order.v1.astype(str) == j]
) + list(
order.v1.astype(str)[order.v2.astype(str) == j]
)
try:
ln.remove(str(orderk.v2[z]))
except:
pass
for s in l2:
try:
ln.remove(s)
except:
pass
l2 = l2 + ln
lk = lk + ln
skip = False
for val in l1:
if val in l2:
skip = True
break
if skip == True:
continue
# if (len(l1) > 1 and len(l2) >0) or (len(l1) > 0 and len(l2) >1):
# continue
# if orderk.iloc[z].v1 in X2 and orderk.iloc[z].v2 in X2:
# skip = False
# if orderk.iloc[z].v2 in Y3:
# numapear = sum(((order['v1'].apply(lambda x: orderk.iloc[z].v2== x )) | (order['v2'].apply(lambda x: orderk.iloc[z].v2== x))))
# for i in Y2:
# if numapear == sum(((order['v1'].apply(lambda x: i== x )) | (order['v2'].apply(lambda x: i== x)))):
# skip = True
# break
# elif orderk.iloc[z].v1 in Y3:
# numapear = sum(((order['v1'].apply(lambda x: orderk.iloc[z].v1== x )) | (order['v2'].apply(lambda x: orderk.iloc[z].v1== x))))
# for i in Y2:
# if numapear == sum(((order['v1'].apply(lambda x: i== x )) | (order['v2'].apply(lambda x: i== x)))):
# skip = True
# break
# if skip == True:
# continue
if len(lst) == len(set(lst)):
order = pd.concat(
[order, orderk.loc[z].to_frame().T],
ignore_index=True,
)
inde.append(z)
inde2.append(indexi[z])
if order.v1.iloc[-1] in X3:
X3.remove(order.v1.iloc[-1])
Y3.append(order.v1.iloc[-1])
# X2.remove(int(order2.v1.iloc[-1]))
else:
X3.remove(order.v2.iloc[-1])
Y3.append(order.v2.iloc[-1])
break
# orderk = orderk.sort_values(by='tauabs', ascending=False)
orderk = order
v1_k = np.array([v1_k[ind] for ind in inde2]).T
v2_k = np.array([v2_k[ind] for ind in inde2]).T
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
else:
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
elif vine == "D":
orderk = orderk.sort_values(
by="tauabs", ascending=False
) # sort this dataframe from highest to lowest tauabs
v1_k = np.array(
[v1_k[ind] for ind in orderk.index]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in orderk.index]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_2 = v1_k.copy()
v2_2 = v2_k.copy()
elif vine == "C":
if len(orderk) > dimen - k:
subnodes = list(order.node) # see all the unique parent nodes
taus = (
[]
) # list to put the sum of all tauabs in for a specific nodes
if (len(Y) + 1 - k) > 1:
for i in range(len(Y) + 1 - k):
orderksub = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
]
taus.append(
sum(orderksub.tauabs)
) # calculate the sum of all taus for a specific variable
i = np.where(np.array(taus) == max(taus))[0][
0
] # find where the sum of the taus is the highest to find the vairable to place in the center of the first tree
orderk = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[i]))
| (orderk["v2"].apply(lambda x: x == subnodes[i]))
] # select rows where this node is included
else:
orderk = orderk[
(orderk["v1"].apply(lambda x: x == subnodes[0]))
| (orderk["v2"].apply(lambda x: x == subnodes[0]))
]
inde = list(orderk.index)
v1_k = np.array(
[v1_k[ind] for ind in inde]
).T # sort array of first node
v2_k = np.array(
[v2_k[ind] for ind in inde]
).T # sort array of second node
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
v1_k = v1_k.copy()
v2_k = v2_k.copy()
else:
orderk = orderk.sort_values(by="tauabs", ascending=False)
v1_k = np.array([v1_k[ind] for ind in orderk.index]).T
v2_k = np.array([v2_k[ind] for ind in orderk.index]).T
orderk = orderk.reset_index(drop=True)
orderk["tree"] = k - 1
for j in range(len(orderk)):
u3 = np.vstack((v1_k[:, j], v2_k[:, j])).T
cop, rho, aic = bestcop(copsi, u3)
aics.append(aic)
rhos.append(rho)
cops.append(cop)
orderk["rhos"] = rhos
orderk["cop"] = cops
orderk["AIC"] = aics
locals()["v1_" + str(k)] = v1_k
locals()["v2_" + str(k)] = v2_k
locals()["order" + str(k)] = orderk
order = pd.DataFrame(columns=order1.columns)
for i in range(1, dimen):
order = pd.concat([order, locals()["order" + str(i)]]).reset_index(drop=True)
a = np.empty((dimen, dimen))
c = np.empty((dimen, dimen))
a[:] = np.nan
c[:] = np.nan
order["used"] = 0
combinations = list(product([True, False], repeat=dimen))
for i in list(range(dimen - 1))[::-1]:
k1 = sorted(
np.array(
order[(order.tree == i) & (order["used"] == 0)].node.iloc[0][:2]
).astype(int)
)[::-1]
order.loc[(order["tree"] == i) & (order["used"] == 0), "used"] = 1
t1 = i - 1
ii = dimen - 2 - i
a[i : dimen - ii, ii] = k1
s = k1[-1]
for j in list(range(0, i))[::-1]:
orde = order[(order.tree == j) & (order["used"] == 0)]
for k in range(len(orde)):
arr = np.array(orde.node.iloc[k][:2]).astype(int)
if np.isin(s, arr) == True:
inde = orde.iloc[k].name
a[j, ii] = arr[arr != s][0]
order["used"][inde] = 1
continue
a[0, dimen - 1] = a[0, dimen - 2]
orderk = pd.DataFrame(columns=order.columns)
p = np.empty((dimen, dimen))
p[:] = np.nan
p = p.astype(object)
for i in list(range(dimen - 1)):
orde = order[order.tree == i]
for k in list(range(dimen - 1 - i)):
ak = a[:, k]
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
for j in range(len(orde)):
arr = np.array(orde.node.iloc[j][:2]).astype(int)
if sum(np.isin(akn, arr)) == 2:
orderj = order.loc[[orde.index[j]]]
p[i, k] = orderj.rhos.iloc[0]
c[i, k] = orderj.cop.iloc[0]
if i == 0:
orderj.node.iloc[0] = list(akn)
else:
orderj.node.iloc[0] = (
list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1])
)
orderk = pd.concat([orderk, orderj]).reset_index(drop=True)
sorder = list(np.diag(a[::-1])[::-1])
if set(sorder[-len(X) :]) != set(X):
sorders = samplingorder(a)
if len(Y) >= len(X):
for sorder in sorders:
if set(sorder[-len(X) :]) == set(X):
break
else:
for sorder in sorders:
if set(sorder[: len(Y)]) == set(Y):
break
a, p, c = samplingmatrix(a, c, p, sorder)
if printing == True:
for i in list(range(0, dimen - 1)):
orde = orderk[orderk.tree == i].reset_index(drop=True)
print("** Tree: ", i + 1)
for j in range(len(orde)):
if i != 0:
nodej = ",".join(map(str, orde.node[j])).replace(",|,", "|")
else:
nodej = ",".join(map(str, orde.node[j]))
print(
nodej,
" ---> ",
copulas[int(orde.cop[j])],
": parameters = ",
orde.rhos[j],
)
return a, p, c
# %%
[docs]
def plotvine(a, plottitle=None, variables=None, savepath=None):
"""
Plots the vine structure
Arguments:
*a* : The vine tree structure provided as a triangular matrix, composed of integers. The integer refers to different variables depending on which column the variable was in u1, where the first column is 0 and the second column is 1, etc.
*pltotitle* : title of the plot
*savepath* : path to save the plot
Returns:
*plot*
"""
dimen = a.shape[0] # dimension of copula
order = pd.DataFrame(
columns=["node", "l", "r", "tree"]
) # dataframe with vine copula structure.
s = 0
for i in list(range(dimen - 1)):
for k in list(range(dimen - 1 - i)):
ak = a[:, k]
akn = np.array([ak[-1 - k], ak[i]]).astype(int)
if i == 0:
single_row_values = {
"node": list(akn),
"l": akn[0],
"r": akn[1],
"tree": i,
}
else:
single_row_values = {
"node": list(akn) + ["|"] + list((ak.astype(int)[:i])[::-1]),
"l": list(akn),
"r": list((ak.astype(int)[:i])[::-1]),
"tree": i,
}
order.loc[s] = single_row_values
s = s + 1
for t in list(range(dimen - 1)):
orderk = order[order.tree == t].reset_index(drop=True)
if t == 0:
orderk["v1"] = orderk.l
orderk["v2"] = orderk.r
rhos = []
v1_1 = []
v2_1 = []
cops = []
for j in range(len(orderk)):
v1i = int(orderk.v1[j]) # first node
v2i = int(orderk.v2[j]) # second node
locals()["order" + str(t + 1)] = orderk
else:
v1k = []
v2k = []
for j in range(len(orderk)):
orderk2 = order[order.tree == t - 1].reset_index(drop=True)
l = orderk.l[j] + orderk.r[j]
subnodes = []
for k in range(len(orderk2)):
subnodes.append(sum(1 for item in orderk2.node[k] if item in l))
subnodes = np.array(subnodes) == len(l) - 1
orderk2 = orderk2[subnodes].reset_index(drop=True)
v1k.append(orderk2.node[0])
v2k.append(orderk2.node[1])
orderk["v1"] = v1k
orderk["v2"] = v2k
locals()["order" + str(t + 1)] = orderk
n = dimen - 1
fig, axes = plt.subplots(dimen - 1, 1, figsize=(n * 2, n * 3)) # 5 rows, 1 column
leg_labels = {}
if variables != None:
for i in range(len(variables)):
leg_labels.update({i: variables[i]})
for t, ax in zip(
range(1, dimen), axes.flat
): # Iterate through subplots and loop indices
if t == 1:
orderk = locals()["order" + str(t)]
edges = list(orderk.node)
edges = [tuple(sublist) for sublist in edges]
edge_labels = {
edge: ",".join(map(str, orderk.node[i])) for i, edge in enumerate(edges)
}
elif t == 2:
orderk = locals()["order" + str(t)]
edges = [
(",".join(map(str, orderk.v1[i])), ",".join(map(str, orderk.v2[i])))
for i in range(len(orderk))
]
edge_labels = {
edge: ",".join(map(str, orderk.node[i])).replace(",|,", "|")
for i, edge in enumerate(edges)
}
else:
orderk = locals()["order" + str(t)]
edges = [
(
",".join(map(str, orderk.v1[i])).replace(",|,", "|"),
",".join(map(str, orderk.v2[i])).replace(",|,", "|"),
)
for i in range(len(orderk))
]
edge_labels = {
edge: ",".join(map(str, orderk.node[i])).replace(",|,", "|")
for i, edge in enumerate(edges)
}
G = nx.Graph() # Create graph.
G.add_edges_from(edges) # Add edges to the graph
pos = nx.spring_layout(G) # Position the nodes using the spring layout
d = nx.degree(G)
try:
sizes = len(edges[0][0]) * 400
except:
sizes = len(edges[0]) * 400
nx.draw_networkx_labels(
G, pos, ax=ax, bbox=dict(facecolor="skyblue"), font_size=13
) # Draw labels.
nx.draw_networkx_edges(
G,
pos,
edge_color="black",
ax=ax,
) # Draw black edges between nodes
nx.draw_networkx_edge_labels(
G, pos, edge_labels=edge_labels, font_color="black", ax=ax, font_size=12
) # Draw text labels above each edge
ax.axis("off") # Remove the box around the plot
ax.set_title(f"Tree {t}") # Set title for the subplot
# title
if plottitle != None:
fig.suptitle(plottitle, fontsize=16, y=0.93)
if variables != None:
plt.text(
0.9,
0.5,
"\n".join([f"{key} : {value}" for key, value in leg_labels.items()]),
transform=plt.gcf().transFigure,
fontsize=15,
verticalalignment="center",
)
# save
if savepath != None:
plt.savefig(savepath, dpi=300, bbox_inches="tight")
plt.show() # Show plot.