This R session will illustrate the need for heavy-tailed estimators when it comes to financial data (the R package fitHeavyTail
is specifically designed for that purpose).
(Useful R links: Cookbook R, Quick-R, R documentation, CRAN, METACRAN.)
Let’s load the S&P 500 index:
library(quantmod)
GSPC <- getSymbols("^GSPC", from = "2010-01-01", to = "2015-12-31", auto.assign = FALSE)
prices <- Cl(GSPC)
head(log(prices))
#> GSPC.Close
#> 2010-01-04 7.032615
#> 2010-01-05 7.035726
#> 2010-01-06 7.036272
#> 2010-01-07 7.040265
#> 2010-01-08 7.043142
#> 2010-01-11 7.044888
tail(log(prices))
#> GSPC.Close
#> 2015-12-22 7.620200
#> 2015-12-23 7.632542
#> 2015-12-24 7.630942
#> 2015-12-28 7.628761
#> 2015-12-29 7.639334
#> 2015-12-30 7.632091
Let’s do some plotting:
# plot log-prices
plot(log(prices), col = 'blue', lwd = 2, ylab = "log-price", main = "S&P 500 index")
# plot log-returns
R_daily <- diff(log(prices))[-1]
plot(R_daily, col = 'blue', lwd = 1, ylab = "log-return", main = "S&P 500 index")
Now, let’s see how heavy the tails are:
# histogram to illustrate the heavy tails and the gain/loss asymmetry
h <- hist(as.vector(R_daily), breaks = 100, prob = TRUE, col = "lightgray",
          xlab = "return", main = "Histogram of log-returns")
xfit <- seq(min(R_daily), max(R_daily), length = 100)
# overlay a Gaussian pdf fitted (by hand) to the bulk of the distribution;
# its light tails contrast with the much heavier empirical tails
yfit <- dnorm(xfit, mean = mean(R_daily)+5e-4, sd = 0.55*sd(R_daily))
lines(xfit, yfit, col = "blue", lwd = 2)
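Beyond the visual comparison, a quick numerical check is the sample excess kurtosis, which is zero for Gaussian data and positive for heavy-tailed data (a minimal sketch in base R):
# excess kurtosis of the daily log-returns (0 for Gaussian data, positive for heavy tails)
x <- as.vector(R_daily)
mean((x - mean(x))^4) / mean((x - mean(x))^2)^2 - 3
# a Q-Q plot against the normal distribution tells the same story
qqnorm(x)
qqline(x, col = "blue", lwd = 2)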
We can conclude that financial data is clearly heavy-tailed. Therefore, the traditional sample estimators are likely to perform poorly, as we will explore next.
Let’s warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: \[ \begin{aligned} \boldsymbol{\mu} & = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\\ \boldsymbol{\Sigma} & = \frac{1}{T-1}\sum_{t=1}^T(\mathbf{x}_t-\boldsymbol{\mu})(\mathbf{x}_t-\boldsymbol{\mu})^T. \end{aligned} \]
# generate Gaussian synthetic return data
library(mvtnorm)
set.seed(357)
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvnorm(n = T, mean = mu, sigma = R_cov)
str(X)
#> num [1:1000, 1:200] 0.86 -3.206 2.803 -7.817 0.402 ...
# classical mean estimator (sample mean)
mu_classical <- colMeans(X)
norm(mu_classical - mu, "2")
#> [1] 1.655746
# classical covariance estimator (sample covariance matrix)
R_classical <- cov(X)
norm(R_classical - R_cov, "F")
#> [1] 97.09152
Now, let’s add some outliers to the data and see how significantly the performance degrades.
library(ellipse)
set.seed(42)
# introduce outliers
num_outliers <- round(0.05*T)
X_outliers <- X
t_indices <- sample(T, num_outliers)
n_indices <- sample(N, num_outliers)
ave_std <- sqrt(mean(diag(R_cov)))
X_outliers[cbind(t_indices, n_indices)] <- 10*ave_std*X_outliers[cbind(t_indices, n_indices)]
# scatter plot
i1 <- n_indices[1]
i2 <- n_indices[2]
plot(X[, i1], X[, i2],
     main = "Scatter plot of Gaussian returns", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(cov(X[, c(i1, i2)])), col = "black", lwd = 2)
lines(ellipse(cov(X_outliers[, c(i1, i2)])), col = "blue", lwd = 2)
plot(X_outliers[, i1], X_outliers[, i2],
     main = "Scatter plot of Gaussian returns with outliers", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(cov(X[, c(i1, i2)])), col = "black", lwd = 2)
lines(ellipse(cov(X_outliers[, c(i1, i2)])), col = "blue", lwd = 2)
Let’s now generate heavy-tailed synthetic data and see the performance of the classical sample estimators.
set.seed(42)
# generate heavy-tailed synthetic return data
nu <- 4 # degrees of freedom (measures how heavy the tail is)
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)
# scatter plot
tmp <- sample(N, 2) #choose two assets randomly
i1 <- tmp[1]
i2 <- tmp[2]
plot(X[, i1], X[, i2],
     main = "Scatter plot of heavy-tailed returns", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(R_cov[c(i1,i2), c(i1,i2)]), col = "black", lwd = 2)
lines(ellipse(cov(X[, c(i1, i2)])), col = "blue", lwd = 2)
Let’s now try the Tyler estimator for heavy-tailed distributions. Assuming zero mean, the sample covariance matrix is \[ \boldsymbol{\Sigma} = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\mathbf{x}_t^T, \] whereas the Tyler estimator is the solution to the fixed-point equation \[ \boldsymbol{\Sigma} = \frac{N}{T}\sum_{t=1}^T\frac{\mathbf{x}_t\mathbf{x}_t^T}{\mathbf{x}_t^T\boldsymbol{\Sigma}^{-1}\mathbf{x}_t}, \] which requires an iterative algorithm in practice.
# sample covariance matrix (assuming zero mean)
R_classical <- (1/T) * t(X) %*% X
# let's define a function for Tyler's estimator
estimateTyler <- function(X, verbose = FALSE) {
  T <- nrow(X)  # number of observations
  N <- ncol(X)  # number of variables
  max_iter <- 100
  error_th_Sigma <- 1e-3
  # Gaussian initial point
  Sigma <- cov(X)
  Sigma <- Sigma/sum(diag(Sigma))
  # loop
  obj_value_record <- Sigma_diff_record <- rep(NA, max_iter)
  for (k in 1:max_iter) {
    Sigma_prev <- Sigma
    # Tyler update
    weights <- 1/rowSums(X * (X %*% solve(Sigma)))  # 1/diag( X %*% inv(Sigma) %*% t(X) )
    obj_value_record[k] <- - (N/2)*sum(log(weights)) + (T/2)*sum(log(eigen(Sigma)$values))
    Sigma <- (N/T) * crossprod( sqrt(weights)*X )  # (N/T) * t(X) %*% diag(weights) %*% X
    Sigma <- Sigma/sum(diag(Sigma))
    # stopping criterion
    Sigma_diff_record[k] <- norm(Sigma - Sigma_prev, "F")/norm(Sigma_prev, "F")
    if (Sigma_diff_record[k] < error_th_Sigma)
      break
  }
  obj_value_record <- obj_value_record[1:k]
  Sigma_diff_record <- Sigma_diff_record[1:k]
  if (verbose)
    plot(obj_value_record, type = "b", col = "blue",
         main = "Convergence of objective value", xlab = "iterations", ylab = "obj value")
  # finally, recover the missing scaling factor
  sigma2 <- apply(X, 2, var)
  d <- diag(Sigma)
  kappa <- sum(sigma2*d)/sum(d*d)
  Sigma <- kappa*Sigma
  return(Sigma)
}
R_Tyler <- estimateTyler(X, verbose = TRUE)
# estimation error
norm(R_classical - R_cov, "F")
#> [1] 154.5947
norm(R_Tyler - R_cov, "F")
#> [1] 94.29244
It is now clear that Tyler’s estimator is much better than the naive sample covariance matrix when the data is heavy-tailed. But what if the data were Gaussian? What would be the penalty for using Tyler’s estimator when the sample covariance matrix would have been perfectly fine?
# generate Gaussian synthetic return data
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvnorm(n = T, mean = mu, sigma = R_cov)
# estimators
R_classical <- (1/T) * t(X) %*% X
R_Tyler <- estimateTyler(X)
# estimation error
norm(R_classical - R_cov, "F")
#> [1] 94.96846
norm(R_Tyler - R_cov, "F")
#> [1] 95.59725
Interestingly, we lose virtually nothing! So we might as well stick to heavy-tailed estimators all the time, just in case.
Let’s generate heavy-tailed data with a nonzero mean.
# generate heavy-tailed synthetic return data
nu <- 4 # degrees of freedom (measures how heavy the tail is)
N <- 200
T <- 1000
mu <- rnorm(N) # nonzero mean!
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)
# estimators wrongly assuming zero mean
R_classical <- (1/T) * t(X) %*% X
R_Tyler <- estimateTyler(X)
norm(R_classical - R_cov, "F")
#> [1] 357.3742
norm(R_Tyler - R_cov, "F")
#> [1] 436.8639
We need to demean the data. Let’s use the sample mean \[\boldsymbol{\mu} = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t.\]
# let's remove the mean first
mu_classical <- colMeans(X)
norm(mu_classical - mu, "2")
#> [1] 1.865218
X_ <- X - rep(mu_classical, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.6053
norm(R_Tyler - R_cov, "F")
#> [1] 106.9575
The sample mean is not robust for heavy-tailed distributions, so we could use a robust estimator like the median.
# let's remove the median now
mu_median <- apply(X, 2, median)
norm(mu_median - mu, "2")
#> [1] 1.592109
X_ <- X - rep(mu_median, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.8546
norm(R_Tyler - R_cov, "F")
#> [1] 106.7559
We don’t observe any significant improvement from using the median instead of the mean. We can try a more sophisticated robust estimator of the mean: the geometric median (aka spatial median).
The geometric median of a discrete set of sample points in a Euclidean space is the point minimizing the sum of distances to the sample points: \[\underset{\boldsymbol{\mu}}{\text{minimize}}\quad \sum_{t=1}^T\|\mathbf{x}_t-\boldsymbol{\mu}\|.\] For the 1-dimensional case, it reduces to the traditional median.
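For intuition, the geometric median can be computed with a simple fixed-point iteration known as Weiszfeld’s algorithm; here is a minimal sketch (the ICSNP package used below may implement a more refined method):
# minimal Weiszfeld iteration for the geometric median (illustration only)
geometric_median <- function(X, max_iter = 100, tol = 1e-6) {
  mu <- colMeans(X)  # initialize with the sample mean
  for (k in 1:max_iter) {
    d <- pmax(sqrt(rowSums(sweep(X, 2, mu)^2)), 1e-10)  # distances ||x_t - mu||
    mu_new <- colSums(X/d) / sum(1/d)                   # weighted average with weights 1/d_t
    converged <- sqrt(sum((mu_new - mu)^2)) < tol
    mu <- mu_new
    if (converged) break
  }
  return(mu)
}
# usage: geometric_median(X)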
Let’s try the geometric median then.
library(ICSNP)
#> Loading required package: ICS
# let's remove the geometric median now
mu_gmedian <- ICSNP::spatial.median(X)
norm(mu_gmedian - mu, "2")
#> [1] 1.380634
X_ <- X - rep(mu_gmedian, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.8296
norm(R_Tyler - R_cov, "F")
#> [1] 106.8262
The estimation of \(\boldsymbol{\mu}\) has been significantly improved, but the estimation of the covariance matrix \(\boldsymbol{\Sigma}\) does not seem to improve.
Until now, we have first estimated the mean vector (with estimators like the sample mean, the median, and the geometric median) and then the covariance matrix (with estimators like the sample covariance matrix and Tyler’s estimator). In principle, this two-step procedure does not have to be optimal. We could instead jointly estimate the mean vector and the covariance matrix under a heavy-tailed distribution. The algorithm becomes a bit more involved and has already been implemented in the R package fitHeavyTail, which we will use.
library(fitHeavyTail) # for Student-t estimation
# generate heavy-tailed synthetic return data
nu <- 4
N <- 200
T <- 1000
mu <- rnorm(N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)
# estimate the mean vector
mu_mean <- colMeans(X)
mu_median <- apply(X, 2, median)
mu_gmedian <- ICSNP::spatial.median(X)
norm(mu_mean - mu, "2")
#> [1] 1.7901
norm(mu_median - mu, "2")
#> [1] 1.717805
norm(mu_gmedian - mu, "2")
#> [1] 1.315567
# estimate the covariance matrix
R_SCM <- cov(X)
norm(R_SCM - R_cov, "F")
#> [1] 163.7383
X_ <- X - rep(mu_gmedian, each = T)
R_Tyler <- estimateTyler(X_)
norm(R_Tyler - R_cov, "F")
#> [1] 94.88541
# joint estimation of mean vector and covariance matrix
res_Student_t <- fit_mvt(X)
norm(res_Student_t$mu - mu, "2")
#> [1] 1.230787
norm(res_Student_t$cov - R_cov, "F")
#> [1] 98.62728
Basically, the estimation of the mean vector is greatly improved by the joint estimation, but the estimation of the covariance matrix does not change much.
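As a side note, the joint fit also estimates the degrees of freedom \(\nu\) of the Student-\(t\) distribution (assuming it is returned in the nu element of the fitted object); it should be close to the value \(\nu=4\) used to generate the data:
# estimated degrees of freedom (the data was generated with nu = 4)
res_Student_t$nu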
In the previous examples, the number of observations was much larger than the dimensionality, i.e., \(T\gg N\). When this is not the case, the estimation becomes worse and we may not even be able to use the sophisticated estimators for \(T<N\). This is the so-called small sample regime. An effective solution to this problem is to shrink the estimator towards a given target. Even in the absence of a reasonable target, one can always shrink the covariance matrix towards the scaled identity matrix. For example, the sample covariance matrix \(\boldsymbol{\Sigma}=\frac{1}{T-1}\mathbf{X}^T\mathbf{X}\) becomes \[\boldsymbol{\Sigma} = (1-\alpha)\frac{1}{T-1}\mathbf{X}^T\mathbf{X} + \alpha\boldsymbol{\Sigma}_{target}, \] where \(\boldsymbol{\Sigma}_{target}\) is the target matrix and \(\alpha\in[0,1]\) is the shrinkage factor.
# generate small-regime heavy-tailed synthetic return data
set.seed(357)
nu <- 4
N <- 200
T <- 150
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)
# sample covariance matrix without shrinkage
R_SCM <- (1/T) * t(X) %*% X
norm(R_SCM - R_cov, "F")
#> [1] 504.3114
# sample covariance matrix with shrinkage
R_target <- mean(diag(cov(X))) * diag(N) # shrinkage target
alpha <- 0.5 #shrinkage factor
R_SCM_shrinked <- (1-alpha)*R_SCM + alpha*R_target
norm(R_SCM_shrinked - R_cov, "F")
#> [1] 283.0223
It is astounding how much improvement one can achieve with shrinkage, even without properly choosing the target matrix and the shrinkage factor!
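To get a feeling for the effect of the shrinkage factor, we can sweep \(\alpha\) over \([0,1]\) and look at the resulting error (a quick sketch reusing the variables defined above):
# error of the shrunk SCM as a function of the shrinkage factor
alpha_sweep <- seq(0, 1, by = 0.05)
error_sweep <- sapply(alpha_sweep, function(a) norm((1-a)*R_SCM + a*R_target - R_cov, "F"))
plot(alpha_sweep, error_sweep, type = "l", col = "blue", lwd = 2,
     xlab = "shrinkage factor", ylab = "error", main = "Estimation error vs shrinkage factor")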
Let’s refine the Tyler estimator with shrinkage as well.
# let's define the Tyler estimator with shrinkage
estimateTylerShrinkage <- function(X, R_target, alpha, verbose = FALSE) {
  T <- nrow(X)  # number of observations
  N <- ncol(X)  # number of variables
  max_iter <- 100
  error_th_Sigma <- 1e-3
  # Gaussian initial point
  Sigma <- 1/(1+alpha)*cov(X) + alpha/(1+alpha)*R_target  # <--- this line is new
  Sigma <- Sigma/sum(diag(Sigma))
  # loop
  obj_value_record <- Sigma_diff_record <- rep(NA, max_iter)
  for (k in 1:max_iter) {
    Sigma_prev <- Sigma
    # Tyler update
    weights <- 1/rowSums(X * (X %*% solve(Sigma)))  # 1/diag( X %*% inv(Sigma) %*% t(X) )
    obj_value_record[k] <- - (N/2)*sum(log(weights)) + (T/2)*sum(log(eigen(Sigma)$values))
    Sigma <- (N/T) * crossprod( sqrt(weights)*X )  # (N/T) * t(X) %*% diag(weights) %*% X
    Sigma <- 1/(1+alpha)*Sigma + alpha/(1+alpha)*R_target  # <--- this line is new
    Sigma <- Sigma/sum(diag(Sigma))
    # stopping criterion
    Sigma_diff_record[k] <- norm(Sigma - Sigma_prev, "F")/norm(Sigma_prev, "F")
    if (Sigma_diff_record[k] < error_th_Sigma)
      break
  }
  obj_value_record <- obj_value_record[1:k]
  Sigma_diff_record <- Sigma_diff_record[1:k]
  if (verbose)
    plot(obj_value_record, type = "l", col = "blue",
         main = "Convergence of objective value", xlab = "iterations", ylab = "obj value")
  # finally, recover the missing scaling factor
  sigma2 <- apply(X, 2, var)
  d <- diag(Sigma)
  kappa <- sum(sigma2*d)/sum(d*d)
  Sigma <- kappa*Sigma
  return(Sigma)
}
# recall the error of the sample covariance matrix
R_SCM <- (1/T) * t(X) %*% X
norm(R_SCM - R_cov, "F")
#> [1] 504.3114
# recall the error of the sample covariance matrix with shrinkage
R_target <- mean(diag(cov(X))) * diag(N) # shrinkage target
alpha <- 0.5 #shrinkage factor
R_SCM_shrinked <- (1-alpha)*R_SCM + alpha*R_target
norm(R_SCM_shrinked - R_cov, "F")
#> [1] 283.0223
# now Tyler with shrinkage
R_Tyler_shrinked <- estimateTylerShrinkage(X, R_target, alpha)
norm(R_Tyler_shrinked - R_cov, "F")
#> [1] 237.4451
Finally, let’s compare all the previous estimators of the mean vector and the covariance matrix as a function of the number of observations \(T\).
library(mvtnorm) # for heavy-tailed generation
library(rbokeh) # for nice plots
library(fitHeavyTail) # for Student-t estimation
# parameters
set.seed(357)
nu <- 4
N <- 40
mu <- rnorm(N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
R_scale <- (nu-2)/nu*R_cov # R_cov=nu/(nu-2)*R_scale
# main loop
T_sweep <- seq(from = 15, by = 20, to = 800)
error_mu_mean <- error_mu_median <- error_mu_gmedian <- error_mu_Student_t <- rep(NA, length(T_sweep))
error_R_Tyler <- error_R_Student_t <- error_R_SCM <- rep(NA, length(T_sweep))
error_R_Tyler_shrinked <- error_R_SCM_shrinked <- rep(NA, length(T_sweep))
for (k in 1:length(T_sweep)) {
  T <- T_sweep[k]
  # generate heavy-tailed data
  X <- rmvt(n = T, delta = mu, sigma = R_scale, df = nu)
  # estimation of mu
  mu_mean <- colMeans(X)
  mu_median <- apply(X, 2, median)
  mu_gmedian <- ICSNP::spatial.median(X)
  error_mu_mean[k] <- norm(mu_mean - mu, "2")
  error_mu_median[k] <- norm(mu_median - mu, "2")
  error_mu_gmedian[k] <- norm(mu_gmedian - mu, "2")
  # estimation of R
  # SCM
  R_SCM <- cov(X)
  error_R_SCM[k] <- norm(R_SCM - R_cov, "F")
  # Tyler
  X_ <- X - rep(mu_gmedian, each = T)
  if (T > N) {
    R_Tyler <- estimateTyler(X_)
    error_R_Tyler[k] <- norm(R_Tyler - R_cov, "F")
  }
  # Student-t
  if (T > N) {
    res_Student_t <- fit_mvt(X)
    mu_Student_t <- res_Student_t$mu
    R_Student_t <- res_Student_t$cov
    error_R_Student_t[k] <- norm(R_Student_t - R_cov, "F")
    error_mu_Student_t[k] <- norm(mu_Student_t - mu, "2")
  }
  # shrinkage estimators
  alpha <- min(2*N/T, 1)
  R_target <- mean(diag(cov(X))) * diag(N)
  R_SCM_shrinked <- (1-alpha)*cov(X) + alpha*R_target
  error_R_SCM_shrinked[k] <- norm(R_SCM_shrinked - R_cov, "F")
  alpha <- min(0.01*N/T, 1)
  R_Tyler_shrinked <- estimateTylerShrinkage(X_, R_target, alpha)
  error_R_Tyler_shrinked[k] <- norm(R_Tyler_shrinked - R_cov, "F")
}
# plots
figure(width = 800, title = sprintf("Estimation error of mu (N = %d)", N),
       xlab = "T", ylab = "error", legend_location = "top_right") %>%
  theme_title(text_font_size = "14pt") %>%
  ly_lines(T_sweep, error_mu_mean, color = "blue", width = 2, legend = "mean") %>%
  ly_lines(T_sweep, error_mu_median, color = "orange", width = 2, legend = "median") %>%
  ly_lines(T_sweep, error_mu_gmedian, color = "green", width = 2, legend = "gmedian") %>%
  ly_lines(T_sweep, error_mu_Student_t, color = "purple", width = 2, legend = "Student-t")
figure(width = 800, title = sprintf("Estimation error of covariance matrix (N = %d)", N),
       xlab = "T", ylab = "error", legend_location = "top_right") %>%
  theme_title(text_font_size = "14pt") %>%
  ly_lines(T_sweep, error_R_SCM, color = "blue", width = 2, legend = "SCM") %>%
  ly_lines(T_sweep, error_R_SCM_shrinked, color = "blue", width = 2, type = 2,
           legend = "SCM + shrinkage") %>%
  ly_lines(T_sweep, error_R_Tyler, color = "orange", width = 2, legend = "Tyler") %>%
  ly_lines(T_sweep, error_R_Tyler_shrinked, color = "orange", width = 2, type = 2,
           legend = "Tyler + shrinkage") %>%
  ly_lines(T_sweep, error_R_Student_t, color = "purple", width = 2, legend = "Student-t")
The final conclusion is clear: use a heavy-tailed estimator! And even better, estimate the mean vector and the covariance matrix jointly (see the R package fitHeavyTail).