This R session illustrates the need for heavy-tailed estimators for financial data (the R package fitHeavyTail is specifically designed for that purpose).

(Useful R links: Cookbook R, Quick-R, R documentation, CRAN, METACRAN.)

Is financial data heavy-tailed?

Let’s load the S&P 500 index:

library(quantmod)
GSPC <- getSymbols("^GSPC", from = "2010-01-01", to = "2015-12-31", auto.assign = FALSE)
prices <- Cl(GSPC)
head(log(prices))
#>            GSPC.Close
#> 2010-01-04   7.032615
#> 2010-01-05   7.035726
#> 2010-01-06   7.036272
#> 2010-01-07   7.040265
#> 2010-01-08   7.043142
#> 2010-01-11   7.044888
tail(log(prices))
#>            GSPC.Close
#> 2015-12-22   7.620200
#> 2015-12-23   7.632542
#> 2015-12-24   7.630942
#> 2015-12-28   7.628761
#> 2015-12-29   7.639334
#> 2015-12-30   7.632091

Let’s do some plotting:

# plot log-prices
plot(log(prices), col = 'blue', lwd = 2, ylab = "log-price", main = "S&P 500 index")

# plot log-returns
R_daily <- diff(log(prices))[-1]
plot(R_daily, col = 'blue', lwd = 1, ylab = "log-return", main = "S&P 500 index")

Now, let’s see how heavy the tails are:

# histograms to illustrate heavy-tails and gain/loss asymmetry
h <- hist(as.vector(R_daily), breaks = 100, prob = TRUE, col = "lightgray", 
          xlab = "return", main = "Histogram of log-returns")
xfit <- seq(min(R_daily), max(R_daily), length = 100) 
yfit <- dnorm(xfit, mean = mean(R_daily), sd = sd(R_daily))
lines(xfit, yfit, col = "blue", lwd=2)

# QQ-plot
qqnorm(R_daily, col="blue", main="QQ plot of log-returns")
qqline(R_daily, lwd=2)
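
Besides the visual evidence, a quick numeric check is the sample excess kurtosis, which is zero for Gaussian data. The following lines are a small sketch added here, not part of the original plots:

# sample excess kurtosis of the daily log-returns (illustrative sketch)
# (zero for Gaussian data; a large positive value indicates heavy tails)
x <- as.vector(R_daily)
mean((x - mean(x))^4) / mean((x - mean(x))^2)^2 - 3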

Both the histogram and the QQ plot show that financial data is clearly heavy-tailed. Therefore, traditional sample estimators are likely to perform poorly, as we will explore next.

Classical estimators

Let’s warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: \[ \begin{aligned} \boldsymbol{\mu} & = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\\ \boldsymbol{\Sigma} & = \frac{1}{T-1}\sum_{t=1}^T(\mathbf{x}_t-\boldsymbol{\mu})(\mathbf{x}_t-\boldsymbol{\mu})^T. \end{aligned} \]

# generate Gaussian synthetic return data
library(mvtnorm)
set.seed(357)
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvnorm(n = T, mean = mu, sigma = R_cov)
str(X)
#>  num [1:1000, 1:200] 0.86 -3.206 2.803 -7.817 0.402 ...

# classical mean estimator (sample mean)
mu_classical <- colMeans(X)
norm(mu_classical - mu, "2")
#> [1] 1.655746

# classical covariance estimator (sample covariance matrix)
R_classical <- cov(X)
norm(R_classical - R_cov, "F")
#> [1] 97.09152

Data with outliers

Now, let’s add some outliers to the data and see how significantly the performance degrades.

library(ellipse)
set.seed(42)

# introduce outliers
num_outliers <- round(0.05*T)
X_outliers <- X
t_indices <- sample(T, num_outliers)
n_indices <- sample(N, num_outliers)
ave_std <- sqrt(mean(diag(R_cov)))
X_outliers[cbind(t_indices, n_indices)] <- 10*ave_std*X_outliers[cbind(t_indices, n_indices)]

# scatter plot
i1 <- n_indices[1]
i2 <- n_indices[2]
plot(X[, i1], X[, i2], 
     main = "Scatter plot of Gaussian returns", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(cov(X[, c(i1, i2)])), col = "black", lwd = 2)
lines(ellipse(cov(X_outliers[, c(i1, i2)])), col = "blue", lwd = 2)

plot(X_outliers[, i1], X_outliers[, i2], 
     main = "Scatter plot of Gaussian returns with outliers", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(cov(X[, c(i1, i2)])), col = "black", lwd = 2)
lines(ellipse(cov(X_outliers[, c(i1, i2)])), col = "blue", lwd = 2)

# classical estimators (sample mean and sample covariance matrix)
mu_classical <- colMeans(X_outliers)
norm(mu_classical - mu, "2")
#> [1] 2.006611
R_classical <- cov(X_outliers)
norm(R_classical - R_cov, "F")
#> [1] 362.3363

Heavy-tailed data

Let’s now generate heavy-tailed synthetic data and see the performance of the classical sample estimators.

set.seed(42)

# generate heavy-tailed synthetic return data
nu <- 4  # degrees of freedom (measures how heavy the tail is)
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)

# scatter plot
tmp <- sample(N, 2)  #choose two assets randomly
i1 <- tmp[1]
i2 <- tmp[2]
plot(X[, i1], X[, i2], 
     main = "Scatter plot of heavy-tailed returns", xlab = "asset 1", ylab = "asset 2",
     col = rgb(0, 100, 0, 100, maxColorValue = 255), pch = 16)
lines(ellipse(R_cov[c(i1,i2), c(i1,i2)]), col = "black", lwd = 2)
lines(ellipse(cov(X[, c(i1, i2)])), col = "blue", lwd = 2)

# classical estimators (sample mean and sample covariance matrix)
mu_classical <- colMeans(X)
norm(mu_classical - mu, "2")
#> [1] 1.479279
R_classical <- cov(X)
norm(R_classical - R_cov, "F")
#> [1] 154.7008

Estimation of \(\boldsymbol{\Sigma}\) via Tyler’s estimator (assuming zero mean)

Let’s now try Tyler’s estimator for heavy-tailed distributions. Assuming zero mean, the sample covariance matrix is \[ \boldsymbol{\Sigma} = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\mathbf{x}_t^T \] whereas Tyler’s estimator is the solution to the fixed-point equation \[ \boldsymbol{\Sigma} = \frac{N}{T}\sum_{t=1}^T\frac{\mathbf{x}_t\mathbf{x}_t^T}{\mathbf{x}_t^T\boldsymbol{\Sigma}^{-1}\mathbf{x}_t}, \] which requires an iterative algorithm in practice.

# sample covariance matrix (assuming zero mean)
R_classical <- (1/T) * t(X) %*% X

# let's define a function for Tyler's estimator
estimateTyler <- function(X, verbose = FALSE) {
  max_iter <- 100
  error_th_Sigma <- 1e-3

  #Gaussian initial point
  Sigma <- cov(X)  
  Sigma <- Sigma/sum(diag(Sigma))
  #loop
  obj_value_record <- Sigma_diff_record <- rep(NA, max_iter)
  for (k in 1:max_iter) {
    Sigma_prev <- Sigma

    #Tyler update
    weights <- 1/rowSums(X * (X %*% solve(Sigma)))   # 1/diag( X %*% inv(Sigma) %*% t(X) )
    obj_value_record[k] <- - (N/2)*sum(log(weights)) + (T/2)*sum(log(eigen(Sigma)$values))
    Sigma <- (N/T) * crossprod( sqrt(weights)*X )  # (N/T) * t(X) %*% diag(weights) %*% X
    Sigma <- Sigma/sum(diag(Sigma))

    #stopping criterion
    Sigma_diff_record[k] <- norm(Sigma - Sigma_prev, "F")/norm(Sigma_prev, "F")
    if (Sigma_diff_record[k] < error_th_Sigma)
      break
  }
  obj_value_record <- obj_value_record[1:k]
  Sigma_diff_record <- Sigma_diff_record[1:k]
  if (verbose)
    plot(obj_value_record, type = "b", col = "blue",
         main = "Convergence of objective value", xlab = "iterations", ylab = "obj value")

  #finally, recover missing scaling factor
  sigma2 <- apply(X, 2, var)
  d <- diag(Sigma)
  kappa <- sum(sigma2*d)/sum(d*d)
  Sigma <- kappa*Sigma
  
  return(Sigma)
}

R_Tyler <- estimateTyler(X, verbose = TRUE)

# estimation error
norm(R_classical - R_cov, "F")
#> [1] 154.5947
norm(R_Tyler - R_cov, "F")
#> [1] 94.29244

It is now clear that Tyler’s estimator is much better than the naive sample covariance matrix when the data is heavy-tailed. But what if the data were Gaussian? What would be the penalty for using Tyler’s estimator when the sample covariance matrix would have been perfectly fine?

# generate Gaussian synthetic return data
N <- 200
T <- 1000
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvnorm(n = T, mean = mu, sigma = R_cov)

# estimators
R_classical <- (1/T) * t(X) %*% X
R_Tyler     <- estimateTyler(X)

# estimation error
norm(R_classical - R_cov, "F")
#> [1] 94.96846
norm(R_Tyler     - R_cov, "F")
#> [1] 95.59725

Interestingly, we lose virtually nothing! So we might as well stick to heavy-tailed estimators all the time, just in case.

Estimation first of \(\boldsymbol{\mu}\) and then of \(\boldsymbol{\Sigma}\) via Tyler’s estimator

Let’s generate heavy-tailed data with a nonzero mean.

# generate heavy-tailed synthetic return data
nu <- 4  # degrees of freedom (measures how heavy the tail is)
N <- 200
T <- 1000
mu <- rnorm(N)  # nonzero mean!
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)

# estimators wrongly assuming zero mean
R_classical <- (1/T) * t(X) %*% X
R_Tyler     <- estimateTyler(X)
norm(R_classical - R_cov, "F")
#> [1] 357.3742
norm(R_Tyler     - R_cov, "F")
#> [1] 436.8639

We need to demean the data. Let’s use the sample mean \[\boldsymbol{\mu} = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t.\]

# let's remove the mean first
mu_classical <- colMeans(X)
norm(mu_classical - mu, "2")
#> [1] 1.865218
X_ <- X - rep(mu_classical, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler     <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.6053
norm(R_Tyler     - R_cov, "F")
#> [1] 106.9575

The sample mean is not robust for heavy-tailed distributions, so we could use a robust estimator like the median.

# let's remove the median now
mu_median <- apply(X, 2, median)
norm(mu_median - mu, "2")
#> [1] 1.592109
X_ <- X - rep(mu_median, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler     <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.8546
norm(R_Tyler     - R_cov, "F")
#> [1] 106.7559

We don’t observe any significant improvement from using the median instead of the mean. We can try a more sophisticated robust estimation of the mean: the geometric median (aka spatial median).

The geometric median of a discrete set of sample points in a Euclidean space is the point minimizing the sum of distances to the sample points: \[\underset{\boldsymbol{\mu}}{\text{minimize}}\quad \sum_{t=1}^T\|\mathbf{x}_t-\boldsymbol{\mu}\|\] For the 1-dimensional case, it becomes the traditional median.
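
For intuition, the geometric median can be computed with the classical Weiszfeld iteration, which repeatedly forms a weighted average of the samples with weights inversely proportional to their distance from the current estimate. A minimal sketch follows (for illustration only; below we use the tested implementation in the package ICSNP):

# minimal Weiszfeld iteration for the geometric median (illustrative sketch)
geometric_median <- function(X, max_iter = 100, tol = 1e-6) {
  mu <- colMeans(X)  # start from the sample mean
  for (k in 1:max_iter) {
    d <- sqrt(rowSums((X - rep(mu, each = nrow(X)))^2))  # distances ||x_t - mu||
    d <- pmax(d, 1e-10)                                   # guard against division by zero
    mu_new <- colSums(X / d) / sum(1 / d)                 # Weiszfeld update (weighted average)
    converged <- sqrt(sum((mu_new - mu)^2)) < tol
    mu <- mu_new
    if (converged) break
  }
  return(mu)
}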

Let’s try the geometric median then.

library(ICSNP)
#> Loading required package: ICS

# let's remove the geometric median now
mu_gmedian <- ICSNP::spatial.median(X)
norm(mu_gmedian - mu, "2")
#> [1] 1.380634
X_ <- X - rep(mu_gmedian, each = T)
R_classical <- (1/T) * t(X_) %*% X_
R_Tyler     <- estimateTyler(X_)
norm(R_classical - R_cov, "F")
#> [1] 266.8296
norm(R_Tyler     - R_cov, "F")
#> [1] 106.8262

The estimation of \(\boldsymbol{\mu}\) has improved significantly, but the estimation of the covariance matrix \(\boldsymbol{\Sigma}\) does not seem to improve.

Joint estimation of \(\boldsymbol{\mu}\) and \(\boldsymbol{\Sigma}\) via Student-t’s MLE

Until now, we have first estimated the mean vector (with estimators like the sample mean, median, and geometric median) and then the covariance matrix (with estimators like the sample covariance matrix and Tyler’s estimator). In principle, this two-step procedure need not be optimal. We could instead try a joint estimation of the mean vector and the covariance matrix under a heavy-tailed distribution. The algorithm becomes a bit more involved and has already been implemented in the R package fitHeavyTail, which we will use.
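
For intuition, the joint estimation under a Student-t distribution can be carried out with an EM-type iteration that alternates between computing per-sample weights (small for outliers) and weighted estimates of the mean and scatter matrix. Here is a minimal sketch assuming the degrees of freedom nu are known (fit_mvt() below is more sophisticated, e.g., it also estimates nu):

# minimal EM-type sketch of the joint Student-t MLE with nu fixed (illustration only)
fit_mvt_sketch <- function(X, nu = 4, max_iter = 100, tol = 1e-4) {
  T <- nrow(X); N <- ncol(X)
  mu <- colMeans(X)
  Sigma <- (nu-2)/nu * cov(X)  # initial scatter matrix
  for (k in 1:max_iter) {
    mu_prev <- mu
    # E-step: weights based on Mahalanobis distances (outliers get small weights)
    X_ <- X - rep(mu, each = T)
    delta <- rowSums(X_ * (X_ %*% solve(Sigma)))
    w <- (nu + N) / (nu + delta)
    # M-step: weighted mean and weighted scatter matrix
    mu <- colSums(w * X) / sum(w)
    X_ <- X - rep(mu, each = T)
    Sigma <- crossprod(sqrt(w) * X_) / T
    if (sqrt(sum((mu - mu_prev)^2)) < tol) break
  }
  return(list(mu = mu, cov = nu/(nu-2) * Sigma))  # covariance = nu/(nu-2) * scatter
}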

library(fitHeavyTail)  # for Student-t estimation
# generate heavy-tailed synthetic return data
nu <- 4
N <- 200
T <- 1000
mu <- rnorm(N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)

# estimate the mean vector
mu_mean    <- colMeans(X)
mu_median  <- apply(X, 2, median)
mu_gmedian <- ICSNP::spatial.median(X)
norm(mu_mean    - mu, "2")
#> [1] 1.7901
norm(mu_median  - mu, "2")
#> [1] 1.717805
norm(mu_gmedian - mu, "2")
#> [1] 1.315567

# estimate the covariance matrix
R_SCM <- cov(X)
norm(R_SCM - R_cov, "F")
#> [1] 163.7383
  
X_ <- X - rep(mu_gmedian, each = T)
R_Tyler <- estimateTyler(X_)
norm(R_Tyler - R_cov, "F")
#> [1] 94.88541

# joint estimation of mean vector and covariance matrix
res_Student_t <- fit_mvt(X)
norm(res_Student_t$mu  - mu, "2")
#> [1] 1.230787
norm(res_Student_t$cov - R_cov, "F")
#> [1] 98.62728

The estimation of the mean vector is greatly improved by the joint estimation, but the estimation of the covariance matrix does not change much.

Shrinkage for small sample regime

In the previous examples, the number of observations was much larger than the dimensionality, i.e., \(T\gg N\). When this is not the case, the estimation becomes worse and we may not even be able to use the sophisticated estimators for \(T<N\). This is the so-called small sample regime. An effective solution to this problem is to shrink the estimate towards a given target. Even in the absence of a reasonable target, one can always shrink the covariance matrix towards a scaled identity matrix. For example, the sample covariance matrix \(\boldsymbol{\Sigma}=\frac{1}{T-1}\mathbf{X}^T\mathbf{X}\) becomes \[\boldsymbol{\Sigma} = (1-\alpha)\frac{1}{T-1}\mathbf{X}^T\mathbf{X} + \alpha\boldsymbol{\Sigma}_{target}, \] where \(\boldsymbol{\Sigma}_{target}\) is the target matrix and \(\alpha\) is the shrinkage factor.

# generate small-regime heavy-tailed synthetic return data
set.seed(357)
nu <- 4
N <- 200
T <- 150
mu <- rep(0, N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
X <- rmvt(n = T, delta = mu, sigma = (nu-2)/nu*R_cov, df = nu)

# sample covariance matrix without shrinkage
R_SCM <- (1/T) * t(X) %*% X
norm(R_SCM - R_cov, "F")
#> [1] 504.3114

# sample covariance matrix with shrinkage
R_target <- mean(diag(cov(X))) * diag(N)  # shrinkage target
alpha <- 0.5  #shrinkage factor
R_SCM_shrinked <- (1-alpha)*R_SCM + alpha*R_target
norm(R_SCM_shrinked - R_cov, "F")
#> [1] 283.0223

It is astounding how much improvement one can achieve with shrinkage, even without properly choosing the target matrix and the shrinkage factor!
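
In fact, the shrinkage factor above was simply fixed by hand to \(\alpha=0.5\). In practice, it can be chosen in a data-driven way; for instance, the corpcor package (an assumption of this note, not used elsewhere in this session) provides a Schäfer-Strimmer-type shrinkage estimator with an analytically chosen intensity:

# data-driven shrinkage intensity via corpcor (illustrative; package assumed installed)
library(corpcor)
R_SCM_auto <- as.matrix(cov.shrink(X, verbose = FALSE))  # shrinkage intensity chosen automatically
norm(R_SCM_auto - R_cov, "F")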

Let’s refine the Tyler estimator with shrinkage as well.

# let's define the Tyler estimator with shrinkage
estimateTylerShrinkage <- function(X, R_target, alpha, verbose = FALSE) {
  max_iter <- 100
  error_th_Sigma <- 1e-3

  #Gaussian initial point
  Sigma <- 1/(1+alpha)*cov(X) + alpha/(1+alpha)*R_target  # <--- this line is new
  Sigma <- Sigma/sum(diag(Sigma))
  #loop
  obj_value_record <- Sigma_diff_record <- rep(NA, max_iter)
  for (k in 1:max_iter) {
    Sigma_prev <- Sigma

    #Tyler update
    weights <- 1/rowSums(X * (X %*% solve(Sigma)))   # 1/diag( X %*% inv(Sigma) %*% t(X) )
    obj_value_record[k] <- - (N/2)*sum(log(weights)) + (T/2)*sum(log(eigen(Sigma)$values))
    Sigma <- (N/T) * crossprod( sqrt(weights)*X )  # (N/T) * t(X) %*% diag(weights) %*% X
    Sigma <- 1/(1+alpha)*Sigma + alpha/(1+alpha)*R_target  # <--- this line is new
    Sigma <- Sigma/sum(diag(Sigma))

    #stopping criterion
    Sigma_diff_record[k] <- norm(Sigma - Sigma_prev, "F")/norm(Sigma_prev, "F")
    if (Sigma_diff_record[k] < error_th_Sigma)
      break
  }
  obj_value_record <- obj_value_record[1:k]
  Sigma_diff_record <- Sigma_diff_record[1:k]
  if (verbose)
    plot(obj_value_record, type = "l", col = "blue",
         main = "Convergence of objective value", xlab = "iterations", ylab = "obj value")

  #finally, recover missing scaling factor
  sigma2 <- apply(X, 2, var)
  d <- diag(Sigma)
  kappa <- sum(sigma2*d)/sum(d*d)
  Sigma <- kappa*Sigma
  
  return(Sigma)
}

# recall the error of the sample covariance matrix
R_SCM <- (1/T) * t(X) %*% X
norm(R_SCM - R_cov, "F")
#> [1] 504.3114

# recall the error of the sample covariance matrix with shrinkage
R_target <- mean(diag(cov(X))) * diag(N)  # shrinkage target
alpha <- 0.5  #shrinkage factor
R_SCM_shrinked <- (1-alpha)*R_SCM + alpha*R_target
norm(R_SCM_shrinked - R_cov, "F")
#> [1] 283.0223

# now Tyler with shrinkage
R_Tyler_shrinked <- estimateTylerShrinkage(X, R_target, alpha)
norm(R_Tyler_shrinked - R_cov, "F")
#> [1] 237.4451

Final comparison of all methods

library(mvtnorm)  # for heavy-tailed generation
library(rbokeh)  # for nice plots
#> Registered S3 method overwritten by 'pryr':
#>   method      from
#>   print.bytes Rcpp
library(fitHeavyTail)  # for Student-t estimation

# parameters
set.seed(357)
nu <- 4
N <- 40
mu <- rnorm(N)
U <- t(rmvnorm(n = round(0.7*N), sigma = 0.1*diag(N)))
R_cov <- U %*% t(U) + diag(N)
R_scale <- (nu-2)/nu*R_cov   # R_cov=nu/(nu-2)*R_scale

# main loop
T_sweep <- seq(from = 15, by = 20, to = 800)
error_mu_mean <- error_mu_median <- error_mu_gmedian <- error_mu_Student_t <- rep(NA, length(T_sweep))
error_R_Tyler <- error_R_Student_t <- error_R_SCM <- rep(NA, length(T_sweep))
error_R_Tyler_shrinked  <- error_R_SCM_shrinked <- rep(NA, length(T_sweep))
for(k in 1:length(T_sweep)) {
  T <- T_sweep[k]
  
  # generate heavy-tailed data
  X <- rmvt(n = T, delta = mu, sigma = R_scale, df = nu)
  
  # estimation of mu
  mu_mean    <- colMeans(X)
  mu_median  <- apply(X, 2, median)
  mu_gmedian <- ICSNP::spatial.median(X)
  error_mu_mean[k]    <- norm(mu_mean - mu, "2")
  error_mu_median[k]  <- norm(mu_median - mu, "2")
  error_mu_gmedian[k] <- norm(mu_gmedian - mu, "2")
  
  # estimation of R
  # SCM
  R_SCM <- cov(X)
  error_R_SCM[k] <- norm(R_SCM - R_cov, "F")
  
  # Tyler
  X_ <- X - rep(mu_gmedian, each = T)
  if (T>N) {
    R_Tyler <- estimateTyler(X_)
    error_R_Tyler[k] <- norm(R_Tyler - R_cov, "F")
  }

  # Student-t
  if (T>N) {
    res_Student_t <- fit_mvt(X)
    mu_Student_t <- res_Student_t$mu
    R_Student_t  <- res_Student_t$cov
    error_R_Student_t[k]  <- norm(R_Student_t  - R_cov, "F")
    error_mu_Student_t[k] <- norm(mu_Student_t - mu, "2")
  }

  # shrinkage estimators
  alpha <- min(2*N/T, 1)
  R_target <- mean(diag(cov(X))) * diag(N)
  R_SCM_shrinked <- (1-alpha)*cov(X) + alpha*R_target
  error_R_SCM_shrinked[k] <- norm(R_SCM_shrinked - R_cov, "F")
  alpha <- min(0.01*N/T, 1)
  R_Tyler_shrinked <- estimateTylerShrinkage(X_, R_target, alpha)
  error_R_Tyler_shrinked[k] <- norm(R_Tyler_shrinked - R_cov, "F")
}

# plots
figure(width = 800, title = sprintf("Estimation error of mu (N = %d)", N), 
       xlab="T", ylab="error", legend_location = "top_right") %>%
  theme_title(text_font_size = "14pt") %>%
  ly_lines(T_sweep, error_mu_mean, color = "blue", width = 2, legend = "mean")  %>%
  ly_lines(T_sweep, error_mu_median, color = "orange", width = 2, legend = "median") %>%
  ly_lines(T_sweep, error_mu_gmedian, color = "green", width = 2, legend = "gmedian") %>%
  ly_lines(T_sweep, error_mu_Student_t, color = "purple", width = 2, legend = "Student-t")
 figure(width = 800, title = sprintf("Estimation error of covariance matrix (N = %d)", N),
        xlab="T", ylab="error", legend_location = "top_right") %>%
   theme_title(text_font_size = "14pt") %>%
   ly_lines(T_sweep, error_R_SCM, color = "blue", width = 2, legend = "SCM")  %>%
   ly_lines(T_sweep, error_R_SCM_shrinked, color = "blue", width = 2, type = 2, 
            legend = "SCM + shrinkage")  %>%
   ly_lines(T_sweep, error_R_Tyler, color = "orange", width = 2, legend = "Tyler") %>%
   ly_lines(T_sweep, error_R_Tyler_shrinked, color = "orange", width = 2, type = 2, 
            legend = "Tyler + shrinkage") %>%
   ly_lines(T_sweep, error_R_Student_t, color = "purple", width = 2, legend = "Student-t")

The final conclusion is clear: use a heavy-tailed estimator! Even better, estimate the mean vector and the covariance matrix jointly (see the R package fitHeavyTail).