This R session will illustrate the practical implementation and use of factor models.
The R package covFactorModel, available on GitHub, is recommended.
(Useful R links: Cookbook R, Quick-R, R documentation, CRAN, METACRAN.)
We will start with a simple example consisting of a single known factor (i.e., the market index). The model is \[ \mathbf{x}_t = \boldsymbol{\alpha} + \boldsymbol{\beta}f_t + \boldsymbol{\epsilon}_t, \quad t=1,\ldots,T \] where the explicit factor \(f_t\) is the S&P 500 index. We will do a simple least squares (LS) regression to estimate the intercept \(\boldsymbol{\alpha}\) and the loading beta \(\boldsymbol{\beta}\): \[ \underset{\boldsymbol{\alpha},\boldsymbol{\beta}}{\text{minimize}}\quad \sum_{t=1}^T\|\mathbf{x}_t - \boldsymbol{\alpha} - \boldsymbol{\beta}f_t\|^2 \]
Most of the code lines go into preparing the data rather than into actually performing the factor modeling. Let’s start by getting the data ready:
library(xts)
library(quantmod)
# set begin-end date and stock namelist (3 stocks from each of 3 sectors)
begin_date <- "2016-01-01"
end_date <- "2017-12-31"
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "AEZS", "A", "APD", "AA", "CF")
sector_namelist <- c(rep("Information Technology", 3), rep("Health Care", 3), rep("Materials", 3))
# download the adjusted close prices from Yahoo Finance and merge them into a
# single xts object, one column per stock (avoids growing an object inside a loop)
data_set <- do.call(cbind, lapply(stock_namelist, function(ticker)
  Ad(getSymbols(ticker, from = begin_date, to = end_date, auto.assign = FALSE))))
colnames(data_set) <- stock_namelist
tclass(data_set) <- "Date"  # indexClass<- is deprecated in xts; tclass<- is its replacement
str(data_set)
#> An 'xts' object on 2016-01-04/2017-12-29 containing:
#> Data: num [1:503, 1:9] 98.7 96.3 94.4 90.4 90.9 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr [1:9] "AAPL" "AMD" "ADI" "ABBV" ...
#> Indexed by objects of class: [Date] TZ: UTC
#> xts Attributes:
#> NULL
head(data_set)
#> AAPL AMD ADI ABBV AEZS A APD AA CF
#> 2016-01-04 98.74225 2.77 49.99239 49.46063 4.40 39.35598 107.89010 23.00764 35.13227
#> 2016-01-05 96.26781 2.75 49.62508 49.25457 4.21 39.22057 105.96097 21.96506 34.03059
#> 2016-01-06 94.38389 2.51 47.51298 49.26315 3.64 39.39467 103.38042 20.40121 31.08988
#> 2016-01-07 90.40047 2.28 46.30082 49.11721 3.29 37.72138 99.91463 19.59558 29.61520
#> 2016-01-08 90.87848 2.14 45.89677 47.77789 3.29 37.32482 99.39687 19.12169 29.33761
#> 2016-01-11 92.35001 2.34 46.98954 46.25827 3.13 36.69613 99.78938 18.95583 28.14919
tail(data_set)
#> AAPL AMD ADI ABBV AEZS A APD AA CF
#> 2017-12-21 170.3791 10.89 85.53941 90.49536 2.70 66.46960 156.0460 48.99 39.31251
#> 2017-12-22 170.3791 10.54 85.73240 90.77264 2.45 66.30225 156.2947 49.99 39.75090
#> 2017-12-26 166.0566 10.46 85.52011 90.34747 2.33 66.20380 155.8930 50.38 40.83736
#> 2017-12-27 166.0858 10.53 85.97363 90.66173 2.36 66.25303 156.5337 51.84 41.04702
#> 2017-12-28 166.5531 10.55 86.24380 90.38444 2.41 66.40069 157.4423 54.14 40.60863
#> 2017-12-29 164.7521 10.28 85.90609 89.38623 2.36 66.07411 157.8368 53.87 40.54192
# download the S&P 500 index (ticker ^GSPC) and keep its adjusted close
# as the explicit market factor
GSPC_raw <- getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE)
SP500_index <- Ad(GSPC_raw)
colnames(SP500_index) <- "index"
head(SP500_index)
#> index
#> 2016-01-04 2012.66
#> 2016-01-05 2016.71
#> 2016-01-06 1990.26
#> 2016-01-07 1943.09
#> 2016-01-08 1922.03
#> 2016-01-11 1923.67
plot(SP500_index)
# log-returns of the stocks (X) and of the S&P 500 index (explicit factor f);
# na.pad = FALSE drops the leading NA produced by differencing
X <- diff(log(data_set), na.pad = FALSE)
f <- diff(log(SP500_index), na.pad = FALSE)
N <- ncol(X)  # number of stocks
T <- nrow(X)  # number of days (note: this masks base R's T alias for TRUE)
str(X)
#> An 'xts' object on 2016-01-05/2017-12-29 containing:
#> Data: num [1:502, 1:9] -0.02538 -0.01976 -0.04312 0.00527 0.01606 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr [1:9] "AAPL" "AMD" "ADI" "ABBV" ...
#> Indexed by objects of class: [Date] TZ: UTC
#> xts Attributes:
#> NULL
str(f)
#> An 'xts' object on 2016-01-05/2017-12-29 containing:
#> Data: num [1:502, 1] 0.00201 -0.013202 -0.023986 -0.010898 0.000853 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr "index"
#> Indexed by objects of class: [Date] TZ: UTC
#> xts Attributes:
#> List of 2
#> $ src : chr "yahoo"
#> $ updated: POSIXct[1:1], format: "2019-09-25 12:50:13"
Now we are ready to do the factor model fitting. The solution to the previous LS fitting is \[ \begin{aligned} \hat{\boldsymbol{\beta}} & = {\sf cov}(\mathbf{x}_t,f_t)/{\sf var}(f_t)\\ \hat{\boldsymbol{\alpha}} & = \bar{\mathbf{x}} - \hat{\boldsymbol{\beta}}\bar{f}\\ \hat{\boldsymbol{\epsilon}}_i & = \mathbf{x}_i - \alpha_i\mathbf{1} - \beta_i\mathbf{f}, \quad i=1,\ldots,N\\ \hat{\sigma}_i^2 & = \frac{1}{T-2}\hat{\boldsymbol{\epsilon}}_i^T\hat{\boldsymbol{\epsilon}}_i, \quad \hat{\boldsymbol{\Psi}} = \sf{diag}(\hat{\sigma}_1^2, \ldots, \hat{\sigma}_N^2)\\ \hat{\boldsymbol{\Sigma}} & = {\sf var}(f_t)\hat{\boldsymbol{\beta}}\hat{\boldsymbol{\beta}}^T + \hat{\boldsymbol{\Psi}} \end{aligned} \]
which can be readily implemented in R as follows:
# LS estimates of the single-factor model:
#   beta_i = cov(x_i, f)/var(f),  alpha_i = mean(x_i) - beta_i * mean(f)
var_f <- as.numeric(var(f))  # hoist the (scalar) factor variance out of the loop
beta <- cov(X, f) / var_f
alpha <- colMeans(X) - beta * colMeans(f)
# residual variances with T-2 degrees of freedom (alpha and beta were estimated)
sigma2 <- rep(NA_real_, N)
for (i in seq_len(N)) {
  eps_i <- X[, i] - alpha[i] - beta[i] * f
  sigma2[i] <- (1/(T-2)) * sum(eps_i^2)
}
Psi <- diag(sigma2)  # diagonal residual covariance matrix
Sigma <- var_f * beta %*% t(beta) + Psi  # low-rank + diagonal covariance estimate
print(alpha)
#> index
#> AAPL 0.0003999086
#> AMD 0.0013825599
#> ADI 0.0003609968
#> ABBV 0.0006684632
#> AEZS -0.0022091301
#> A 0.0002810616
#> APD 0.0001786375
#> AA 0.0006429140
#> CF -0.0006029705
print(beta)
#> index
#> AAPL 1.0957919
#> AMD 2.1738304
#> ADI 1.2683047
#> ABBV 0.9022748
#> AEZS 1.7115761
#> A 1.3277212
#> APD 1.0239453
#> AA 1.8593524
#> CF 1.5702493
Alternatively, we can do the fitting using a more compact matrix notation (recall time is along the vertical axis in \(\mathbf{X}\)) \[ \mathbf{X}^T = \boldsymbol{\alpha}\mathbf{1}^T + \boldsymbol{\beta}\mathbf{f}^T + \mathbf{E}^T \] and then write the LS fitting as \[ \underset{\boldsymbol{\alpha},\boldsymbol{\beta}}{\text{minimize}}\quad \|\mathbf{X}^T - \boldsymbol{\alpha}\mathbf{1}^T - \boldsymbol{\beta}\mathbf{f}^T\|^2_F. \] More conveniently, we can define \(\boldsymbol{\Gamma} = [ \boldsymbol{\alpha}, \boldsymbol{\beta} ]\) and the extended factors \(\tilde{\mathbf{F}} = [\mathbf{1}, \mathbf{f}]\). The LS formulation can then be written as \[ \underset{\boldsymbol{\Gamma}}{\text{minimize}}\quad \|\mathbf{X}^T - \boldsymbol{\Gamma}\tilde{\mathbf{F}}^T\|^2_F \] with solution \[ \boldsymbol{\Gamma} = \mathbf{X}^T \tilde{\mathbf{F}} (\tilde{\mathbf{F}}^T \tilde{\mathbf{F}})^{-1} \]
# extended factor matrix F_ = [1, f] so that Gamma = [alpha, beta]
F_ <- cbind(ones = 1, f)
# solve the normal equations directly instead of forming an explicit inverse
# (numerically more stable and cheaper than X' F (F'F)^{-1})
Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
colnames(Gamma) <- c("alpha", "beta")
alpha <- Gamma[, "alpha"]
beta <- Gamma[, "beta"]
print(Gamma)
#> alpha beta
#> AAPL 0.0003999086 1.0957919
#> AMD 0.0013825599 2.1738304
#> ADI 0.0003609968 1.2683047
#> ABBV 0.0006684632 0.9022748
#> AEZS -0.0022091301 1.7115761
#> A 0.0002810616 1.3277212
#> APD 0.0001786375 1.0239453
#> AA 0.0006429140 1.8593524
#> CF -0.0006029705 1.5702493
E <- xts(t(t(X) - Gamma %*% t(F_)), index(X))  # residuals
Psi <- (1/(T-2)) * t(E) %*% E  # residual covariance (only its diagonal is kept below)
Sigma <- as.numeric(var(f)) * beta %o% beta + diag(diag(Psi))
As expected, we get the same result regardless of the notation used. Alternatively, we can simply use the R package covFactorModel to do the work for us:
library(covFactorModel)
#> Registered S3 method overwritten by 'xts':
#> method from
#> as.zoo.xts zoo
# macroeconomic (explicit) factor model with the index as the single factor
one_factor_fit <- factorModel(X, type = "M", econ_fact = f)
cbind(alpha = one_factor_fit$alpha, beta = one_factor_fit$beta)
#> alpha index
#> AAPL 0.0003999086 1.0957919
#> AMD 0.0013825599 2.1738304
#> ADI 0.0003609968 1.2683047
#> ABBV 0.0006684632 0.9022748
#> AEZS -0.0022091301 1.7115761
#> A 0.0002810616 1.3277212
#> APD 0.0001786375 1.0239453
#> AA 0.0006429140 1.8593524
#> CF -0.0006029705 1.5702493
It is interesting to visualize the estimated covariance matrix of the log-returns \(\boldsymbol{\Sigma} = {\sf var}(f_t)\boldsymbol{\beta}\boldsymbol{\beta}^T + {\sf Diag}(\boldsymbol{\Psi})\), as well as that of the residuals \(\boldsymbol{\Psi}\).
Let’s start with the covariance matrix of the log-returns:
library(corrplot)  # install.packages("corrplot")
# correlation structure implied by the 1-factor covariance estimate
corrplot(cov2cor(Sigma),
         mar = c(0, 0, 1, 0),
         main = "Covariance matrix of log-returns from 1-factor model")
We can observe that all the stocks are highly correlated, which is the effect of the market factor. In order to inspect finer details of the interdependency of the stocks, we should first remove the market or, equivalently, plot \(\boldsymbol{\Psi}\):
# residual correlations, hierarchically clustered into 3 groups
corrplot(cov2cor(Psi),
         mar = c(0, 0, 1, 0),
         order = "hclust", addrect = 3,
         main = "Covariance matrix of residuals")
cbind(stock_namelist, sector_namelist)  # recall which sector each stock belongs to
#> stock_namelist sector_namelist
#> [1,] "AAPL" "Information Technology"
#> [2,] "AMD" "Information Technology"
#> [3,] "ADI" "Information Technology"
#> [4,] "ABBV" "Health Care"
#> [5,] "AEZS" "Health Care"
#> [6,] "A" "Health Care"
#> [7,] "APD" "Materials"
#> [8,] "AA" "Materials"
#> [9,] "CF" "Materials"
Interestingly, we can observe that the automatic clustering performed on \(\boldsymbol{\Psi}\) correctly identifies the sectors of the stocks.
In this example, we will assess the performance of several investment funds based on factor models. We will use the S&P 500 as the explicit market factor and assume a zero risk-free return \(r_f=0\). In particular, we will consider the following six Exchange-Traded Funds (ETFs):
We start by loading the data:
library(xts)
library(quantmod)
# set begin-end date and ETF namelist
begin_date <- "2016-10-01"
end_date <- "2017-06-30"
stock_namelist <- c("SPY", "XIVH", "SPHB", "SPLV", "USMV", "JKD")
# download the adjusted close prices from Yahoo Finance and merge them into a
# single xts object, one column per ETF (avoids growing an object inside a loop)
data_set <- do.call(cbind, lapply(stock_namelist, function(ticker)
  Ad(getSymbols(ticker, from = begin_date, to = end_date, auto.assign = FALSE))))
colnames(data_set) <- stock_namelist
tclass(data_set) <- "Date"  # indexClass<- is deprecated in xts; tclass<- is its replacement
head(data_set)
#> SPY XIVH SPHB SPLV USMV JKD
#> 2016-10-03 203.6610 29.400 31.38322 38.55683 42.88382 119.8765
#> 2016-10-04 202.6228 30.160 31.29729 38.10687 42.46553 119.4081
#> 2016-10-05 203.5195 30.160 31.89880 38.02249 42.37048 119.9421
#> 2016-10-06 203.6610 30.160 31.83196 38.08813 42.39899 120.0826
#> 2016-10-07 202.9626 30.670 31.58372 37.98500 42.35146 119.8296
#> 2016-10-10 204.0197 31.394 31.87970 38.18187 42.56060 120.5978
SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "index"
head(SP500_index)
#> index
#> 2016-10-03 2161.20
#> 2016-10-04 2150.49
#> 2016-10-05 2159.73
#> 2016-10-06 2160.77
#> 2016-10-07 2153.74
#> 2016-10-10 2163.66
# compute the log-returns of the ETFs and of the SP500 index as explicit factor
X <- diff(log(data_set), na.pad = FALSE)
N <- ncol(X) # number of ETFs
T <- nrow(X) # number of days
f <- diff(log(SP500_index), na.pad = FALSE)
Now we can compute the alpha and beta of all the ETFs:
# LS fit of the single-factor model in compact form: Gamma = [alpha, beta]
F_ <- cbind(ones = 1, f)
Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
colnames(Gamma) <- c("alpha", "beta")
alpha <- Gamma[, "alpha"]
beta <- Gamma[, "beta"]
print(Gamma)
#> alpha beta
#> SPY 7.142225e-05 1.0071424
#> XIVH 1.810392e-03 2.4971086
#> SPHB -2.422107e-04 1.5613533
#> SPLV 1.070918e-04 0.6777149
#> USMV 1.166177e-04 0.6511667
#> JKD 2.569578e-04 0.8883843
Some observations are now in order:
To get a feeling we can use some visualization:
# side-by-side barplots of alpha and beta; save the previous graphical
# parameters and restore them afterwards (safer than assuming mfrow = c(1,1))
old_par <- par(mfrow = c(1, 2))
barplot(rev(alpha), horiz = TRUE, main = "alpha", col = "red", cex.names = 0.75, las = 1)
barplot(rev(beta), horiz = TRUE, main = "beta", col = "blue", cex.names = 0.75, las = 1)
par(old_par)  # restore whatever settings were active before
We can also compare the different ETFs in a more systematic way using, for example, the Sharpe ratio. Recalling the factor model for one asset and one factor \[ x_t = \alpha + \beta f_t + \epsilon_t, \quad t=1,\ldots,T, \] we obtain \[ \begin{aligned} E[x_t] & = \alpha + \beta E[f_t]\\ {\sf var}[x_t] & = \beta^2 {\sf var}[f_t] + \sigma^2 \end{aligned} \] and the Sharpe ratio follows: \[ SR = \frac{E[x_t]}{\sqrt{{\sf var}[x_t]}} = \frac{\alpha + \beta E[f_t]}{\sqrt{\beta^2 {\sf var}[f_t] + \sigma^2}} = \frac{\alpha/\beta + E[f_t]}{\sqrt{{\sf var}[f_t] + \sigma^2/\beta^2}} \approx \frac{\alpha/\beta + E[f_t]}{\sqrt{{\sf var}[f_t]}} \] where we have assumed \(\sigma^2 \ll {\sf var}[f_t]\). Thus, a way to rank the different assets based on the Sharpe ratio is to rank them based on the ratio \(\alpha / \beta\):
# rank the ETFs by alpha/beta (proxy for the Sharpe ratio, see derivation above);
# order() is the idiomatic way to get a ranking permutation
idx_sorted <- order(alpha/beta, decreasing = TRUE)
SR <- colMeans(X)/sqrt(diag(var(X)))  # empirical Sharpe ratios (risk-free rate = 0)
ranking <- cbind("alpha/beta" = (alpha/beta)[idx_sorted],
                 SR = SR[idx_sorted],
                 alpha = alpha[idx_sorted],
                 beta = beta[idx_sorted])
print(ranking)
#> alpha/beta SR alpha beta
#> XIVH 7.249952e-04 0.13919483 1.810392e-03 2.4971086
#> JKD 2.892417e-04 0.17682677 2.569578e-04 0.8883843
#> USMV 1.790904e-04 0.12280053 1.166177e-04 0.6511667
#> SPLV 1.580189e-04 0.10887903 1.070918e-04 0.6777149
#> SPY 7.091574e-05 0.14170591 7.142225e-05 1.0071424
#> SPHB -1.551287e-04 0.07401566 -2.422107e-04 1.5613533
The following observations are in order:
This example will illustrate the popular Fama-French 3-factor model using nine stocks from the S&P 500.
Let’s start by loading the data:
library(xts)
library(quantmod)
# set begin-end date and stock namelist
begin_date <- "2013-01-01"
end_date <- "2017-08-31"
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "AEZS", "A", "APD", "AA", "CF")
# download the adjusted close prices from Yahoo Finance, one column per stock
data_set <- do.call(cbind, lapply(stock_namelist, function(ticker)
  Ad(getSymbols(ticker, from = begin_date, to = end_date, auto.assign = FALSE))))
colnames(data_set) <- stock_namelist
tclass(data_set) <- "Date"  # indexClass<- is deprecated in xts; tclass<- is its replacement
# download Fama-French factors from website
#url <- "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip"
#temp <- tempfile()
#download.file(url, temp, method = "libcurl", mode = "wb")
#unzip(temp, "F-F_Research_Data_Factors_daily.CSV")
#unlink(temp)
mydata <- read.csv("F-F_Research_Data_Factors_daily.CSV", skip = 4)
mydata <- mydata[-nrow(mydata), ]  # drop the trailing non-data row of the CSV
fama_lib <- xts(x = mydata[, c(2,3,4)], order.by = as.Date(paste(mydata[, 1]), "%Y%m%d"))
str(fama_lib)
#> An 'xts' object on 1926-07-01/2017-11-30 containing:
#> Data: num [1:24120, 1:3] 0.1 0.45 0.17 0.09 0.21 -0.71 0.62 0.04 0.48 0.04 ...
#> - attr(*, "dimnames")=List of 2
#> ..$ : NULL
#> ..$ : chr [1:3] "Mkt.RF" "SMB" "HML"
#> Indexed by objects of class: [Date] TZ: UTC
#> xts Attributes:
#> NULL
head(fama_lib)
#> Mkt.RF SMB HML
#> 1926-07-01 0.10 -0.24 -0.28
#> 1926-07-02 0.45 -0.32 -0.08
#> 1926-07-06 0.17 0.27 -0.35
#> 1926-07-07 0.09 -0.59 0.03
#> 1926-07-08 0.21 -0.36 0.15
#> 1926-07-09 -0.71 0.44 0.56
tail(fama_lib)
#> Mkt.RF SMB HML
#> 2017-11-22 -0.05 0.10 -0.04
#> 2017-11-24 0.21 0.02 -0.44
#> 2017-11-27 -0.06 -0.36 0.03
#> 2017-11-28 1.06 0.38 0.84
#> 2017-11-29 0.02 0.04 1.45
#> 2017-11-30 0.82 -0.56 -0.50
# compute the log-returns of the stocks and the Fama-French factors
X <- diff(log(data_set), na.pad = FALSE)
N <- ncol(X) # number of stocks
T <- nrow(X) # number of days
# align factors with the return dates and convert from percent to decimal.
# NOTE(review): assigning to F masks base R's F alias for FALSE; kept for
# consistency with the text, but a safer name (e.g. F_factors) is preferable
F <- fama_lib[index(X)]/100
Now we have the three explicit factors in matrix \(\mathbf{F}\) and want to fit the model \[ \mathbf{X}^T = \boldsymbol{\alpha}\mathbf{1}^T + \mathbf{B}\mathbf{F}^T + \mathbf{E}^T \] where now the loadings are a matrix of betas: \(\mathbf{B}=[\boldsymbol{\beta}_1, \ldots, \boldsymbol{\beta}_N]^T=[\mathbf{b}_1, \mathbf{b}_2, \mathbf{b}_3]\). As before, we can do an LS fitting as \[ \underset{\boldsymbol{\alpha},\mathbf{B}}{\text{minimize}}\quad \|\mathbf{X}^T - \boldsymbol{\alpha}\mathbf{1}^T - \mathbf{B}\mathbf{F}^T\|^2_F. \] More conveniently, we define \(\boldsymbol{\Gamma} = [ \boldsymbol{\alpha}, \mathbf{B} ]\) and the extended factors \(\tilde{\mathbf{F}} = [\mathbf{1}, \mathbf{F}]\). The LS formulation can then be written as \[ \underset{\boldsymbol{\Gamma}}{\text{minimize}}\quad \|\mathbf{X}^T - \boldsymbol{\Gamma}\tilde{\mathbf{F}}^T\|^2_F \] with solution \[ \boldsymbol{\Gamma} = \mathbf{X}^T \tilde{\mathbf{F}} (\tilde{\mathbf{F}}^T \tilde{\mathbf{F}})^{-1} \]
# extended factor matrix [1, F] so that Gamma = [alpha, B]
F_ <- cbind(ones = 1, F)
Gamma <- t(solve(t(F_) %*% F_, t(F_) %*% X))
colnames(Gamma) <- c("alpha", "b1", "b2", "b3")
alpha <- Gamma[, "alpha"]
B <- Gamma[, c("b1", "b2", "b3")]
print(Gamma)
#> alpha b1 b2 b3
#> AAPL 1.437845e-04 0.9657612 -0.23339130 -0.49806858
#> AMD 6.181760e-04 1.4062105 0.80738336 -0.07240117
#> ADI -2.285017e-05 1.2124008 0.09025928 -0.20739271
#> ABBV 1.621380e-04 1.0582340 0.02833584 -0.72152627
#> AEZS -4.513235e-03 0.6989534 1.31318108 -0.25160182
#> A 1.146100e-05 1.2181429 0.10370898 -0.20487290
#> APD 6.281504e-05 1.0222936 -0.04394061 0.11060938
#> AA -4.587722e-05 1.3391852 0.62590136 0.99858692
#> CF -5.777426e-04 1.0387867 0.48430007 0.82014523
Alternatively, we can simply use the R package covFactorModel to do the work for us:
library(covFactorModel) # devtools::install_github("dppalomar/covFactorModel")
# macroeconomic factor model using the three Fama-French factors as regressors
ff_fit <- factorModel(X, type = "M", econ_fact = F)
cbind(alpha = ff_fit$alpha, beta = ff_fit$beta)
#> alpha Mkt.RF SMB HML
#> AAPL 1.437845e-04 0.9657612 -0.23339130 -0.49806858
#> AMD 6.181760e-04 1.4062105 0.80738336 -0.07240117
#> ADI -2.285017e-05 1.2124008 0.09025928 -0.20739271
#> ABBV 1.621380e-04 1.0582340 0.02833584 -0.72152627
#> AEZS -4.513235e-03 0.6989534 1.31318108 -0.25160182
#> A 1.146100e-05 1.2181429 0.10370898 -0.20487290
#> APD 6.281504e-05 1.0222936 -0.04394061 0.11060938
#> AA -4.587722e-05 1.3391852 0.62590136 0.99858692
#> CF -5.777426e-04 1.0387867 0.48430007 0.82014523
Let’s now consider statistical factor models or implicit factor models where both the factors and the loadings are not available. Recall the principal factor method for the model \(\mathbf{X}^T = \boldsymbol{\alpha}\mathbf{1}^T + \mathbf{B}\mathbf{F}^T + \mathbf{E}^T\) with \(K\) factors:
# Principal factor method: alternate between (i) taking B from the top-K
# eigenpairs of Sigma - Psi and (ii) refitting the diagonal Psi, until the
# implied covariance Sigma = B B^T + Psi stabilizes (relative Frobenius change).
K <- 3
alpha <- colMeans(X)
X_ <- X - matrix(alpha, T, N, byrow = TRUE)  # demeaned returns
Sigma_prev <- matrix(0, N, N)
Sigma <- (1/(T-1)) * t(X_) %*% X_  # sample covariance matrix
eigSigma <- eigen(Sigma)
max_iter <- 100  # safeguard against a non-converging fixed-point iteration
iter <- 0
while (norm(Sigma - Sigma_prev, "F")/norm(Sigma, "F") > 1e-3 && iter < max_iter) {
  B <- eigSigma$vectors[, 1:K, drop = FALSE] %*% diag(sqrt(eigSigma$values[1:K]), K, K)
  Psi <- diag(diag(Sigma - B %*% t(B)))
  Sigma_prev <- Sigma
  Sigma <- B %*% t(B) + Psi
  eigSigma <- eigen(Sigma - Psi)
  iter <- iter + 1
}
cbind(alpha, B)
#> alpha
#> AAPL 0.0007074564 0.0002732114 -0.004631647 -0.0044814226
#> AMD 0.0013722468 0.0045782146 -0.035202146 0.0114549515
#> ADI 0.0006533116 0.0004151904 -0.007379066 -0.0053058139
#> ABBV 0.0007787929 0.0017513359 -0.003967816 -0.0056000810
#> AEZS -0.0041576357 0.0769496344 0.002935950 0.0006249473
#> A 0.0006902482 0.0012690079 -0.005680162 -0.0061507654
#> APD 0.0006236565 0.0005442926 -0.004229364 -0.0057976394
#> AA 0.0006277163 0.0027405024 -0.009796620 -0.0149177957
#> CF -0.0000573028 0.0023108605 -0.007409061 -0.0153425661
Again, we can simply use the R package covFactorModel to do the work for us:
library(covFactorModel)
# statistical (implicit) factor model: both factors and loadings are estimated
stat_fit <- factorModel(X, type = "S", K = K, max_iter = 10)
cbind(alpha = stat_fit$alpha, beta = stat_fit$beta)
#> alpha factor1 factor2 factor3
#> AAPL 0.0007074564 0.0002732114 -0.004631647 -0.0044814226
#> AMD 0.0013722468 0.0045782146 -0.035202146 0.0114549515
#> ADI 0.0006533116 0.0004151904 -0.007379066 -0.0053058139
#> ABBV 0.0007787929 0.0017513359 -0.003967816 -0.0056000810
#> AEZS -0.0041576357 0.0769496344 0.002935950 0.0006249473
#> A 0.0006902482 0.0012690079 -0.005680162 -0.0061507654
#> APD 0.0006236565 0.0005442926 -0.004229364 -0.0057976394
#> AA 0.0006277163 0.0027405024 -0.009796620 -0.0149177957
#> CF -0.0000573028 0.0023108605 -0.007409061 -0.0153425661
We will finally compare the following different factor models:
We will estimate the models during a training phase and then we will compare how well the estimated covariance matrices do compared to the sample covariance matrix of the test phase. The estimation error will be evaluated in terms of the Frobenius norm \(\|\hat{\boldsymbol{\Sigma}} - \boldsymbol{\Sigma}_{\sf true}\|_F\) as well as the PRIAL (PeRcentage Improvement in Average Loss): \[ {\sf PRIAL} = 100\times \frac{\|\boldsymbol{\Sigma}_{\sf scm} - \boldsymbol{\Sigma}_{\sf true}\|_F^2 - \|\hat{\boldsymbol{\Sigma}} - \boldsymbol{\Sigma}_{\sf true}\|_F^2}{\|\boldsymbol{\Sigma}_{\sf scm} - \boldsymbol{\Sigma}_{\sf true}\|_F^2} \] which goes to 0 when the estimation \(\hat{\boldsymbol{\Sigma}}\) tends to the sample covariance matrix \(\boldsymbol{\Sigma}_{\sf scm}\) and goes to 100 when the estimation \(\hat{\boldsymbol{\Sigma}}\) tends to the true covariance matrix \(\boldsymbol{\Sigma}_{\sf true}\).
Let’s load the training and test sets:
library(xts)
library(quantmod)
# set begin-end date and stock namelist
begin_date <- "2013-01-01"
end_date <- "2015-12-31"
stock_namelist <- c("AAPL", "AMD", "ADI", "ABBV", "AEZS", "A", "APD", "AA", "CF")
# prepare stock data: adjusted close prices, one column per stock
data_set <- do.call(cbind, lapply(stock_namelist, function(ticker)
  Ad(getSymbols(ticker, from = begin_date, to = end_date, auto.assign = FALSE))))
colnames(data_set) <- stock_namelist
tclass(data_set) <- "Date"  # indexClass<- is deprecated in xts; tclass<- is its replacement
X <- diff(log(data_set), na.pad = FALSE)
N <- ncol(X)
T <- nrow(X)
# prepare Fama-French factors (percent -> decimal, aligned with return dates)
mydata <- read.csv("F-F_Research_Data_Factors_daily.CSV", skip = 4)
mydata <- mydata[-nrow(mydata), ]  # drop the trailing non-data row of the CSV
fama_lib <- xts(x = mydata[, c(2,3,4)], order.by = as.Date(paste(mydata[, 1]), "%Y%m%d"))
F_FamaFrench <- fama_lib[index(X)]/100
# prepare index factor
SP500_index <- Ad(getSymbols("^GSPC", from = begin_date, to = end_date, auto.assign = FALSE))
colnames(SP500_index) <- "index"
f_SP500 <- diff(log(SP500_index), na.pad = FALSE)
# split data into training and test sets (45% train / 55% test, chronological)
T_trn <- round(0.45*T)
X_trn <- X[1:T_trn, ]
X_tst <- X[(T_trn+1):T, ]
F_FamaFrench_trn <- F_FamaFrench[1:T_trn, ]
F_FamaFrench_tst <- F_FamaFrench[(T_trn+1):T, ]
f_SP500_trn <- f_SP500[1:T_trn, ]
f_SP500_tst <- f_SP500[(T_trn+1):T, ]
Now let’s estimate the different factor models with the training data:
# sample covariance matrix of the training log-returns
Sigma_SCM <- cov(X_trn)

# single-factor (S&P 500) model fitted by LS on the training window
Ftilde <- cbind(ones = 1, f_SP500_trn)
Gamma <- t(solve(t(Ftilde) %*% Ftilde, t(Ftilde) %*% X_trn))
colnames(Gamma) <- c("alpha", "beta")
alpha <- Gamma[, "alpha"]
beta <- Gamma[, "beta"]
resid <- xts(t(t(X_trn) - Gamma %*% t(Ftilde)), index(X_trn))
Psi <- (1/(T_trn-2)) * t(resid) %*% resid
Sigma_SP500 <- as.numeric(var(f_SP500_trn)) * beta %o% beta + diag(diag(Psi))

# Fama-French 3-factor model fitted by LS on the training window
Ftilde <- cbind(ones = 1, F_FamaFrench_trn)
Gamma <- t(solve(t(Ftilde) %*% Ftilde, t(Ftilde) %*% X_trn))
colnames(Gamma) <- c("alpha", "beta1", "beta2", "beta3")
alpha <- Gamma[, "alpha"]
B <- Gamma[, c("beta1", "beta2", "beta3")]
resid <- xts(t(t(X_trn) - Gamma %*% t(Ftilde)), index(X_trn))
Psi <- (1/(T_trn-4)) * t(resid) %*% resid
Sigma_FamaFrench <- B %*% cov(F_FamaFrench_trn) %*% t(B) + diag(diag(Psi))
# Principal factor method for a statistical K-factor model.
# Alternates between taking B from the top-K eigenpairs of Sigma - Psi and
# refitting the diagonal Psi, until Sigma = B B^T + Psi stabilizes.
# Returns the implied covariance matrix. Extracted as a helper because the
# original code repeated the same iteration verbatim for K = 1, 3, 5.
principal_factor_cov <- function(X_data, K, tol = 1e-3, max_iter = 100) {
  T_obs <- nrow(X_data)
  N_assets <- ncol(X_data)
  # demean the returns
  X_c <- X_data - matrix(colMeans(X_data), T_obs, N_assets, byrow = TRUE)
  Sigma_prev <- matrix(0, N_assets, N_assets)
  Sigma <- (1/(T_obs - 1)) * t(X_c) %*% X_c  # sample covariance matrix
  eigSigma <- eigen(Sigma)
  iter <- 0  # cap iterations as a safeguard against non-convergence
  while (norm(Sigma - Sigma_prev, "F")/norm(Sigma, "F") > tol && iter < max_iter) {
    B <- eigSigma$vectors[, 1:K, drop = FALSE] %*% diag(sqrt(eigSigma$values[1:K]), K, K)
    Psi <- diag(diag(Sigma - B %*% t(B)))
    Sigma_prev <- Sigma
    Sigma <- B %*% t(B) + Psi
    eigSigma <- eigen(Sigma - Psi)
    iter <- iter + 1
  }
  Sigma
}

# statistical factor models with 1, 3 and 5 factors
Sigma_PCA1 <- principal_factor_cov(X_trn, K = 1)
Sigma_PCA3 <- principal_factor_cov(X_trn, K = 3)
Sigma_PCA5 <- principal_factor_cov(X_trn, K = 5)
Finally, let’s compare the different estimations in the test data:
Sigma_true <- cov(X_tst)  # out-of-sample covariance used as the "true" reference
# Frobenius-norm estimation error of each covariance estimate
estimates <- list(SCM = Sigma_SCM,
                  SP500 = Sigma_SP500,
                  FamaFrench = Sigma_FamaFrench,
                  PCA1 = Sigma_PCA1,
                  PCA3 = Sigma_PCA3,
                  PCA5 = Sigma_PCA5)
error <- vapply(estimates, function(S) norm(S - Sigma_true, "F"), numeric(1))
print(error)
#> SCM SP500 FamaFrench PCA1 PCA3 PCA5
#> 0.007105459 0.007100631 0.007089249 0.007139017 0.007100200 0.007103374
barplot(error, main = "Error in estimation of covariance matrix",
        col = "aquamarine3", cex.names = 0.75, las = 1)
# PRIAL: percentage improvement over the sample covariance matrix
ref <- norm(Sigma_SCM - Sigma_true, "F")^2
PRIAL <- 100*(ref - error^2)/ref
print(PRIAL)
#> SCM SP500 FamaFrench PCA1 PCA3 PCA5
#> 0.00000000 0.13586162 0.45574782 -0.94681006 0.14797930 0.05867364
barplot(PRIAL, main = "PRIAL for estimation of covariance matrix",
        col = "bisque3", cex.names = 0.75, las = 1)
👉 The final conclusion is that using factor models for covariance matrix estimation may help. In addition, it is not clear that using explicit factors helps as compared to statistical factor modeling.