Code
library(dplyr)
library(tidyverse)
library(readxl)
library(tidyquant)
library(glue)
library(tidyr)
library(lubridate)
library(dplyr)
library(tidyverse)
library(readxl)
library(tidyquant)
library(glue)
library(tidyr)
library(lubridate)
= read.csv("daily-treasury-rates-2021.csv")
APR21 = read.csv("daily-treasury-rates-2022.csv")
APR22 = read.csv("daily-treasury-rates-2023.csv")
APR23 = bind_rows(APR21,APR22,APR23)
APR head(APR)
Date X1.Mo X2.Mo X3.Mo X6.Mo X1.Yr X2.Yr X3.Yr X5.Yr X7.Yr X10.Yr
1 12/31/2021 0.06 0.05 0.06 0.19 0.39 0.73 0.97 1.26 1.44 1.52
2 12/30/2021 0.06 0.06 0.05 0.19 0.38 0.73 0.98 1.27 1.44 1.52
3 12/29/2021 0.01 0.02 0.05 0.19 0.38 0.75 0.99 1.29 1.47 1.55
4 12/28/2021 0.03 0.04 0.06 0.20 0.39 0.74 0.99 1.27 1.41 1.49
5 12/27/2021 0.04 0.05 0.06 0.21 0.33 0.76 0.98 1.26 1.41 1.48
6 12/23/2021 0.04 0.05 0.07 0.18 0.31 0.71 0.97 1.25 1.42 1.50
X20.Yr X30.Yr X4.Mo
1 1.94 1.90 NA
2 1.97 1.93 NA
3 2.00 1.96 NA
4 1.94 1.90 NA
5 1.92 1.88 NA
6 1.94 1.91 NA
Downloaded sp500 portfolio from https://www.slickcharts.com/sp500 as sp500rank.xlsx
<- read_excel("sp500rank.xlsx")
sp500rank = unlist(sp500rank['Symbol'])
sp500rank_list = sp500rank_list[1:105]
top_105_symbols = rev(rev(sp500rank_list)[1:105]) last_105_symbols
The stock symbols are then used to download 20 year daily movement from yahoo finance
options("getSymbols.warning4.0"=FALSE)
options("getSymbols.yahoo.warning"=FALSE)
#getSymbols(top_105_symbols, from = '2003-10-30', to = "2023-10-30",warnings = FALSE)
#for (symbol in top_105_symbols) {
# if (exists(symbol)) {
# write.zoo(get(symbol), glue('data/{symbol}.csv'), sep = ",")
# }
#}
#getSymbols(last_105_symbols, from = '2003-10-30', to = "2023-10-30",warnings = FALSE)
#for (symbol in last_105_symbols) {
# if (exists(symbol)) {
# write.zoo(get(symbol), glue('data/{symbol}.csv'), sep = ",")
# }
#}
getSymbols('^GSPC', from = '2003-10-30', to = "2023-10-30",warnings = FALSE)
[1] "GSPC"
write.zoo(GSPC, 'data/GSPC.csv', sep = ",")
Data collected:
<- APR |> column_to_rownames("Date")
df <- df |> rownames_to_column("id") |> pivot_longer(cols = -id) |> mutate(missing = ifelse(is.na(value),"yes","no"))
df <- df |> mutate(id = as.Date(id, format = "%m/%d/%Y"))
df ggplot(df, aes(x=name, y=id, fill = missing)) +
geom_tile()
Before 10/18/2022 there are no 4 months interest rates.
<- as.Date("2003-10-29")
start_date <- as.Date("2023-11-05")
end_date <- seq(start_date, end_date, by = "1 day")
date_sequence
<- data.frame(Date = date_sequence)
df $Week_Start_Date <- floor_date(df$Date, unit = "week")
df<- unique(df$Week_Start_Date)
unique_week_start_dates
<- data.frame(
template_df Sunday = rep(0, length(unique_week_start_dates)),
Monday = rep(0, length(unique_week_start_dates)),
Tuesday = rep(0, length(unique_week_start_dates)),
Wednesday = rep(0, length(unique_week_start_dates)),
Thursday = rep(0, length(unique_week_start_dates)),
Friday = rep(0, length(unique_week_start_dates)),
Saturday = rep(0, length(unique_week_start_dates))
)
<- cbind.data.frame(Week_Start_Date = unique_week_start_dates, template_df)
result_df
<- result_df %>%
result_df select(Week_Start_Date, Sunday, Monday, Tuesday, Wednesday, Thursday, Friday, Saturday)
<- result_df
sp500_missing <- as.Date(unlist(read.csv("data/GSPC.csv")['Index']))
sp500_dates for (date in sp500_dates) {
<- as.Date(date)
date <- floor_date(date, unit = "week")
week_start <- weekdays(date)
day_of_week <- match(day_of_week, c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
day_of_week $Week_Start_Date == week_start, day_of_week] <- 1
sp500_missing[sp500_missing }
<- result_df
meta_missing <- as.Date(unlist(read.csv("data/META.csv")['Index']))
meta_dates for (date in meta_dates) {
<- as.Date(date)
date <- floor_date(date, unit = "week")
week_start <- weekdays(date)
day_of_week <- match(day_of_week, c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
day_of_week $Week_Start_Date == week_start, day_of_week] <- 1
meta_missing[meta_missing }
<- pivot_longer(sp500_missing, cols = -Week_Start_Date, names_to = "Day", values_to = "Value")
long_df
ggplot(long_df, aes(x = Week_Start_Date, y = Day, fill = factor(Value))) +
geom_tile() +
scale_fill_manual(values = c("1" = "cornflowerblue", "0" = "red")) +
labs(fill = "Value", x = "Week Start Date", y = "Day of the Week", title = "S&P500") +
coord_flip()
<- pivot_longer(meta_missing, cols = -Week_Start_Date, names_to = "Day", values_to = "Value")
long_df
ggplot(long_df, aes(x = Week_Start_Date, y = Day, fill = factor(Value))) +
geom_tile() +
scale_fill_manual(values = c("1" = "cornflowerblue", "0" = "red")) +
labs(fill = "Value", x = "Week Start Date", y = "Day of the Week", title = "META") +
coord_flip()
There is no missing value in the stock data. However, due to some stocks entering the market late, earlier data does not exist for those stocks.
While there is no missing data, market closes during holidays, so some dates will miss, but the data from Yahoo is accurate to use.