Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

R textConnection #48

Open
AdamZ2 opened this issue Sep 27, 2018 · 1 comment
Open

R textConnection #48

AdamZ2 opened this issue Sep 27, 2018 · 1 comment

Comments

@AdamZ2
Copy link

AdamZ2 commented Sep 27, 2018

I am trying to use the folowing code to download some forms from (https://www.sec.gov/Archives/edgar/full-index/ ).
I am getting this error :
Error in fread(textConnection(master), sep = "|") :
input= must be a single character string containing a file name, a system command containing at least one space, a URL starting 'http[s]://', 'ftp[s]://' or 'file://', or, the input data itself containing at least one \n or \r

this the code
start_year <- 2016
start_QTR <- 1

end_year <- 2016
end_QTR <- 4
setwd("D:/Requests/SC13D")
require(data.table)
require(Hmisc)
require(data.table)

###########################################
####### construct SEC master file ########
###########################################
qtr.master.file <- function(year, QTR)
{
require(data.table)
name <- paste0("https://www.sec.gov/Archives/edgar/full-index/", year,"/QTR",QTR,"/master.idx")
print(sprintf("Downloading master file for quarter %d of year %s...", QTR, year))
master <- readLines(url(name))
master <- master[grep("SC 13(D|G)", master)]
master <- gsub("#", "", master) # R does not treat a comment sign well
master_table <- fread(textConnection(master), sep = "|")
rm(master)
colnames(master_table) <- c("cik", "name", "type", "date", "link")
master_table <- as.data.table(master_table)
master_table[, link := paste0("https://www.sec.gov/Archives/", link)]
master_table[, file := gsub(".*/", "", link)]
closeAllConnections()
return(master_table)
}
###########################################

download all files into temp dir

###########################################
dwnld.files <- function(master)
{
require(RCurl)
dir.create("temp_dir")
master <- as.data.frame(master)
master <- master[!duplicated(master$file),]
for(j in 1:length(master$file))
{
file <- NA
file_url <- as.character(master$link[j])
file_name <- paste0("./temp_dir/",master$file[j])
try(file <- getURL(file_url))
write(file, file_name)
}
}
###########################################
####### put all forms in SQdatabase #######
###########################################
put.files.in.sql <- function(dbname)
{
library(DBI)
library(RSQLite)
together <- function(x)
{
return(paste(x, collapse = "\n"))
}

con = dbConnect(SQLite(), dbname=dbname)
dbSendQuery(conn=con,
"CREATE TABLE compsubm
(FILENAME TEXT, COMLSUBFILE TEXT)")
path <- paste0("./temp_dir/")
files <- list.files(path)
n <- length(files)
step <- 500
for(i in 1:(n %/% step + 1))
{
start <- 1 + (i-1)step
end <- i
(step)
ind <- start:min(end,n)
objects <- lapply(paste0(path,files[ind]), readLines)
clean <- lapply(objects, together)
data <- NULL
data$FILENAME <- files[ind]
data <- as.data.frame(data)
data$COMLSUBFILE <- unlist(clean)
dbWriteTable(conn=con, name = "compsubm", data, append = T)
}
dbDisconnect(con)
unlink("temp_dir", recursive = T)
}
get_dates <- function(start_year, start_QTR, end_year, end_QTR)
{
require(data.table)
all_dates <- data.table(year = rep(1993:2050, 4))
setkey(all_dates,year)
all_dates[, QTR := 1:.N, by = year]
all_dates <- as.data.frame(all_dates)

x <- paste0(all_dates$year, all_dates$QTR) >= paste0(start_year, start_QTR) & paste0(all_dates$year, all_dates$QTR) <= paste0(end_year, end_QTR)
return(all_dates[x,])
}

dates <- get_dates(start_year, start_QTR, end_year, end_QTR)
for(i in 1:length(dates$QTR))
{
print(Sys.time())
master <- qtr.master.file(dates$year[i], dates$QTR[i])
write.csv(master, paste0("./Master/master_", dates$year[i], dates$QTR[i], ".csv"), row.names = F)
print("Dowloading files, it takes up to 4 hours")
dwnld.files(master)
print("Putting all files into SQL & cleaning")
put.files.in.sql(paste0("./Forms/",dates$year[i],dates$QTR[i], ".sqlite"))
}

@Jennyyanhuadong
Copy link

I find that your code is similar to https://github.com/volkovacodes/Block_Codes/blob/master/1.%20download_forms.R
The code there uses csv, and then separate. I tried the code, and it works.
write(master, "tmp.csv")
master_table <- fread("tmp.csv", sep = "|")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants