#### scrapeNYT_API2.R
#### cengel - 10/30/14

library(RJSONIO)
library (RCurl)

### set parameters ###
api <- "XXXXXXX" #<<<<<<<<<<<<<===== API key goes here
q <- "ebola+outbreak" # Query string, use + instead of space
records <- 500 #how many results do we want? (Note limitations)
pageRange <- 0:(records/10-1)

# get data 
dat <- c()

for (i in pageRange) {
   # concatenate URL for each page
   uri <- paste0("http://api.nytimes.com/svc/search/v2/articlesearch.json?q=", q, "&page=", i, "&fl=pub_date&api-key=", api)
   d <- getURL(uri)
   res <- fromJSON(d,simplify = FALSE)
   dat <- append(dat, unlist(res$response$docs))  # convert the dates to a vector and append
 }

# establish date range
dat.conv <- strptime(dat, format="%Y-%m-%d") # need to convert dat into POSIX format
daterange <- c(min(dat.conv), max(dat.conv))
dat.all <- seq(daterange[1], daterange[2], by="day") # all possible days

# aggregate counts for dates and coerce into a data frame
cts <- as.data.frame(table(dat))

# compare dates from counts dataframe with the whole data range
# assign 0 where there is no count, otherwise take count
# (take out PSD at the end to make it comparable)
dat.all <- strptime(dat.all, format="%Y-%m-%d")
# can't seem to be able to compare Posix objects with %in%, so coerce them to character for this:
freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y-%m-%d")), cts$Freq, 0)

plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date")
axis(1, 1:length(freqs), dat.all)
lines(lowess(freqs, f=.2), col = 2)