This tutorial shows how to extract Google News search results with the R programming language. It is useful when you want to display the latest news on a topic you are interested in, for example in a dashboard. Google News lets you search for news using keywords of your interest.
Make sure to install the rvest, dplyr and xml2 R packages before running the following script.
The script returns a data frame with the following columns:
- Title : Headline of the article
- Link : URL of the article
- Description : 1 or 2 lines summary of the article
- Source : Name of the Original Content Creator
- Time : When the article was published
#' Scrape Google News search results for a search term
#'
#' Downloads the Google News search page for `term` and extracts one row per
#' matched article using fixed CSS class selectors.
#'
#' @param term A single search term. Plain text (e.g. "indian economy") and
#'   already percent-encoded text (e.g. "indian%20economy") are both accepted.
#' @return A data frame with character columns `Title`, `Link`, `Description`,
#'   `Source` and `Time`, de-duplicated on `Time`. It has zero rows when the
#'   selectors match nothing on the page.
#' @details Stops with an error when the selectors do not yield the same
#'   number of values for every field, because the columns could then no
#'   longer be aligned article by article.
news <- function(term) {
  # library() stops immediately when a package is missing, whereas require()
  # only returns FALSE and lets the function fail later with an unclear error.
  library(dplyr)
  library(xml2)
  library(rvest)

  stopifnot(length(term) == 1L, !is.na(term))

  base_url <- "https://news.google.com/"

  # Encode unsafe characters such as spaces and quotes. URLencode() returns a
  # term that already contains %XX escapes unchanged, and with
  # reserved = FALSE it leaves reserved characters such as "+" and "&" alone,
  # so terms that worked before are sent exactly as before.
  query <- utils::URLencode(as.character(term), reserved = FALSE)
  html_dat <- read_html(
    paste0(base_url, "search?q=", query, "&hl=en-IN&gl=IN&ceid=US%3Aen")
  )

  # Article links are relative ("./articles/..."); make them absolute. The
  # pattern is anchored and the dot escaped so absolute links are left intact.
  links <- html_attr(html_nodes(html_dat, ".VDXfz"), "href")
  links <- sub("^\\./", base_url, links)

  titles <- html_text(html_nodes(html_dat, ".DY5T1d"))
  descriptions <- html_text(html_nodes(html_dat, ".Rai5ob"))

  # html_node() on a node set returns one result per input node and
  # html_text() gives NA where the child is absent, so Source and Time stay
  # aligned with their parent nodes even when one of them is missing.
  meta_nodes <- html_nodes(html_dat, ".SVJrMe")
  sources <- html_text(html_node(meta_nodes, "a"))
  times <- html_text(html_node(meta_nodes, "time"))

  # data.frame() silently recycles vectors whose lengths divide evenly, which
  # would pair titles with the wrong links; fail loudly instead.
  field_lengths <- c(
    Title = length(titles),
    Link = length(links),
    Description = length(descriptions),
    Source = length(sources),
    Time = length(times)
  )
  if (length(unique(field_lengths)) > 1L) {
    stop(
      "Google News page did not yield one value per article for each field (",
      paste0(names(field_lengths), " = ", field_lengths, collapse = ", "),
      "); the CSS selectors may be out of date.",
      call. = FALSE
    )
  }

  news_dat <- data.frame(
    Title = titles,
    Link = links,
    Description = descriptions,
    Source = sources,
    Time = times,
    stringsAsFactors = FALSE
  )

  # NOTE(review): de-duplicating on Time also drops unrelated articles that
  # share a timestamp; kept as in the original - confirm whether Link was
  # the intended key.
  distinct(news_dat, Time, .keep_all = TRUE)
}
# Example: search for "indian economy"; the space is written as %20 so the
# term is safe to place in the request URL.
newsdf <- news("indian%20economy")
Here %20 is the URL encoding of the space between the two words.