-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathunit-tests.r
44 lines (32 loc) · 1.96 KB
/
unit-tests.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# Work from the local checkout; every relative path below ("SCRAP",
# "test/urls/", ...) is resolved against this directory.
setwd("~/git/scraping")

# Regenerate the documentation of the SCRAP package, reinstall it from the
# local source directory, and attach the freshly installed version.
library(devtools)
pkg_dir <- "SCRAP"
document(pkg_dir)
install(pkg_dir)
library(SCRAP)
# Sample article URLs (one per .de news domain) used to exercise the
# classify -> download -> parse steps below.
urls <- c(
  "http://www.spiegel.de/politik/ausland/scott-pruitt-und-donald-trump-wer-ist-der-neue-chef-der-epa-a-1135234.html",
  "http://www.sueddeutsche.de/politik/sicherheitskonferenz-trump-der-unsichtbare-elefant-1.3385139",
  "http://www.focus.de/wissen/weltraum/gefahr-aus-dem-all-droht-ein-schwarzes-loch-die-erde-zu-verschlingen_id_6658237.html",
  "http://www.bild.de/politik/inland/muenchner-sicherheitskonferenz/russland-aussenminister-lawrow-auf-muenchener-sicherheitskonferenz-50493950.bild.html",
  "http://www.zeit.de/politik/deutschland/2017-02/afd-bjoern-hoecke-parteiausschluss-dresden-entschuldigung",
  "https://www.welt.de/wirtschaft/article162190193/Merkel-kontert-Kritik-an-Exportstaerke-mit-kleinem-Scherz.html",
  "http://www.t-online.de/lifestyle/gesundheit/id_80399498/umweltministerium-verbietet-fleisch-und-fisch-fuer-seine-gaeste.html"
)
# 1) Classify the list of URLs as news or not, then keep only the news rows.
urls_df <- classify_urls(urls)
# which() keeps only the rows where the comparison is TRUE and drops rows
# whose isNews is NA. Plain logical indexing would turn every NA into an
# all-NA row, which the later steps would then try to download and parse.
urls_df <- urls_df[which(urls_df$isNews == TRUE), ]
# 2) Download every remaining URL into test/urls/. The file name is the full
#    URL with each "/" replaced by "_".
# sprintf() returns character(0) for zero-length input, so this assignment
# also works when no rows are left; paste0() would return a single
# "test/urls/" string, which cannot be assigned to a zero-row data frame.
urls_df$filename <- sprintf("test/urls/%s", gsub("/", "_", urls_df$url_full))
# seq_len() gives an empty sequence for zero rows; 1:nrow() would iterate
# over c(1, 0) and call download_url() on rows that do not exist.
for (i in seq_len(nrow(urls_df))) {
  download_url(urls_df$url_full[i], urls_df$filename[i])
}
# 3) Parse every downloaded file into test/urls_parsed/. The output name is
#    the full URL with each "/" replaced by "_", plus a ".txt" extension.
# sprintf() returns character(0) for zero-length input, so this assignment
# also works when no rows are left; paste0() would return a single
# "test/urls_parsed/.txt" string, which cannot be assigned to a zero-row
# data frame.
urls_df$output <- sprintf(
  "test/urls_parsed/%s.txt",
  gsub("/", "_", urls_df$url_full)
)
# seq_len() gives an empty sequence for zero rows; 1:nrow() would iterate
# over c(1, 0) and call parse_html() on rows that do not exist.
for (i in seq_len(nrow(urls_df))) {
  parse_html(urls_df$filename[i], urls_df$output[i])
}
# POSSIBLE ERROR -- known issues with the two extractions below:
#   * main_text contains extra text besides the article.
#   * zeit_online contains only half of the article.
# NOTE(review): read_html() / html_text() are not attached by any library()
# call visible in this script -- presumably they come from xml2 / rvest via
# the packages loaded above; confirm.

# Case 1: fetch the page, flatten it to text, and pass that text to
# ArticleExtractor() with asText = TRUE.
cnn_page <- read_html(
  "http://money.cnn.com/2017/05/31/news/economy/china-europe-eu-trump-us-trade/index.html"
)
main_text <- ArticleExtractor(html_text(cnn_page), asText = TRUE)

# Case 2: pass the URL string itself to ArticleExtractor() with
# asText = FALSE.
zeit_online <- ArticleExtractor(
  "http://www.zeit.de/politik/ausland/2017-05/donald-trump-pariser-klimabkommen-ausstieg-deutschland-verantwortung",
  asText = FALSE
)