-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubscraper
32 lines (25 loc) · 925 Bytes
/
subscraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import requests
from bs4 import BeautifulSoup
import time
def _fetch_soup(url):
    """Download *url* and return its HTML parsed with 'html.parser'.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    return BeautifulSoup(response.content, 'html.parser')


def _record_article(soup):
    """Append "<title> - <first paragraph>" for *soup* to savedArticles.txt.

    Returns the heading text, or None (writing nothing) when the page has no
    element with id ``firstHeading`` or no ``<p>`` element.
    """
    heading = soup.find(id="firstHeading")
    first_paragraph = soup.find('p')
    # find() returns None on a miss; not every linked page has both elements.
    if heading is None or first_paragraph is None:
        return None
    # Explicit UTF-8 so non-ASCII article text does not depend on the
    # platform's default encoding.
    with open("savedArticles.txt", "a", encoding="utf-8") as out_file:
        out_file.write(heading.text + " - " + first_paragraph.getText())
    return heading.text


def scrape_wiki_articles(url):
    """Save the title and first paragraph of a page and of the pages it links.

    The page at *url* is fetched first; then every distinct link inside its
    ``bodyContent`` element whose href starts with ``/wiki/`` is fetched from
    ``https://en.wikipedia.org``. For each page that has both a
    ``firstHeading`` element and a ``<p>`` element, one
    "<title> - <first paragraph>" entry is appended to ``savedArticles.txt``
    and the title is printed. The function pauses two seconds after every
    request.

    Args:
        url: Address of the starting page.

    Raises:
        requests.RequestException: if the starting page cannot be fetched.
            Failures on linked pages are reported and skipped instead.
    """
    seed_soup = _fetch_soup(url)
    seed_title = _record_article(seed_soup)
    if seed_title is not None:
        print(seed_title)
    time.sleep(2)

    content = seed_soup.find(id="bodyContent")
    if content is None:
        return

    visited = set()
    for link in content.find_all('a', href=True):
        href = link['href']
        # Only site-relative article paths can be appended to the host below;
        # a substring test would also accept absolute external URLs that
        # merely contain "/wiki/" and produce an invalid address.
        if not href.startswith("/wiki/") or href in visited:
            continue
        visited.add(href)
        try:
            linked_soup = _fetch_soup("https://en.wikipedia.org" + href)
        except requests.RequestException as err:
            # One bad link should not abort the rest of the crawl.
            print("Skipping " + href + ": " + str(err))
        else:
            linked_title = _record_article(linked_soup)
            if linked_title is not None:
                print(linked_title)
        time.sleep(2)
# Start the crawl only when this file is executed as a script, so importing
# it does not trigger network requests and file writes.
if __name__ == "__main__":
    scrape_wiki_articles("https://en.wikipedia.org/wiki/GitHub")