Commit
Showing 4 changed files with 246 additions and 0 deletions.
New file, 4 lines added: the spiders package __init__.
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
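This is the stock package marker that scrapy startproject generates; Scrapy discovers any spider classes placed in this package by their name attribute. As a sketch of how the spiders in this commit could be run programmatically (the import path is an assumption, since the project layout is not shown in this commit):

# Hypothetical runner; the module path 'myproject.spiders.investopedia' is assumed.
from scrapy.crawler import CrawlerProcess

from myproject.spiders.investopedia import InvestopediaSpider

process = CrawlerProcess()
process.crawl(InvestopediaSpider)
process.start()  # blocks until the crawl finishes; appends to investopedia_data.txt

The usual alternative is scrapy crawl investopedia from the project root.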
New file, 85 lines added: the investopedia spider.
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
    name = 'investopedia'
    start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        # The URL must contain 'terms' for us to proceed further
        if 'terms' not in url:
            return

        # If the URL ends with '.asp', then that's a topic page
        if url.endswith('.asp'):
            return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
        return self._parse_topic_list(response)

    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - it creates issues when writing to a file
        title = title.replace('/', ' ')

        # Get the first div with id Content, then its content boxes
        content = response.css('div#Content')[0]
        content = content.css('div.content-box')

        text = ''
        # Use a relative XPath ('.//p') so only paragraphs inside the content
        # boxes are selected; a bare '//p' would match the entire page
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags, including <p> and <a>, dropping <script> blocks entirely
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US'
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization:
            # pad punctuation with spaces so each mark becomes its own token
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the running text
            text += paragraph.lower() + '\n'

        # Append the collected text to the output file
        filename = 'investopedia_data.txt'
        with open(filename, 'a') as f:
            f.write(text)
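The punctuation-padding loop is the core of the tokenization prep shared by all three spiders, so here is a minimal standalone sketch of what it does to a sentence (the sample string is made up):

# Pad each punctuation mark with spaces so a naive str.split()
# treats it as its own token.
paragraph = 'Odd-lot theory (a technical analysis idea).'
for c in '();.,[]"\'-:/%$+@?':
    paragraph = paragraph.replace(c, ' {} '.format(c))
print(paragraph.lower().split())
# ['odd', '-', 'lot', 'theory', '(', 'a', 'technical', 'analysis', 'idea', ')', '.']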
New file, 51 lines added: the qplum spider.
import json
import re

from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class QplumSpider(CrawlSpider):
    name = 'qplum'
    start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)]

    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)
        data = data['content']

        # Remove <script>, <sup>, <math> and <style> tags along with their content
        paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
        # Remove the rest of the tags without removing their content
        paragraph = remove_tags(paragraph)

        # Replace '&amp;' with '&'
        paragraph = paragraph.replace('&amp;', '&')
        # Replace '&#39;' and the curly-quote entities with a plain quote
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace any remaining five- and four-character entities with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US'
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        for c in ['\n', '\r', '\t']:
            paragraph = paragraph.replace(c, ' ')
        for c in '();.,[]"\'-:/%$+@?':
            paragraph = paragraph.replace(c, ' {} '.format(c))

        # Append the result to the output file
        filename = 'qplum_data.txt'
        with open(filename, 'a') as f:
            f.write(paragraph.lower() + '\n')
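One caveat worth noting: by default Scrapy's HttpErrorMiddleware drops non-2xx responses before they reach parse(), so the response.status != 200 guard above never sees an error page unless the spider opts in. A minimal sketch of that opt-in (handle_httpstatus_list is standard Scrapy API; which error codes qplum.co actually returns is an assumption):

class QplumSpider(CrawlSpider):
    name = 'qplum'
    # Let 404s through to parse() so the status check can skip them;
    # without this, HttpErrorMiddleware silently discards those responses.
    handle_httpstatus_list = [404]
    start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)]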
New file, 106 lines added: the wikipedia spider.
import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class WikipediaSpider(CrawlSpider):
    name = 'wikipedia'
    start_urls = ['https://en.wikipedia.org/wiki/Outline_of_finance']

    def parse(self, response):
        """
        Parse the response page
        """
        url = response.url

        if url in WikipediaSpider.start_urls:
            return self._parse_topic_list(response)

        else:
            self.parse_topic_response(response)
            return self._parse_links(response)

    def parse_topic_response(self, response):
        """
        Parse the content
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - it creates issues when writing to a file
        title = title.replace('/', ' ')

        content = response.css('div#mw-content-text')

        # Just extract all the '<p></p>' children from this div;
        # the relative './/p' keeps the query scoped to the content div
        text = ''
        for child in content.xpath('.//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove <script>, <sup> and <math> tags along with their content
            paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
            # Remove the rest of the tags without removing their content
            paragraph = remove_tags(paragraph)

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US'
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the running text
            text += paragraph.lower() + '\n'

        # Append the text to the output file
        filename = 'wiki_data.txt'
        with open(filename, 'a') as f:
            f.write(text)

    def _parse_links(self, response):
        """
        Parses the links from the first level of pages
        """
        content = response.css('div#mw-content-text')

        for child in content.xpath('.//p'):
            # Extract the URLs
            urls = child.css('a::attr(href)').extract()

            for url in urls:
                if url is None or 'wiki' not in url:
                    continue

                next_page = response.urljoin(url)
                yield scrapy.Request(next_page, callback=self.parse_topic_response)

    def _parse_topic_list(self, response):
        """
        Parse various topics from the list of topics
        """
        # All of the links on this page are in the bullet points,
        # so extract the 'ul' tags to get the lists
        content = response.css('div#mw-content-text')
        lists = content.css('ul')

        # Iterate through each list
        for ul in lists:

            # Iterate through each list item
            for l in ul.css('li'):
                # Extract the URL
                url = l.css('a::attr(href)').extract_first()

                # Skip external links as well as links to the same page (e.g. the TOC)
                if url is None or 'wiki' not in url:
                    continue

                next_page = response.urljoin(url)
                yield scrapy.Request(next_page, callback=self.parse)
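Because this spider follows every wiki link in every paragraph, a full run fans out quickly. For a trial run it may be worth capping the crawl via Scrapy's CLOSESPIDER_PAGECOUNT setting; a minimal sketch (the cap of 100 is an arbitrary choice):

from scrapy.crawler import CrawlerProcess

# Stop after 100 downloaded responses so a test run stays small.
process = CrawlerProcess(settings={'CLOSESPIDER_PAGECOUNT': 100})
process.crawl(WikipediaSpider)
process.start()  # output is appended to wiki_data.txt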