Adding the crawler code
hardikp committed Jun 29, 2017
1 parent a0ed5dc commit 936f4df
Showing 4 changed files with 246 additions and 0 deletions.
4 changes: 4 additions & 0 deletions scrapy-spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
85 changes: 85 additions & 0 deletions scrapy-spiders/investopedia.py
@@ -0,0 +1,85 @@
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
name = 'investopedia'
start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

def parse(self, response):
"""
Parse the response page
"""
url = response.url

        # The URL must contain 'terms' for us to proceed further
if 'terms' not in url:
return

# if the url ends with '.asp', then that's a topic page
if url.endswith('.asp'):
return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
return self._parse_topic_list(response)

def _parse_topic_response(self, response):
"""
Parses various topics
e.g. www.investopedia.com/terms/o/oddlottheory.asp
"""
# Get the title first
title = response.css('title::text').extract_first()

        # Replace / with a space - slashes cause problems if the
        # title is later used in a file name
title = title.replace('/', ' ')

# Get the first div with id Content
content = response.css('div#Content')[0]
content = content.css('div.content-box')

text = ''
        # './/p' keeps the search relative to `content`; a bare '//p'
        # would match every <p> in the whole page
        for child in content.xpath('.//p'):

# Get the text from this child <p></p> tag
paragraph = child.extract()

# Remove tags including <p> and <a>
paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

# Replace '&amp;' with '&'
paragraph = paragraph.replace('&amp;', '&')

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

# Add to the file
text += paragraph.lower() + '\n'

        # Save the extracted text (append mode, so each topic adds to the corpus)
        filename = 'investopedia_data.txt'
        with open(filename, 'a') as f:
            f.write(text)

def _parse_topic_list(self, response):
"""
Parse the page with the topics listed out
e.g. www.investopedia.com/terms/o/
"""
list_element = response.css('ol.list')

# Iterate through the list of topics
        for item in list_element.css('li'):
            # Extract the URL
            url = item.css('a::attr(href)').extract_first()

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse)
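
Note: the punctuation padding above (repeated in the other two spiders) just surrounds each mark with spaces so a plain whitespace tokenizer sees it as its own token. A minimal sketch of that step, with pad_punctuation as a hypothetical helper name, not part of this commit:

def pad_punctuation(text):
    # Surround common punctuation with spaces so a whitespace
    # tokenizer treats each mark as a separate token
    for c in '();.,[]"\'-:/%$+@?':
        text = text.replace(c, ' {} '.format(c))
    return text.lower()

print(pad_punctuation('Odd-lot theory (contrarian).'))
# -> 'odd - lot theory  ( contrarian )  . '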
51 changes: 51 additions & 0 deletions scrapy-spiders/qplum.py
@@ -0,0 +1,51 @@
import json
import re

from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class QplumSpider(CrawlSpider):
name = 'qplum'
start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)]

def parse(self, response):
"""
Parse the response page
"""
        # Skip non-200 responses (not every article id in the range exists)
if response.status != 200:
return

data = json.loads(response.text)
data = data['content']

        # Remove <script>, <sup>, <math> and <style> tags along with their content
paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
# Remove the rest of the tags without removing the content
paragraph = remove_tags(paragraph)

# Replace &amp; with &
paragraph = paragraph.replace('&amp;', '&')
# Replace &#39; with '
paragraph = paragraph.replace('&#39;', "'")
paragraph = paragraph.replace('&rsquo;', "'")
paragraph = paragraph.replace('&ldquo;', "'")
paragraph = paragraph.replace('&rdquo;', "'")
        # Replace any remaining 5- and 4-character entities (e.g. &nbsp;) with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in ['\n', '\r', '\t']:
paragraph = paragraph.replace(c, ' ')
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

        filename = 'qplum_data.txt'
        with open(filename, 'a') as f:
            f.write(paragraph.lower() + '\n')
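
As an aside, Python 3's html.unescape resolves all named and numeric HTML entities in one call, which would subsume the manual &amp;/&#39;/&rsquo; replacements and the two catch-all regexes above; a minimal sketch:

from html import unescape

text = unescape('AT&amp;T says it&#39;s a trade&nbsp;off')
print(text)
# -> "AT&T says it's a trade off" (the &nbsp; becomes a non-breaking space, U+00A0)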
106 changes: 106 additions & 0 deletions scrapy-spiders/wikipedia.py
@@ -0,0 +1,106 @@
import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class WikipediaSpider(CrawlSpider):
name = 'wikipedia'
start_urls = ['https://en.wikipedia.org/wiki/Outline_of_finance']

def parse(self, response):
"""
Parse the response page
"""
url = response.url

if url in WikipediaSpider.start_urls:
return self._parse_topic_list(response)

else:
self.parse_topic_response(response)
return self._parse_links(response)

def parse_topic_response(self, response):
"""
Parse the content
"""

# Get the title first
title = response.css('title::text').extract_first()

        # Replace / with a space - slashes cause problems if the
        # title is later used in a file name
title = title.replace('/', ' ')

content = response.css('div#mw-content-text')

        # Extract all the <p> descendants of this div ('.//p' keeps the
        # search relative to `content`; '//p' would match the whole page)
        text = ''
        for child in content.xpath('.//p'):

# Get the text from this child <p></p> tag
paragraph = child.extract()

# Remove <script>, <sup>, <math> tags with the content
paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
# Remove the rest of the tags without removing the content
paragraph = remove_tags(paragraph)

# Replace '&amp;' with '&'
paragraph = paragraph.replace('&amp;', '&')

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

# Add to the file
text += paragraph.lower() + '\n'

        filename = 'wiki_data.txt'
        with open(filename, 'a') as f:
            f.write(text)

def _parse_links(self, response):
"""
Parses the links from the first level of pages
"""
content = response.css('div#mw-content-text')

        for child in content.xpath('.//p'):
# Extract the URLs
urls = child.css('a::attr(href)').extract()

for url in urls:
if url is None or 'wiki' not in url:
continue

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse_topic_response)

def _parse_topic_list(self, response):
"""
Parse various topics from the list of topics
"""

        # All of the links on this page are in bullet points,
        # so extract the 'ul' tags to get the lists
content = response.css('div#mw-content-text')
lists = content.css('ul')

# Iterate through each list
for ul in lists:

# Iterate through each list item
            for item in ul.css('li'):
                # Extract the URL
                url = item.css('a::attr(href)').extract_first()

# Skip external links as well as the links to the same page (e.g. TOC)
if url is None or 'wiki' not in url:
continue

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse)
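
The hand-rolled link loop in _parse_links could also be expressed with Scrapy's built-in LinkExtractor. A sketch, under the assumption that only main-namespace /wiki/ pages (no File:, Category:, etc.) should be followed:

from scrapy.linkextractors import LinkExtractor

link_extractor = LinkExtractor(
    allow=r'/wiki/[^:]+$',                 # colons mark non-article namespaces
    restrict_css='div#mw-content-text p',  # only links inside article paragraphs
)

# Hypothetical drop-in replacement for WikipediaSpider._parse_links
def _parse_links(self, response):
    for link in link_extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_topic_response)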

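For reference, each of these spiders is self-contained, so it can be run without a full Scrapy project via the runspider command, e.g.

scrapy runspider scrapy-spiders/investopedia.py

The extracted text then accumulates in the corresponding *_data.txt file in the working directory.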