From 936f4df16dd9229823868d8689b9833e1d6ef21c Mon Sep 17 00:00:00 2001 From: Hardik Patel Date: Thu, 29 Jun 2017 16:02:12 -0400 Subject: [PATCH] Adding the crawler code --- scrapy-spiders/__init__.py | 4 ++ scrapy-spiders/investopedia.py | 85 ++++++++++++++++++++++++++ scrapy-spiders/qplum.py | 51 ++++++++++++++++ scrapy-spiders/wikipedia.py | 106 +++++++++++++++++++++++++++++++++ 4 files changed, 246 insertions(+) create mode 100644 scrapy-spiders/__init__.py create mode 100644 scrapy-spiders/investopedia.py create mode 100644 scrapy-spiders/qplum.py create mode 100644 scrapy-spiders/wikipedia.py diff --git a/scrapy-spiders/__init__.py b/scrapy-spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/scrapy-spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scrapy-spiders/investopedia.py b/scrapy-spiders/investopedia.py new file mode 100644 index 0000000..7c0f987 --- /dev/null +++ b/scrapy-spiders/investopedia.py @@ -0,0 +1,85 @@ +from string import ascii_lowercase + +import scrapy +from scrapy.spiders import CrawlSpider +from w3lib.html import remove_tags, remove_tags_with_content + + +class InvestopediaSpider(CrawlSpider): + name = 'investopedia' + start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1'] + + def parse(self, response): + """ + Parse the response page + """ + url = response.url + + # 'terms' has to be there in the URL to proceed further + if 'terms' not in url: + return + + # if the url ends with '.asp', then that's a topic page + if url.endswith('.asp'): + return self._parse_topic_response(response) + + # Otherwise, assume that this a list page + return self._parse_topic_list(response) + + def _parse_topic_response(self, response): + """ + Parses various topics + e.g. 
www.investopedia.com/terms/o/oddlottheory.asp + """ + # Get the title first + title = response.css('title::text').extract_first() + + # Replace / with a space - creates issues with writing to file + title = title.replace('/', ' ') + + # Get the first div with id Content + content = response.css('div#Content')[0] + content = content.css('div.content-box') + + text = '' + for child in content.xpath('//p'): + + # Get the text from this child

<p> tag + paragraph = child.extract() + + # Remove tags including <p>
and <script> + paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip() + + # Replace '&amp;' with '&' + paragraph = paragraph.replace('&amp;', '&') + + # Replace 'U.S.' with 'US': + paragraph = paragraph.replace('U.S.', 'US') + + # Some more replacements to improve the default tokenization + for c in '();.,[]"\'-:/%$+@?': + paragraph = paragraph.replace(c, ' {} '.format(c)) + + # Add to the file + text += paragraph.lower() + '\n' + + # Save the title and the text both + filename = 'investopedia_data.txt' + f = open(filename, 'a') + f.write(text) + f.close() + + def _parse_topic_list(self, response): + """ + Parse the page with the topics listed out + e.g. www.investopedia.com/terms/o/ + """ + list_element = response.css('ol.list') + + # Iterate through the list of topics + for l in list_element.css('li'): + # Extract the URL + url = l.css('a::attr(href)').extract_first() + + next_page = response.urljoin(url) + yield scrapy.Request(next_page, callback=self.parse) diff --git a/scrapy-spiders/qplum.py b/scrapy-spiders/qplum.py new file mode 100644 index 0000000..aa9272e --- /dev/null +++ b/scrapy-spiders/qplum.py @@ -0,0 +1,51 @@ +import json +import re + +from scrapy.spiders import CrawlSpider +from w3lib.html import remove_tags, remove_tags_with_content + + +class QplumSpider(CrawlSpider): + name = 'qplum' + start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)] + + def parse(self, response): + """ + Parse the response page + """ + # Skip error URLs + if response.status != 200: + return + + data = json.loads(response.text) + data = data['content'] + + # Remove