Adding the crawler code
hardikp committed Jun 29, 2017
1 parent a0ed5dc commit 936f4df
Showing 4 changed files with 246 additions and 0 deletions.
4 changes: 4 additions & 0 deletions scrapy-spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
85 changes: 85 additions & 0 deletions scrapy-spiders/investopedia.py
@@ -0,0 +1,85 @@
from string import ascii_lowercase

import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class InvestopediaSpider(CrawlSpider):
name = 'investopedia'
start_urls = ['http://www.investopedia.com/terms/%s/' % s for s in ascii_lowercase + '1']

def parse(self, response):
"""
Parse the response page
"""
url = response.url

        # The URL must contain 'terms' for us to proceed further
if 'terms' not in url:
return

# if the url ends with '.asp', then that's a topic page
if url.endswith('.asp'):
return self._parse_topic_response(response)

        # Otherwise, assume that this is a list page
return self._parse_topic_list(response)

def _parse_topic_response(self, response):
"""
Parses various topics
e.g. www.investopedia.com/terms/o/oddlottheory.asp
"""
# Get the title first
title = response.css('title::text').extract_first()

        # Replace / with a space - slashes cause problems if the
        # title is later used in a file name
title = title.replace('/', ' ')

# Get the first div with id Content
content = response.css('div#Content')[0]
content = content.css('div.content-box')

text = ''
        # './/p' keeps the search relative to `content`; a bare '//p'
        # would match every <p> in the whole page
        for child in content.xpath('.//p'):

# Get the text from this child <p></p> tag
paragraph = child.extract()

# Remove tags including <p> and <a>
paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

# Replace '&amp;' with '&'
paragraph = paragraph.replace('&amp;', '&')

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

# Add to the file
text += paragraph.lower() + '\n'

        # Save the extracted text (append mode, so each topic adds to the corpus)
        filename = 'investopedia_data.txt'
        with open(filename, 'a') as f:
            f.write(text)

def _parse_topic_list(self, response):
"""
Parse the page with the topics listed out
e.g. www.investopedia.com/terms/o/
"""
list_element = response.css('ol.list')

# Iterate through the list of topics
        for item in list_element.css('li'):
            # Extract the URL
            url = item.css('a::attr(href)').extract_first()

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse)
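
Note: the punctuation padding above (repeated in the other two spiders) just surrounds each mark with spaces so a plain whitespace tokenizer sees it as its own token. A minimal sketch of that step, with pad_punctuation as a hypothetical helper name, not part of this commit:

def pad_punctuation(text):
    # Surround common punctuation with spaces so a whitespace
    # tokenizer treats each mark as a separate token
    for c in '();.,[]"\'-:/%$+@?':
        text = text.replace(c, ' {} '.format(c))
    return text.lower()

print(pad_punctuation('Odd-lot theory (contrarian).'))
# -> 'odd - lot theory  ( contrarian )  . '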
51 changes: 51 additions & 0 deletions scrapy-spiders/qplum.py
@@ -0,0 +1,51 @@
import json
import re

from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class QplumSpider(CrawlSpider):
name = 'qplum'
start_urls = ['https://www.qplum.co/articles/{}.json'.format(i) for i in range(300)]

def parse(self, response):
"""
Parse the response page
"""
        # Skip non-200 responses (not every article id in the range exists)
if response.status != 200:
return

data = json.loads(response.text)
data = data['content']

        # Remove <script>, <sup>, <math> and <style> tags along with their content
paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
# Remove the rest of the tags without removing the content
paragraph = remove_tags(paragraph)

# Replace &amp; with &
paragraph = paragraph.replace('&amp;', '&')
# Replace &#39; with '
paragraph = paragraph.replace('&#39;', "'")
paragraph = paragraph.replace('&rsquo;', "'")
paragraph = paragraph.replace('&ldquo;', "'")
paragraph = paragraph.replace('&rdquo;', "'")
        # Replace any remaining 5- and 4-character entities (e.g. &nbsp;) with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in ['\n', '\r', '\t']:
paragraph = paragraph.replace(c, ' ')
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

        filename = 'qplum_data.txt'
        with open(filename, 'a') as f:
            f.write(paragraph.lower() + '\n')
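
As an aside, Python 3's html.unescape resolves all named and numeric HTML entities in one call, which would subsume the manual &amp;/&#39;/&rsquo; replacements and the two catch-all regexes above; a minimal sketch:

from html import unescape

text = unescape('AT&amp;T says it&#39;s a trade&nbsp;off')
print(text)
# -> "AT&T says it's a trade off" (the &nbsp; becomes a non-breaking space, U+00A0)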
106 changes: 106 additions & 0 deletions scrapy-spiders/wikipedia.py
@@ -0,0 +1,106 @@
import scrapy
from scrapy.spiders import CrawlSpider
from w3lib.html import remove_tags, remove_tags_with_content


class WikipediaSpider(CrawlSpider):
name = 'wikipedia'
start_urls = ['https://en.wikipedia.org/wiki/Outline_of_finance']

def parse(self, response):
"""
Parse the response page
"""
url = response.url

if url in WikipediaSpider.start_urls:
return self._parse_topic_list(response)

else:
self.parse_topic_response(response)
return self._parse_links(response)

def parse_topic_response(self, response):
"""
Parse the content
"""

# Get the title first
title = response.css('title::text').extract_first()

        # Replace / with a space - slashes cause problems if the
        # title is later used in a file name
title = title.replace('/', ' ')

content = response.css('div#mw-content-text')

        # Extract all the <p> descendants of this div ('.//p' keeps the
        # search relative to `content`; '//p' would match the whole page)
        text = ''
        for child in content.xpath('.//p'):

# Get the text from this child <p></p> tag
paragraph = child.extract()

# Remove <script>, <sup>, <math> tags with the content
paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
# Remove the rest of the tags without removing the content
paragraph = remove_tags(paragraph)

# Replace '&amp;' with '&'
paragraph = paragraph.replace('&amp;', '&')

# Replace 'U.S.' with 'US':
paragraph = paragraph.replace('U.S.', 'US')

# Some more replacements to improve the default tokenization
for c in '();.,[]"\'-:/%$+@?':
paragraph = paragraph.replace(c, ' {} '.format(c))

# Add to the file
text += paragraph.lower() + '\n'

        filename = 'wiki_data.txt'
        with open(filename, 'a') as f:
            f.write(text)

def _parse_links(self, response):
"""
Parses the links from the first level of pages
"""
content = response.css('div#mw-content-text')

        for child in content.xpath('.//p'):
# Extract the URLs
urls = child.css('a::attr(href)').extract()

for url in urls:
if url is None or 'wiki' not in url:
continue

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse_topic_response)

def _parse_topic_list(self, response):
"""
Parse various topics from the list of topics
"""

        # All of the links on this page are in bullet points,
        # so extract the 'ul' tags to get the lists
content = response.css('div#mw-content-text')
lists = content.css('ul')

# Iterate through each list
for ul in lists:

# Iterate through each list item
            for item in ul.css('li'):
                # Extract the URL
                url = item.css('a::attr(href)').extract_first()

# Skip external links as well as the links to the same page (e.g. TOC)
if url is None or 'wiki' not in url:
continue

next_page = response.urljoin(url)
yield scrapy.Request(next_page, callback=self.parse)
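
The hand-rolled link loop in _parse_links could also be expressed with Scrapy's built-in LinkExtractor. A sketch, under the assumption that only main-namespace /wiki/ pages (no File:, Category:, etc.) should be followed:

from scrapy.linkextractors import LinkExtractor

link_extractor = LinkExtractor(
    allow=r'/wiki/[^:]+$',                 # colons mark non-article namespaces
    restrict_css='div#mw-content-text p',  # only links inside article paragraphs
)

# Hypothetical drop-in replacement for WikipediaSpider._parse_links
def _parse_links(self, response):
    for link in link_extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_topic_response)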

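For reference, each of these spiders is self-contained, so it can be run without a full Scrapy project via the runspider command, e.g.

scrapy runspider scrapy-spiders/investopedia.py

The extracted text then accumulates in the corresponding *_data.txt file in the working directory.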