Commit 94027be: add scrapy
shaform committed on Feb 27, 2016
1 parent cbb467e
Showing 9 changed files with 4,445 additions and 0 deletions.
4,223 changes: 4,223 additions & 0 deletions scrapy/PTT Analysis.ipynb

Large diffs are not rendered by default.

Empty file added scrapy/ptt/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions scrapy/ptt/items.py
@@ -0,0 +1,16 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PostItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
    comments = scrapy.Field()
    score = scrapy.Field()
    url = scrapy.Field()
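
PostItem is a plain Scrapy Item: each scrapy.Field() declares a dict-style slot with no type or validation attached. A minimal sketch of how one gets filled (hypothetical values; the real population happens in parse_post further down):

# Sketch only: a scrapy.Item behaves like a dict with a fixed key set.
from ptt.items import PostItem

item = PostItem()
item['title'] = '[example] a post title'   # hypothetical values
item['score'] = 2
item['comments'] = [{'user': 'someone', 'content': ': nice', 'score': 1}]

# Assigning to an undeclared field raises KeyError, which catches typos early:
# item['titel'] = '...'  # KeyError: 'PostItem does not support field: titel'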
9 changes: 9 additions & 0 deletions scrapy/ptt/pipelines.py
@@ -0,0 +1,9 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class PTTPipeline(object):
    def process_item(self, item, spider):
        return item
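
process_item here is a pass-through, and nothing registers the pipeline, so it never actually runs. A hedged sketch of what a working version could look like (DropEmptyPostsPipeline is a hypothetical name; DropItem is Scrapy's standard way to discard an item):

from scrapy.exceptions import DropItem

class DropEmptyPostsPipeline(object):  # hypothetical example, not in the commit
    def process_item(self, item, spider):
        # Discard posts whose body failed to extract.
        if not item.get('content'):
            raise DropItem('empty content: {}'.format(item.get('url')))
        return item

Activation would go in settings.py, as the header comment says; the number sets execution order across pipelines (lower runs first):

ITEM_PIPELINES = {
    'ptt.pipelines.PTTPipeline': 300,
}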
87 changes: 87 additions & 0 deletions scrapy/ptt/settings.py
@@ -0,0 +1,87 @@
# Scrapy settings for ptt project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ptt'

SPIDER_MODULES = ['ptt.spiders']
NEWSPIDER_MODULE = 'ptt.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ptt (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.25
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ptt.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ptt.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'ptt.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
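
Only two settings deviate from the generated defaults: ROBOTSTXT_OBEY = True and a fixed DOWNLOAD_DELAY of 1.25 seconds between requests to ptt.cc. If the fixed delay ever proved too blunt, the commented AutoThrottle block above could be enabled instead; a hedged sketch (the values are guesses, not tuned):

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.25        # begin at the same pace as the fixed delay
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for roughly one request in flight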
4 changes: 4 additions & 0 deletions scrapy/ptt/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
91 changes: 91 additions & 0 deletions scrapy/ptt/spiders/ptt.py
@@ -0,0 +1,91 @@
import logging

from datetime import datetime

import scrapy

from scrapy.http import FormRequest

from ptt.items import PostItem


class PTTSpider(scrapy.Spider):
    name = 'ptt'
    allowed_domains = ['ptt.cc']
    start_urls = ('https://www.ptt.cc/bbs/Gossiping/index.html', )

    _retries = 0
    MAX_RETRY = 1

    _pages = 0
    MAX_PAGES = 2

    def parse(self, response):
        # The Gossiping board sits behind an age-verification wall; the
        # "over18" form must be submitted before any listing is served.
        if len(response.xpath('//div[@class="over18-notice"]')) > 0:
            if self._retries < PTTSpider.MAX_RETRY:
                self._retries += 1
                logging.warning('retry {} times...'.format(self._retries))
                yield FormRequest.from_response(response,
                                                formdata={'yes': 'yes'},
                                                callback=self.parse)
            else:
                logging.warning('you cannot pass')

        else:
            self._pages += 1
            for href in response.css('.r-ent > div.title > a::attr(href)'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, callback=self.parse_post)

            if self._pages < PTTSpider.MAX_PAGES:
                # "上頁" is the "previous page" link; PTT index pages run
                # newest-first, so paging backwards walks into older posts.
                next_page = response.xpath(
                    '//div[@id="action-bar-container"]//a[contains(text(), "上頁")]/@href')
                if next_page:
                    url = response.urljoin(next_page[0].extract())
                    logging.warning('follow {}'.format(url))
                    yield scrapy.Request(url, self.parse)
                else:
                    logging.warning('no next page')
            else:
                logging.warning('max pages reached')

    def parse_post(self, response):
        item = PostItem()
        item['title'] = response.xpath(
            '//meta[@property="og:title"]/@content')[0].extract()
        # The author line reads "username (nickname)"; keep the username only.
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]'
            '/following-sibling::span[1]/text()')[0].extract().split(' ')[0]
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]'
            '/following-sibling::span[1]/text()')[0].extract()
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

        item['content'] = response.xpath(
            '//div[@id="main-content"]/text()')[0].extract()

        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()

            # "推" is an upvote, "噓" a downvote; anything else (the neutral
            # "→" arrow) scores zero.
            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0

            total_score += score

            comments.append({'user': push_user,
                             'content': push_content,
                             'score': score})

        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url

        yield item
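
The spider caps itself at MAX_PAGES index pages and one retry through the over-18 wall, so a full run stays small. A minimal driver sketch for running it outside the scrapy CLI, from the project root (not part of the commit; FEED_URI and FEED_FORMAT are the Scrapy 1.x feed-export settings, and 'posts.json' is a hypothetical output path):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()       # picks up ptt/settings.py via scrapy.cfg
settings.set('FEED_URI', 'posts.json')  # hypothetical output location
settings.set('FEED_FORMAT', 'json')
process = CrawlerProcess(settings)
process.crawl('ptt')                    # spider name from PTTSpider.name
process.start()                         # blocks until the crawl finishes

The equivalent CLI invocation would be "scrapy crawl ptt -o posts.json" from the project root.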
4 changes: 4 additions & 0 deletions scrapy/requirments.txt
@@ -0,0 +1,4 @@
Scrapy>=1.1.0rc1
jieba>=0.38
seaborn>=0.7.0
scikit-learn>=0.16.1
11 changes: 11 additions & 0 deletions scrapy/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ptt.settings

[deploy]
#url = http://localhost:6800/
project = ptt
