Commit 94027be: add scrapy
shaform committed on Feb 27, 2016
1 parent cbb467e
Showing 9 changed files with 4,445 additions and 0 deletions.
4,223 changes: 4,223 additions & 0 deletions scrapy/PTT Analysis.ipynb

Large diffs are not rendered by default.

Empty file added scrapy/ptt/__init__.py
Empty file.
16 changes: 16 additions & 0 deletions scrapy/ptt/items.py
@@ -0,0 +1,16 @@
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PostItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
    comments = scrapy.Field()
    score = scrapy.Field()
    url = scrapy.Field()
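
PostItem is a plain Scrapy Item: each scrapy.Field() declares a dict-style slot with no type or validation attached. A minimal sketch of how one gets filled (hypothetical values; the real population happens in parse_post further down):

# Sketch only: a scrapy.Item behaves like a dict with a fixed key set.
from ptt.items import PostItem

item = PostItem()
item['title'] = '[example] a post title'   # hypothetical values
item['score'] = 2
item['comments'] = [{'user': 'someone', 'content': ': nice', 'score': 1}]

# Assigning to an undeclared field raises KeyError, which catches typos early:
# item['titel'] = '...'  # KeyError: 'PostItem does not support field: titel'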
9 changes: 9 additions & 0 deletions scrapy/ptt/pipelines.py
@@ -0,0 +1,9 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class PTTPipeline(object):
    def process_item(self, item, spider):
        return item
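
process_item here is a pass-through, and nothing registers the pipeline, so it never actually runs. A hedged sketch of what a working version could look like (DropEmptyPostsPipeline is a hypothetical name; DropItem is Scrapy's standard way to discard an item):

from scrapy.exceptions import DropItem

class DropEmptyPostsPipeline(object):  # hypothetical example, not in the commit
    def process_item(self, item, spider):
        # Discard posts whose body failed to extract.
        if not item.get('content'):
            raise DropItem('empty content: {}'.format(item.get('url')))
        return item

Activation would go in settings.py, as the header comment says; the number sets execution order across pipelines (lower runs first):

ITEM_PIPELINES = {
    'ptt.pipelines.PTTPipeline': 300,
}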
87 changes: 87 additions & 0 deletions scrapy/ptt/settings.py
@@ -0,0 +1,87 @@
# Scrapy settings for ptt project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'ptt'

SPIDER_MODULES = ['ptt.spiders']
NEWSPIDER_MODULE = 'ptt.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'ptt (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1.25
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'ptt.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'ptt.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'ptt.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
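
Only two settings deviate from the generated defaults: ROBOTSTXT_OBEY = True and a fixed DOWNLOAD_DELAY of 1.25 seconds between requests to ptt.cc. If the fixed delay ever proved too blunt, the commented AutoThrottle block above could be enabled instead; a hedged sketch (the values are guesses, not tuned):

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.25        # begin at the same pace as the fixed delay
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for roughly one request in flight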
4 changes: 4 additions & 0 deletions scrapy/ptt/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
91 changes: 91 additions & 0 deletions scrapy/ptt/spiders/ptt.py
@@ -0,0 +1,91 @@
import logging

from datetime import datetime

import scrapy

from scrapy.http import FormRequest

from ptt.items import PostItem


class PTTSpider(scrapy.Spider):
    name = 'ptt'
    allowed_domains = ['ptt.cc']
    start_urls = ('https://www.ptt.cc/bbs/Gossiping/index.html', )

    _retries = 0
    MAX_RETRY = 1

    _pages = 0
    MAX_PAGES = 2

    def parse(self, response):
        # The Gossiping board sits behind an age-verification wall; the
        # "over18" form must be submitted before any listing is served.
        if len(response.xpath('//div[@class="over18-notice"]')) > 0:
            if self._retries < PTTSpider.MAX_RETRY:
                self._retries += 1
                logging.warning('retry {} times...'.format(self._retries))
                yield FormRequest.from_response(response,
                                                formdata={'yes': 'yes'},
                                                callback=self.parse)
            else:
                logging.warning('you cannot pass')

        else:
            self._pages += 1
            for href in response.css('.r-ent > div.title > a::attr(href)'):
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, callback=self.parse_post)

            if self._pages < PTTSpider.MAX_PAGES:
                # "上頁" is the "previous page" link; PTT index pages run
                # newest-first, so paging backwards walks into older posts.
                next_page = response.xpath(
                    '//div[@id="action-bar-container"]//a[contains(text(), "上頁")]/@href')
                if next_page:
                    url = response.urljoin(next_page[0].extract())
                    logging.warning('follow {}'.format(url))
                    yield scrapy.Request(url, self.parse)
                else:
                    logging.warning('no next page')
            else:
                logging.warning('max pages reached')

    def parse_post(self, response):
        item = PostItem()
        item['title'] = response.xpath(
            '//meta[@property="og:title"]/@content')[0].extract()
        # The author line reads "username (nickname)"; keep the username only.
        item['author'] = response.xpath(
            '//div[@class="article-metaline"]/span[text()="作者"]'
            '/following-sibling::span[1]/text()')[0].extract().split(' ')[0]
        datetime_str = response.xpath(
            '//div[@class="article-metaline"]/span[text()="時間"]'
            '/following-sibling::span[1]/text()')[0].extract()
        item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

        item['content'] = response.xpath(
            '//div[@id="main-content"]/text()')[0].extract()

        comments = []
        total_score = 0
        for comment in response.xpath('//div[@class="push"]'):
            push_tag = comment.css('span.push-tag::text')[0].extract()
            push_user = comment.css('span.push-userid::text')[0].extract()
            push_content = comment.css('span.push-content::text')[0].extract()

            # "推" is an upvote, "噓" a downvote; anything else (the neutral
            # "→" arrow) scores zero.
            if '推' in push_tag:
                score = 1
            elif '噓' in push_tag:
                score = -1
            else:
                score = 0

            total_score += score

            comments.append({'user': push_user,
                             'content': push_content,
                             'score': score})

        item['comments'] = comments
        item['score'] = total_score
        item['url'] = response.url

        yield item
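
The spider caps itself at MAX_PAGES index pages and one retry through the over-18 wall, so a full run stays small. A minimal driver sketch for running it outside the scrapy CLI, from the project root (not part of the commit; FEED_URI and FEED_FORMAT are the Scrapy 1.x feed-export settings, and 'posts.json' is a hypothetical output path):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()       # picks up ptt/settings.py via scrapy.cfg
settings.set('FEED_URI', 'posts.json')  # hypothetical output location
settings.set('FEED_FORMAT', 'json')
process = CrawlerProcess(settings)
process.crawl('ptt')                    # spider name from PTTSpider.name
process.start()                         # blocks until the crawl finishes

The equivalent CLI invocation would be "scrapy crawl ptt -o posts.json" from the project root.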
4 changes: 4 additions & 0 deletions scrapy/requirments.txt
@@ -0,0 +1,4 @@
Scrapy>=1.1.0rc1
jieba>=0.38
seaborn>=0.7.0
scikit-learn>=0.16.1
11 changes: 11 additions & 0 deletions scrapy/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = ptt.settings

[deploy]
#url = http://localhost:6800/
project = ptt
