Skip to content

Commit

Permalink
get more text
Browse files: browse the repository at this point in the history
  • Loading branch information
shaform committed Aug 27, 2016
1 parent 4a2c3fc commit 457e0bd
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 2 deletions.
6 changes: 4 additions & 2 deletions scrapy/ptt/spiders/ptt.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from datetime import datetime

import html2text
import scrapy

from scrapy.http import FormRequest
Expand Down Expand Up @@ -61,8 +62,9 @@ def parse_post(self, response):
0].extract()
item['date'] = datetime.strptime(datetime_str, '%a %b %d %H:%M:%S %Y')

item['content'] = response.xpath('//div[@id="main-content"]/text()')[
0].extract()
converter = html2text.HTML2Text()
converter.ignore_links = True
item['content'] = converter.handle(response.xpath('//div[@id="main-content"]')[ 0].extract())

comments = []
total_score = 0
Expand Down
1 change: 1 addition & 0 deletions scrapy/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ scipy>=0.17.0
scikit-learn>=0.16.1
cryptography>=1.2.2
pyOpenSSL>=0.15.1
html2text>=2016.5.29

0 comments on commit 457e0bd

Please sign in to comment.