PTT.py
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
def get_requests(board):
    '''Create a requests Session and pass the over-18 confirmation.
    Args:
        board: English name of the board
    Return:
        s: requests Session
        res: result of the GET request
    '''
    s = requests.Session()
    # Board index page
    url = "https://www.ptt.cc/bbs/" + board + "/index.html"
    res = s.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    # Check whether the over-18 confirmation page is shown
    if len(soup.select(".over18-notice")) != 0:
        over18_url = "https://www.ptt.cc/ask/over18"
        payload = {
            "from": "/bbs/" + board + "/index.html",
            "yes": "yes"
        }
        s.post(over18_url, data=payload, verify=False)
        res = s.get(url, verify=False)
    # Return the requests Session and the result of s.get
    return (s, res)

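# Minimal standalone usage sketch (the board name is an arbitrary example):
# s, res = get_requests("Gossiping")
# print(res.status_code)  # 200 once the over-18 check has been passed
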
def get_posts_info(s, page_url):
    '''Get the title, push count, author id and URL of every post on an index page.
    Args:
        s: Session from the get_requests function
        page_url: URL of the index page
    Return:
        posts_info: list of [push count, title, author id, post URL, page URL]
    '''
    res = s.get(page_url)
    if res.status_code == 500:
        print("oops! 500")
        return ("stop!")
    soup = BeautifulSoup(res.text, "lxml")
    # return list
    posts_info = []
    # Collect the title, push count, author id and URL of each post
    for post in soup.select(".r-ent"):
        push = post.select(".nrec")[0].text.strip()  # push count
        try:
            title = post.select(".title")[0]
            # Post URL
            post_url = title.select("a")[0]["href"]
            post_url = "https://www.ptt.cc" + post_url
            title = title.text.strip()  # post title
            # Skip announcement posts
            if "[公告]" in title:
                continue
            author = post.select(".author")[0].text.strip()  # author id
            # Store the summary of each post
            posts_info.append([push, title, author, post_url, page_url])
        except:
            continue
    return (posts_info)

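# Shape of one posts_info entry (all values below are illustrative only):
# ["12", "[問卦] an example title", "someuser",
#  "https://www.ptt.cc/bbs/Gossiping/M.0000000000.A.000.html",
#  "https://www.ptt.cc/bbs/Gossiping/index39440.html"]
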
# def get_page(s, res, board, pages):
def get_page(s, res, board, index):
    '''Walk the index pages of a board and collect the title, push count,
    author id and URL of the posts on each page.
    Args:
        s: Session from the get_requests function
        res: response from the get_requests function
        board: English name of the board
        index: number of the index page to start crawling from
    Return:
        posts_info_all: list with the title, push count, author id and URL of the posts of every page crawled
    '''
    soup = BeautifulSoup(res.text, "lxml")
    previous = soup.select(".wide")[1]["href"]
    previous = int(previous[(previous.find("index") + 5):previous.find(".html")])
    posts_info_all = []
    # Crawl the post summaries of every index page of the board
    # for page in range(pages):
    while True:
        # URL of the current index page
        # page_url = "https://www.ptt.cc/bbs/" + board + "/index" + str(previous - page + 1) + ".html"
        page_url = "https://www.ptt.cc/bbs/" + board + "/index" + str(index) + ".html"
        print(page_url)
        # Get the push count, title, author and URL of each post on this page
        posts_info = get_posts_info(s, page_url)
        if posts_info == "stop!":
            print("stopping here")
            return (posts_info_all)
        # Store the summary of each post in posts_info_all
        posts_info_all.extend(posts_info)
        index = index + 1
    # return (posts_info_all)

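# Usage sketch (board and starting index are examples only): crawling runs
# forward from the given index page until PTT answers with HTTP 500, i.e.
# past the newest page of the board.
# s, res = get_requests("Gossiping")
# posts = get_page(s, res, "Gossiping", 39440)
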
def get_post_content(s, post_url):
    '''Get the details of a post: posting time, author IP, body text and pushes (comments).
    Args:
        s: Session from the get_requests function
        post_url: the url of a post
    '''
    print(post_url)
    res = s.get(post_url)
    soup = BeautifulSoup(res.text, "lxml")
    main_content = soup.select("#main-content")[0]
    # print (main_content)
    str_main_content = str(main_content)
    # Posting time
    article_time = main_content.select(".article-metaline")[2].select(".article-meta-value")[0].text
    # print (article_time)
    # Author IP
    ips = main_content.select(".f2")[:3]
    ip = ""
    for i in ips:
        if "發信站" in str(i):
            ip = i.text.replace("※ 發信站: 批踢踢實業坊(ptt.cc), 來自: ", "").strip()
            # Older posts keep the IP on a separate "◆ From:" line instead
            if ip == "※ 發信站: 批踢踢實業坊(ptt.cc)":
                start = str_main_content.find("◆ From: ") + 8
                end = str_main_content.find('\n', start)
                ip = str_main_content[start:end]
            break
    # Post body
    start = str_main_content.find(str(article_time)) + len(article_time) + 13
    end = str_main_content.find('--', start)
    # end = str_main_content.rfind('--')  # find the last "--", where the body ends
    content = str_main_content[start:end].strip()
    content = BeautifulSoup(content, "lxml").text  # .replace("\n", " ")
    # print (content)
    # Pushes / boos (comments)
    # push = main_content.select(".push")
    end = str_main_content.rfind('--')  # find the last "--", where the body ends
    push = str_main_content[end:]
    push = BeautifulSoup(push, "lxml").select(".push")
    # print (push)
    push_info = list()
    for p in push:
        # print (p.text)
        try:
            push_tag = p.select(".push-tag")[0].text[:-1]
        except:
            continue
        push_userid = p.select(".push-userid")[0].text
        push_content = p.select(".push-content")[0].text[1:].strip()
        push_ipdatetime = p.select(".push-ipdatetime")[0].text.strip().split()
        try:
            push_ip = push_ipdatetime[0]
            push_date = push_ipdatetime[1]
            push_time = push_ipdatetime[2]
        except:
            # Fallback when the ip/date/time fields are incomplete
            push_ip = ""
            push_date = ""
            push_time = ""
            for j in push_ipdatetime:
                if "." in j:
                    push_ip = j
                if "/" in j:
                    push_date = j
                if ":" in j:
                    push_time = j
        push_info.append([push_tag, push_userid, push_content, push_ip, push_date, push_time])
    return (article_time, ip, content, push_info)

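# Shape of the returned tuple (values are illustrative only):
#   article_time -> "Wed Jan 10 12:34:56 2018"
#   ip           -> "1.2.3.4"
#   content      -> the post body as plain text
#   push_info    -> [["推", "someuser", "a comment", "1.2.3.4", "01/10", "12:40"], ...]
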
# def run(board, pages):
def run(board, index):
    '''Crawl the details of every post on a board (author id, body, posting time, etc.).
    Args:
        board: English name of the board
        index: number of the index page to start crawling from
    '''
    # Site session
    s, res = get_requests(board)
    # Post summaries
    # posts_info_all = get_page(s, res, board, pages)
    posts_info_all = get_page(s, res, board, index)
    n = len(posts_info_all)
    all_info = {}
    for i in range(n):
        post = posts_info_all[i]
        push = post[0]
        title = post[1]
        author = post[2]
        post_url = post[3]
        page_url = post[4]
        # Post details
        article_time, ip, content, push_info = get_post_content(s, post_url)
        # Organise everything into a dictionary
        all_info[post_url] = {
            "id": i + 1,
            "title": title,
            "author": author,
            "ip": ip,
            "article_time": article_time,
            "content": content,
            "push_info": {
                "push_number": push,
                "push": push_info
            },
            # "post_url": post_url,
            "page_url": page_url
        }
        # Every 1000 posts, dump the collected posts and start a new batch
        if (i + 1) % 1000 == 0:
            with open(board + ".json", "w") as f:
                json.dump(all_info, f)
            all_info = {}
    # Dump whatever is left over
    if all_info != {}:
        with open("test.json", "w") as f:
            json.dump(all_info, f)
    print("done!")
    # return (all_info)

all_info = run("Gossiping", 39440)
# print (all_info)
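
# Sketch for reading the dumped results back (assumes a Gossiping.json file
# written by run() exists in the current working directory):
# with open("Gossiping.json") as f:
#     data = json.load(f)
# for url, info in data.items():
#     print(info["title"], info["push_info"]["push_number"])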