#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Notification scraper for the ECUST (East China University of Science and
Technology) news sites. Automatically fetches today's notices and emails
them to the configured recipients.
"""
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import json
import datetime
import logging
import re
import time

# Configure logging to both a file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('news_scraper.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
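
# A minimal sketch of the two JSON files this script expects. The key names are
# taken from how the code below reads them; the concrete values are illustrative
# assumptions, not project defaults.
#
# config.json:
#   {
#     "days": 0,
#     "proxy": {
#       "enabled": false, "url": "http://proxy.example.com:8080",
#       "username": "", "password": "", "always_use": false
#     },
#     "smtp": {
#       "server": "smtp.example.com", "port": 465,
#       "username": "bot@example.com", "password": "secret",
#       "sender_email": "bot@example.com"
#     },
#     "error_notification": {"enabled": false, "recipients": ["admin@example.com"]}
#   }
#
# emails.json ("categories" may contain "news", "student", "jwc" and/or
# "gschool"; an empty list means the user receives everything):
#   [
#     {"name": "Alice", "email": "alice@example.com", "categories": ["jwc", "news"]}
#   ]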


class NewsScraperECUST:
    def __init__(self, config_file='config.json', emails_file='emails.json'):
        """Initialize the scraper."""
        self.config_file = config_file
        self.emails_file = emails_file
        self.config = self.load_config()
        self.emails = self.load_emails()
        # University news site
        self.base_url = "https://news.ecust.edu.cn"
        self.news_url = "https://news.ecust.edu.cn/16/list.htm"
        # Student Affairs Office site
        self.student_base_url = "https://student.ecust.edu.cn"
        self.student_news_url = "https://student.ecust.edu.cn/1048/list.htm"
        # Academic Affairs Office (jwc) site
        self.jwc_base_url = "https://jwc.ecust.edu.cn"
        self.jwc_news_url = "https://jwc.ecust.edu.cn/main.htm"
        # Graduate School site
        self.gschool_base_url = "https://gschool.ecust.edu.cn"
        self.gschool_news_url = "https://gschool.ecust.edu.cn/12753/list.htm"
        # Proxy state
        self.proxies = None
        self.original_proxies = None  # the proxy configuration as loaded
        self.using_proxy_after_failure = False  # set when the proxy is enabled after a failure
        # Proxy setup (only processed when a config file exists)
        if self.config is not None and 'proxy' in self.config and self.config['proxy'].get('enabled', False):
            proxy_config = self.config['proxy']
            proxy_url = proxy_config.get('url', '')
            proxy_username = proxy_config.get('username', '')
            proxy_password = proxy_config.get('password', '')
            if proxy_url:
                # If a username and password are configured, embed them in the proxy URL
                if proxy_username and proxy_password:
                    from urllib.parse import urlparse, urlunparse
                    parsed = urlparse(proxy_url)
                    netloc = f"{proxy_username}:{proxy_password}@{parsed.netloc}"
                    proxy_url = urlunparse((parsed.scheme, netloc, parsed.path,
                                            parsed.params, parsed.query, parsed.fragment))
                self.original_proxies = {
                    'http': proxy_url,
                    'https': proxy_url
                }
                logging.info(f"已配置代理: {proxy_url.replace(proxy_password, '****') if proxy_password else proxy_url}")
                # If the proxy is not set to always_use, start without it and enable it on failure
                if not self.config['proxy'].get('always_use', False):
                    logging.info("代理配置为按需使用(失败时启用)")
                    self.proxies = None
                else:
                    logging.info("代理配置为始终使用")
                    self.proxies = self.original_proxies

    def load_config(self):
        """Load the configuration file."""
        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            logging.error(f"配置文件 {self.config_file} 不存在")
            return None

    def load_emails(self):
        """Load the recipient list."""
        try:
            with open(self.emails_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            logging.error(f"邮箱文件 {self.emails_file} 不存在")
            return None

    def get_news_list(self):
        """Fetch the news list from the university news site."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = self.request_with_retry(self.news_url, headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = []
            # Locate the news list container
            news_list = soup.find('ul', class_='news_list list2')
            if not news_list:
                logging.warning("未找到新闻列表容器")
                return news_items
            # Extract each news item
            items = news_list.find_all('li', class_='news')
            for item in items:
                try:
                    # Extract the title and link
                    title_span = item.find('span', class_='news_title')
                    if not title_span:
                        continue
                    title_link = title_span.find('a')
                    if not title_link:
                        continue
                    title = title_link.get('title')
                    link = title_link.get('href')
                    # Resolve relative links
                    if link and link.startswith('/'):
                        link = self.base_url + link
                    elif link and not link.startswith('http'):
                        link = self.base_url + '/' + link
                    # Extract the date from the split day / year-month markup
                    news_meta = item.find('span', class_='news_meta')
                    if not news_meta:
                        continue
                    meta_day = news_meta.find('span', class_='meta_day')
                    meta_year = news_meta.find('span', class_='meta_year')
                    if not meta_day or not meta_year:
                        continue
                    day = meta_day.get_text(strip=True)
                    year_month = meta_year.get_text(strip=True)  # formatted like "2025.08"
                    try:
                        year, month = year_month.split('.')
                        news_date = datetime.datetime(int(year), int(month), int(day)).date()
                    except (ValueError, AttributeError):
                        logging.warning(f"日期解析失败: {day} {year_month}")
                        continue
                    news_items.append({
                        'title': title,
                        'link': link,
                        'date': news_date,
                        'source': '学校新闻网'
                    })
                except Exception as e:
                    logging.warning(f"解析新闻项时出错: {e}")
                    continue
            return news_items
        except Exception as e:
            logging.error(f"获取学校新闻列表失败: {e}")
            return []

    def get_student_news_list(self):
        """Fetch the news list from the Student Affairs Office site."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = self.request_with_retry(self.student_news_url, headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = []
            # Walk down to the expected news list container
            col_news_con = soup.find('div', class_='col_news_con')
            if col_news_con:
                col_news_list = col_news_con.find('div', class_='col_news_list listcon')
                if col_news_list:
                    wp_news = col_news_list.find('div', id='wp_news_w6')
                    if wp_news:
                        news_list = wp_news.find('ul', class_='news_list list2')
                        if news_list:
                            # Found the expected news list container
                            news_lists = [news_list]
                        else:
                            logging.warning("未找到学生处新闻列表容器 (news_list list2)")
                            return news_items
                    else:
                        logging.warning("未找到学生处新闻列表容器 (wp_news_w6)")
                        return news_items
                else:
                    logging.warning("未找到学生处新闻列表容器 (col_news_list)")
                    return news_items
            else:
                logging.warning("未找到学生处新闻列表容器 (col_news_con)")
                return news_items
            # Iterate over every news list found
            for news_list in news_lists:
                items = news_list.find_all('li')
                for item in items:
                    try:
                        # Extract the title link, trying the known markup variants
                        title_link = None
                        # Variant 1: span.news_title > a
                        title_span = item.find('span', class_='news_title')
                        if title_span:
                            title_link = title_span.find('a')
                        # Variant 2: a direct <a> tag
                        if not title_link:
                            title_link = item.find('a')
                        if not title_link:
                            continue
                        title = title_link.get('title')
                        if not title:
                            title = title_link.get_text(strip=True)
                        link = title_link.get('href')
                        if not link:
                            continue
                        # Resolve relative links
                        if link.startswith('/'):
                            link = self.student_base_url + link
                        elif not link.startswith('http'):
                            link = self.student_base_url + '/' + link
                        # Read the date directly from the news_meta tag
                        news_date = None
                        news_meta = item.find('span', class_='news_meta')
                        if news_meta:
                            date_text = news_meta.get_text(strip=True)
                            # Dates are formatted like "2023-11-01"
                            try:
                                year, month, day = date_text.split('-')
                                news_date = datetime.datetime(int(year), int(month), int(day)).date()
                            except (ValueError, AttributeError):
                                logging.debug(f"日期解析失败: {date_text}")
                        # Fall back to extracting the date from the link
                        if not news_date and link:
                            # Links look like /2023/1101/c1048a162142/page.htm
                            date_match = re.search(r'/(\d{4})/(\d{2})(\d{2})/', link)
                            if date_match:
                                try:
                                    year = int(date_match.group(1))
                                    month = int(date_match.group(2))
                                    day = int(date_match.group(3))
                                    news_date = datetime.datetime(year, month, day).date()
                                except (ValueError, IndexError):
                                    logging.debug(f"从链接提取日期失败: {link}")
                        # As a last resort, use today's date
                        if not news_date:
                            news_date = datetime.date.today()
                        news_items.append({
                            'title': title,
                            'link': link,
                            'date': news_date,
                            'source': '学生处'
                        })
                    except Exception as e:
                        logging.warning(f"解析学生处新闻项时出错: {e}")
                        continue
            return news_items
        except Exception as e:
            logging.error(f"获取学生处新闻列表失败: {e}")
            return []

    def get_jwc_news_list(self):
        """Fetch the news list from the Academic Affairs Office (jwc) site."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = self.request_with_retry(self.jwc_news_url, headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = []
            # Scan every news cell with class="pan7"
            for td in soup.find_all('td', class_='pan7'):
                link_tag = td.find('a')
                if not link_tag:
                    continue
                title = link_tag.get('title') or link_tag.get_text(strip=True)
                href = link_tag.get('href')
                if not href:
                    continue
                if href.startswith('/'):
                    full_link = self.jwc_base_url + href
                elif 'news.ecust.edu.cn' in href:
                    # Links into the university news site are already covered by get_news_list()
                    continue
                else:
                    full_link = self.jwc_base_url + '/' + href
                # Read the date from the second cell of the nested table row
                tr = td.find('tr')
                news_date = None
                if tr:
                    tds_inner = tr.find_all('td')
                    if len(tds_inner) >= 2:
                        date_text = tds_inner[1].get_text(strip=True)
                        try:
                            year, month, day = map(int, date_text.split('-'))
                            news_date = datetime.date(year, month, day)
                        except Exception:
                            pass
                if not news_date:
                    # Fall back to extracting the date from the link
                    news_date = self._extract_date_from_link(href)
                news_items.append({
                    'title': title,
                    'link': full_link,
                    'date': news_date,
                    'source': '教务处'
                })
            return news_items
        except Exception as e:
            logging.error(f"获取教务处新闻列表失败: {e}")
            return []

    def get_gschool_news_list(self):
        """Fetch the news list from the Graduate School site."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        try:
            response = self.request_with_retry(self.gschool_news_url, headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            news_items = []
            # Locate the news list container
            news_list = soup.find('ul', class_='news_list list2')
            if not news_list:
                logging.warning("未找到研究生院新闻列表容器")
                return news_items
            # Extract each news item
            items = news_list.find_all('li')
            for item in items:
                try:
                    # Extract the title and link
                    title_span = item.find('span', class_='news_title')
                    if not title_span:
                        continue
                    title_link = title_span.find('a')
                    if not title_link:
                        continue
                    title = title_link.get('title')
                    if not title:
                        title = title_link.get_text(strip=True)
                    link = title_link.get('href')
                    if not link:
                        continue
                    # Resolve relative links
                    if link.startswith('/'):
                        link = self.gschool_base_url + link
                    elif not link.startswith('http'):
                        link = self.gschool_base_url + '/' + link
                    # Extract the date
                    news_date = None
                    news_meta = item.find('span', class_='news_meta')
                    if news_meta:
                        date_text = news_meta.get_text(strip=True)
                        # Dates are formatted like "2023-11-01"
                        try:
                            year, month, day = date_text.split('-')
                            news_date = datetime.datetime(int(year), int(month), int(day)).date()
                        except (ValueError, AttributeError):
                            logging.debug(f"研究生院日期解析失败: {date_text}")
                    # Fall back to extracting the date from the link
                    if not news_date and link:
                        # Links look like /2023/1101/c1048a162142/page.htm
                        date_match = re.search(r'/(\d{4})/(\d{2})(\d{2})/', link)
                        if date_match:
                            try:
                                year = int(date_match.group(1))
                                month = int(date_match.group(2))
                                day = int(date_match.group(3))
                                news_date = datetime.datetime(year, month, day).date()
                            except (ValueError, IndexError):
                                logging.debug(f"从链接提取研究生院日期失败: {link}")
                    # As a last resort, use today's date
                    if not news_date:
                        news_date = datetime.date.today()
                    news_items.append({
                        'title': title,
                        'link': link,
                        'date': news_date,
                        'source': '研究生院'
                    })
                except Exception as e:
                    logging.warning(f"解析研究生院新闻项时出错: {e}")
                    continue
            return news_items
        except Exception as e:
            logging.error(f"获取研究生院新闻列表失败: {e}")
            return []

    def _extract_date_from_link(self, href):
        """Extract a date from a link shaped like /2025/0724/c3938a181097/page.htm."""
        date_match = href.split('/')
        if len(date_match) >= 3:
            try:
                year = int(date_match[1])
                month_day = date_match[2]
                month = int(month_day[:2])
                day = int(month_day[2:4])
                return datetime.datetime(year, month, day).date()
            except (ValueError, IndexError):
                # If the date cannot be parsed from the link, fall through to today's date
                logging.warning(f"无法从链接中提取日期: {href}")
        return datetime.date.today()

    def filter_recent_news(self, news_items):
        """Keep only news published within the configured number of days."""
        days = self.config.get('days', 0)
        today = datetime.date.today()
        cutoff_date = today - datetime.timedelta(days)
        recent_news = []
        for item in news_items:
            if item['date'] >= cutoff_date:
                recent_news.append(item)
        # Sort by date, newest first
        recent_news.sort(key=lambda x: x['date'], reverse=True)
        return recent_news
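
    # Illustrative example (values assumed): with "days": 2 in config.json and
    # today being 2025-08-20, cutoff_date is 2025-08-18, so items dated
    # 2025-08-18 or later are kept; the default "days": 0 keeps only today's items.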

    def filter_news_by_category(self, news_items, categories):
        """Filter news by the subscribed categories."""
        if not categories:  # No categories specified: return everything
            return news_items
        filtered_news = []
        for item in news_items:
            source = item.get('source', '').lower()
            # Map each category key to its source name
            if ('jwc' in categories and source == '教务处') or \
               ('news' in categories and source == '学校新闻网') or \
               ('student' in categories and source == '学生处') or \
               ('gschool' in categories and source == '研究生院'):
                filtered_news.append(item)
        return filtered_news

    def generate_email_content(self, news_items):
        """Build the HTML body of the notification email."""
        if not news_items:
            return "今日暂无新通知。"
        current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        html_content = f"""
        <html>
        <head>
            <meta charset="utf-8">
            <style>
                body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
                .header {{ background-color: #f4f4f4; padding: 20px; text-align: center; }}
                .news-item {{ margin: 15px 0; padding: 15px; border-left: 4px solid #007bff; background-color: #f9f9f9; }}
                .news-title {{ font-size: 16px; font-weight: bold; margin-bottom: 5px; }}
                .news-date {{ color: #666; font-size: 14px; margin-bottom: 10px; }}
                .news-link {{ color: #007bff; text-decoration: none; }}
                .footer {{ margin-top: 30px; padding: 20px; background-color: #f4f4f4; text-align: center; font-size: 12px; color: #666; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h2>华东理工大学今日通知</h2>
                <p>自动抓取时间: {current_time}</p>
            </div>
        """
        for item in news_items:
            source = item.get('source', '学校新闻网')  # default source: the university news site
            html_content += f"""
            <div class="news-item">
                <div class="news-title">{item['title']}</div>
                <div class="news-date">发布日期: {item['date']} | 来源: {source}</div>
                <div><a href="{item['link']}" class="news-link">查看详情</a></div>
            </div>
            """
        html_content += """
            <div class="footer">
                <p>此邮件由自动化脚本发送,请勿回复。</p>
                <p>如需停止接收,请访问<a href="https://news.bestzyq.cn/" target="_blank">ECUSTNEWS NOTIFICATION PROJECT网站</a>。</p>
            </div>
        </body>
        </html>
        """
        return html_content

    def send_error_notification(self, error_message):
        """Send an error notification email to the configured administrators."""
        if not self.config.get('error_notification', {}).get('enabled', False):
            return False
        recipients = self.config['error_notification'].get('recipients', [])
        if not recipients:
            logging.warning("错误通知已启用但未配置收件人")
            return False
        if not self.config.get('smtp'):
            logging.error("SMTP配置不存在,无法发送错误通知")
            return False
        smtp_config = self.config['smtp']
        try:
            server = smtplib.SMTP_SSL(smtp_config['server'], smtp_config['port'])
            server.login(smtp_config['username'], smtp_config['password'])
            current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            subject = f"华东理工大学新闻抓取错误通知 - {current_time}"
            html_content = f"""
            <html>
            <head>
                <meta charset="utf-8">
                <style>
                    body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
                    .header {{ background-color: #fff3cd; padding: 20px; text-align: center; border: 1px solid #ffeaa7; }}
                    .error-content {{ margin: 20px; padding: 20px; background-color: #f8d7da; border: 1px solid #f5c6cb; }}
                    .footer {{ margin-top: 30px; padding: 20px; background-color: #f4f4f4; text-align: center; font-size: 12px; color: #666; }}
                </style>
            </head>
            <body>
                <div class="header">
                    <h2>⚠️ 华东理工大学新闻抓取错误通知</h2>
                    <p>发生时间: {current_time}</p>
                </div>
                <div class="error-content">
                    <h3>错误详情:</h3>
                    <pre>{error_message}</pre>
                    <h3>代理使用情况:</h3>
                    <p>代理配置: {'已配置' if self.original_proxies else '未配置'}</p>
                    <p>初始模式: {'始终使用代理' if self.config and 'proxy' in self.config and self.config['proxy'].get('always_use', False) else '失败时使用代理'}</p>
                    <p>当前状态: {'使用代理' if self.proxies else '无代理'}</p>
                    <p>失败后启用: {'是' if self.using_proxy_after_failure else '否'}</p>
                </div>
                <div class="footer">
                    <p>此邮件由自动化脚本发送,请勿回复。</p>
                </div>
            </body>
            </html>
            """
            for recipient in recipients:
                try:
                    msg = MIMEMultipart('alternative')
                    msg['From'] = f"{smtp_config['sender_email']}"
                    msg['To'] = recipient
                    msg['Subject'] = subject
                    html_part = MIMEText(html_content, 'html', 'utf-8')
                    msg.attach(html_part)
                    server.send_message(msg)
                    logging.info(f"错误通知邮件发送成功: {recipient}")
                except Exception as e:
                    logging.error(f"发送错误通知到 {recipient} 失败: {e}")
            server.quit()
            return True
        except Exception as e:
            logging.error(f"发送错误通知邮件失败: {e}")
            return False

    def send_email(self, content, news_count, news_items):
        """Send notification emails to every subscriber.

        `content` and `news_count` are currently unused: the body is generated
        per user from `news_items` according to each subscriber's categories.
        """
        if not self.config.get('smtp'):
            logging.error("SMTP配置不存在")
            return False
        smtp_config = self.config['smtp']
        try:
            # Connect to the SMTP server
            server = smtplib.SMTP_SSL(smtp_config['server'], smtp_config['port'])
            server.login(smtp_config['username'], smtp_config['password'])
            # Send to each recipient according to their subscribed categories
            success_count = 0
            for email_info in self.emails:
                try:
                    # Get the categories this user subscribes to
                    user_categories = email_info.get('categories', [])
                    # A user with no categories receives all news
                    if not user_categories:
                        filtered_news = news_items
                    else:
                        # Filter the news by the user's subscribed categories
                        filtered_news = self.filter_news_by_category(news_items, user_categories)
                    # Skip the user if nothing matches their subscription
                    if not filtered_news:
                        logging.info(f"跳过 {email_info['email']},没有符合订阅分类的新闻")
                        continue
                    # Build the email body for this user
                    user_content = self.generate_email_content(filtered_news)
                    # Assemble the message
                    msg = MIMEMultipart('alternative')
                    msg['From'] = f"{smtp_config['sender_email']}"
                    msg['To'] = f"{email_info['name']} <{email_info['email']}>"
                    msg['Subject'] = f"华东理工大学今日通知 ({len(filtered_news)}条)"
                    # Attach the HTML body
                    html_part = MIMEText(user_content, 'html', 'utf-8')
                    msg.attach(html_part)
                    server.send_message(msg)
                    logging.info(f"邮件发送成功: {email_info['email']} (分类: {', '.join(user_categories) if user_categories else '全部'})")
                    success_count += 1
                except Exception as e:
                    logging.error(f"发送邮件到 {email_info['email']} 失败: {e}")
            server.quit()
            logging.info(f"邮件发送完成,成功 {success_count}/{len(self.emails)} 个")
            return success_count > 0
        except Exception as e:
            logging.error(f"发送邮件失败: {e}")
            return False

    def check_proxy(self, proxies=None):
        """Check whether the proxy is reachable."""
        if not proxies:
            return True
        test_url = "https://www.baidu.com"
        try:
            logging.info("正在测试代理连接...")
            response = requests.get(test_url, proxies=proxies, timeout=5)
            if response.status_code == 200:
                logging.info("代理连接测试成功")
                return True
            else:
                logging.warning(f"代理连接测试失败,状态码: {response.status_code}")
                return False
        except Exception as e:
            logging.error(f"代理连接测试失败: {e}")
            return False

    def enable_proxy_after_failure(self):
        """Enable the configured proxy after a request failure."""
        if self.original_proxies and not self.using_proxy_after_failure:
            logging.info("因请求失败,启用代理进行重试...")
            self.proxies = self.original_proxies
            self.using_proxy_after_failure = True
            return True
        return False

    def request_with_retry(self, url, headers, timeout=10, max_retries=2):
        """GET a URL with retries and on-demand proxy fallback."""
        retries = 0
        last_error = None
        while retries <= max_retries:
            try:
                response = requests.get(url, headers=headers, timeout=timeout, proxies=self.proxies)
                response.encoding = 'utf-8'
                return response
            except Exception as e:
                last_error = e
                retries += 1
                logging.warning(f"请求失败 (尝试 {retries}/{max_retries}): {e}")
                # On the first failure, if a proxy is configured but unused, enable it and retry immediately
                if retries == 1 and self.original_proxies and not self.using_proxy_after_failure:
                    if self.enable_proxy_after_failure():
                        logging.info("已启用代理,重试请求...")
                        continue
                if retries <= max_retries:
                    logging.info(f"{retries}秒后重试...")
                    time.sleep(retries)
        raise last_error
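
    # Retry timing, as implemented above with the default max_retries=2: up to
    # three attempts in total. If an idle proxy is configured, the first failure
    # enables it and retries immediately; later failures sleep `retries` seconds
    # before the next attempt, and the last error is re-raised once the attempts
    # are exhausted.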

    def save_news_to_json(self, news_items, filename='news.json'):
        """Save the news items to a JSON file."""
        try:
            # Convert date objects to strings so they can be JSON-serialized
            serializable_news = []
            for item in news_items:
                news_item = item.copy()
                if isinstance(news_item['date'], datetime.date):
                    news_item['date'] = news_item['date'].strftime('%Y-%m-%d')
                serializable_news.append(news_item)
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(serializable_news, f, ensure_ascii=False, indent=4)
            logging.info(f"成功保存 {len(news_items)} 条新闻到 {filename}")
            return True
        except Exception as e:
            logging.error(f"保存新闻到JSON文件失败: {e}")
            return False

    def run(self):
        """Run the main workflow."""
        logging.info("开始抓取华东理工大学新闻...")
        # Bail out if the config file is missing
        if self.config is None:
            logging.error("配置文件不存在,程序退出")
            return
        # Bail out if the recipient file is missing
        if self.emails is None:
            logging.error("邮箱文件不存在,程序退出")
            return
        # If the proxy is enabled and set to always_use, verify it first
        if self.proxies and not self.check_proxy(self.proxies):
            logging.error("代理不可用,程序退出")
            if self.config.get('error_notification', {}).get('enabled', False):
                self.send_error_notification("代理连接测试失败")
            return
        all_news_items = []
        errors = []
        try:
            # Fetch the university news site
            try:
                school_news = self.get_news_list()
                if school_news:
                    logging.info(f"获取到学校新闻网站 {len(school_news)} 条新闻")
                    all_news_items.extend(school_news)
                else:
                    logging.warning("未获取到学校新闻网站的新闻")
            except Exception as e:
                error_msg = f"获取学校新闻网站失败: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
            # Fetch the Student Affairs Office site
            try:
                student_news = self.get_student_news_list()
                if student_news:
                    logging.info(f"获取学生处网站 {len(student_news)} 条新闻")
                    all_news_items.extend(student_news)
                else:
                    logging.warning("未获取到学生处网站的新闻")
            except Exception as e:
                error_msg = f"获取学生处网站失败: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
            # Fetch the Academic Affairs Office site
            try:
                jwc_news = self.get_jwc_news_list()
                if jwc_news:
                    logging.info(f"获取到教务处网站 {len(jwc_news)} 条新闻")
                    all_news_items.extend(jwc_news)
                else:
                    logging.warning("未获取到教务处网站的新闻")
            except Exception as e:
                error_msg = f"获取教务处网站失败: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
            # Fetch the Graduate School site
            try:
                gschool_news = self.get_gschool_news_list()
                if gschool_news:
                    logging.info(f"获取到研究生院网站 {len(gschool_news)} 条新闻")
                    all_news_items.extend(gschool_news)
                else:
                    logging.warning("未获取到研究生院网站的新闻")
            except Exception as e:
                error_msg = f"获取研究生院网站失败: {str(e)}"
                logging.error(error_msg)
                errors.append(error_msg)
            if not all_news_items and errors:
                logging.warning("未获取到任何新闻,但有错误发生")
                if self.config.get('error_notification', {}).get('enabled', False):
                    error_details = "\n".join(errors)
                    self.send_error_notification(f"所有网站抓取失败:\n{error_details}")
                return
            logging.info(f"总共获取到 {len(all_news_items)} 条新闻")
            # Filter the recent news
            recent_news = self.filter_recent_news(all_news_items)
            logging.info(f"筛选出最近 {self.config.get('days', 0)} 天内的新闻 {len(recent_news)} 条")
            # Save the news to a JSON file
            self.save_news_to_json(recent_news)
            # Only send email when there are new notices
            if recent_news and self.emails:
                # Pass the full list; send_email filters it per user's subscription
                self.send_email("", len(recent_news), recent_news)
            elif not recent_news:
                logging.info("最近无新通知,不发送邮件")
            else:
                logging.warning("没有配置收件人邮箱")
            # If some sites failed but data was still collected, send an error notification too
            if errors and self.config.get('error_notification', {}).get('enabled', False):
                error_details = "\n".join(errors)
                self.send_error_notification(f"部分网站抓取失败但仍有数据:\n{error_details}")
        except Exception as e:
            error_msg = f"程序执行过程中发生未预期错误: {str(e)}"
            logging.error(error_msg)
            if self.config.get('error_notification', {}).get('enabled', False):
                self.send_error_notification(error_msg)
            raise
        logging.info("程序执行完成")


def main():
    """Entry point."""
    scraper = NewsScraperECUST()
    scraper.run()


if __name__ == "__main__":
    main()
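
# Usage sketch (the schedule is an illustrative assumption, not part of this
# project): the script runs once per invocation, so a daily cron entry such as
#   0 8 * * * /usr/bin/python3 /path/to/news_scraper.py
# would email each morning's notices.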