-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathmain.py
More file actions
596 lines (543 loc) · 25.9 KB
/
main.py
File metadata and controls
596 lines (543 loc) · 25.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
from urllib.parse import urljoin
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
import json
import os
import pandas as pd
import sys
# Forum search endpoint listing review posts; keyword 8691 is the id of
# the poster whose EPUB threads we scrape.
BASE_URL = 'https://www.wenku8.net/modules/article/reviewslist.php'
params = { 'keyword': '8691', 'charset': 'utf-8', 'page': 1 }
# Scraper backend: 'requests' | 'playwright' | 'steel'
_scraper = 'steel'
# Desktop User-Agent pool; one entry is picked at random per run.
# FIX: the original list was missing the commas between the string
# literals, so implicit concatenation collapsed all seven UA strings
# into a single malformed element.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
]
HEADERS = {
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'User-Agent': random.choice(user_agents),
    'Referer': 'https://www.wenku8.net/',
}
DOMAIN = 'https://www.wenku8.net'
# CDN prefix for static assets (CSS/JS) published on the gh-pages branch.
JSDELIVR_CDN = 'https://gcore.jsdelivr.net/gh/mojimoon/wenku8@gh-pages/'
OUT_DIR = 'out'
PUBLIC_DIR = 'docs'
# Input/output paths: saved login cookie line, scraped CSVs, the dl text
# list, and the generated HTML pages.
COOKIE_FILE = os.path.join(os.path.dirname(__file__), 'COOKIE')
POST_LIST_FILE = os.path.join(OUT_DIR, 'post_list.csv')
TXT_LIST_FILE = os.path.join(OUT_DIR, 'txt_list.csv')
DL_FILE = os.path.join(OUT_DIR, 'dl.txt')
MERGED_CSV = os.path.join(OUT_DIR, 'merged.csv')
EPUB_HTML = os.path.join(PUBLIC_DIR, 'epub.html')
MERGED_HTML = os.path.join(PUBLIC_DIR, 'index.html')
# Retry transient 5xx responses with exponential backoff on the shared session.
retry_strategy = Retry(
    total=5,
    status_forcelist=[500, 502, 503, 504],
    backoff_factor=2
)
session = requests.Session()
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount('http://', adapter)
session.mount('https://', adapter)
session.headers.update(HEADERS)
def parse_cookie_line(line: str):
    """Parse a raw Cookie header line ("k1=v1; k2=v2; ...") into a dict.

    Fragments that are empty or contain no '=' are skipped; a blank or
    empty line yields an empty dict. Keys and values are whitespace-trimmed.
    """
    stripped = line.strip()
    if not stripped:
        return {}
    result = {}
    for fragment in stripped.split(';'):
        fragment = fragment.strip()
        if '=' not in fragment:
            continue
        name, _, value = fragment.partition('=')
        result[name.strip()] = value.strip()
    return result
def load_cookie_from_file(sess: requests.Session, filepath: str):
    """Load cookies from the first line of *filepath* into *sess*.

    The file is expected to hold a single "k1=v1; k2=v2; ..." line.
    A missing file or an unparseable line is silently ignored.
    """
    if not os.path.exists(filepath):
        return
    with open(filepath, 'r', encoding='utf-8') as fh:
        first_line = fh.readline()
    parsed = parse_cookie_line(first_line)
    if not parsed:
        return
    sess.cookies.update(requests.utils.cookiejar_from_dict(parsed))
# Attach persisted login cookies (if any) to the shared requests session.
load_cookie_from_file(session, COOKIE_FILE)
# Lazily-initialised Playwright browser handle, shared by the
# 'playwright' and 'steel' backends.
browser = None
# Cookie dict parsed from COOKIE_FILE, injected into each new browser context.
playwright_ctx_cookie_dict = None
# Steel session bookkeeping ({'api_key', 'session_id', 'client'}) so the
# remote session can be released in exit_steel().
steel_dict = None
def init_playwright():
    """Launch a headless Chromium via Playwright (only once) and return it.

    Also pre-parses COOKIE_FILE into the module-global
    playwright_ctx_cookie_dict for later injection into browser contexts.
    """
    from playwright.sync_api import sync_playwright
    global browser, playwright_ctx_cookie_dict
    if browser is None:
        pw = sync_playwright().start()
        browser = pw.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        # Pre-parse COOKIE_FILE for new_context cookie injection.
        cookie_line = ''
        if os.path.exists(COOKIE_FILE):
            with open(COOKIE_FILE, 'r', encoding='utf-8') as fh:
                cookie_line = fh.readline()
        playwright_ctx_cookie_dict = parse_cookie_line(cookie_line)
    return browser
def init_steel():
    """Create a remote Steel browser session and connect Playwright to it over CDP.

    Stores the session handle in the module-global steel_dict so
    exit_steel() can release it later, and pre-parses COOKIE_FILE for
    context cookie injection. Returns the connected browser.
    """
    from steel import Steel
    from dotenv import dotenv_values
    from playwright.sync_api import sync_playwright
    global browser, playwright_ctx_cookie_dict, steel_dict
    # API key comes from a local .env file (empty string if absent).
    steel_api_key = dotenv_values().get('STEEL_API_KEY', '')
    client = Steel(steel_api_key=steel_api_key)
    steel_session = client.sessions.create(api_timeout=40000)
    print(f'[INFO] Running Steel session: {steel_session.id}')
    # Keep everything needed to release the session on exit.
    steel_dict = {
        'api_key': steel_api_key,
        'session_id': steel_session.id,
        'client': client
    }
    if browser is None:
        playwright = sync_playwright().start()
        browser = playwright.chromium.connect_over_cdp(
            f'wss://connect.steel.dev?apiKey={steel_api_key}&sessionId={steel_session.id}'
        )
    # Pre-parse COOKIE_FILE so new browser contexts can inject the cookies.
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, 'r', encoding='utf-8') as f:
            line = f.readline()
        playwright_ctx_cookie_dict = parse_cookie_line(line)
    else:
        playwright_ctx_cookie_dict = {}
    return browser
def exit_steel():
    """Close the CDP browser connection and release the remote Steel session."""
    browser.close()
    steel_dict['client'].sessions.release(steel_dict['session_id'])
def scrape_page_playwright(url: str):
    """Fetch *url* with a real browser (Playwright or Steel) and return its HTML.

    Lazily initialises the browser on first use. Raises ValueError when
    the site redirects to the login page (stale/expired cookies).
    """
    global browser, playwright_ctx_cookie_dict
    if browser is None:
        browser = (init_steel() if _scraper == 'steel' else init_playwright())
    # Create a fresh context per request and inject the saved cookies.
    with browser.new_context() as context:
        if playwright_ctx_cookie_dict:
            cookies = [
                {
                    "name": k,
                    "value": v,
                    "domain": "www.wenku8.net",
                    "path": "/",
                    # "httpOnly" / "secure" / "sameSite" can be added here if needed
                }
                for k, v in playwright_ctx_cookie_dict.items()
            ]
            context.add_cookies(cookies)
        page = context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded')
        except Exception as e:
            # Timeouts are tolerated; the partially loaded DOM may still be usable.
            print(f"[WARN] Page.goto encountered an error or timeout, attempting to proceed: {e}")
        if "/login.php" in page.url:
            raise ValueError(f"[ERROR] Playwright 模式被重定向到登录页,可能需要更新 COOKIE 文件: {page.url}")
        html_content = page.content()
        page.close()
    return html_content
def scrape_page_requests(url: str):
    """Fetch *url* with the shared requests session and return the HTML text.

    Raises ValueError when redirected to the login page (cookies likely
    need refreshing) and requests.HTTPError for non-2xx responses.
    """
    resp = session.get(url, timeout=10, allow_redirects=True)
    final_url = resp.url
    if '/login.php' in final_url:
        raise ValueError(f"[ERROR] Requests 模式被重定向到登录页,可能需要更新 COOKIE 文件: {final_url}")
    resp.raise_for_status()
    # Override requests' detected encoding so .text decodes as UTF-8
    # (the site is queried with charset=utf-8).
    resp.encoding = 'utf-8'
    # with open('debug.html', 'w', encoding='utf-8') as f:
    #     f.write(resp.text)
    return resp.text
def scrape_page(url: str):
    """Fetch *url* using whichever backend _scraper selects."""
    if _scraper in ('playwright', 'steel'):
        return scrape_page_playwright(url)
    if _scraper == 'requests':
        return scrape_page_requests(url)
    raise ValueError(f"Unknown _scraper: {_scraper}")
def build_url_with_params(base_url: str, params: dict):
    """Return *base_url* with *params* appended as a query string.

    Values are percent-encoded via urllib.parse.urlencode, so keywords
    containing spaces or non-ASCII characters yield a valid URL (the
    previous plain string interpolation did no escaping). An empty or
    None *params* returns *base_url* unchanged.
    """
    if not params:
        return base_url
    from urllib.parse import urlencode
    return f"{base_url}?{urlencode(params)}"
# ========== Scraping ==========
# Total number of result pages; discovered while parsing page 1.
last_page = 1
def get_latest_url(post_link: str):
    """Extract the download-list URL from the newest forum post.

    Tries the paste-site anchor pattern first, then falls back to a bare
    .txt link; raises ValueError when neither pattern is found.
    """
    html = scrape_page(post_link)
    # <a href="https://paste.gentoo.zip" target="_blank">https://paste.gentoo.zip</a>/EsX5Kx8V
    anchor = re.search(r'<a href="([^"]+)" target="_blank">([^<]+)</a>(/[^<]+)', html)
    if anchor:
        return anchor.group(1) + anchor.group(3)
    # <a href="https://0x0.st/8QWZ.txt" target="_blank">https://0x0.st/8QWZ.txt</a><br>
    bare = re.search(r'https:\/\/[^"]+?\.txt(?=")', html)
    if bare:
        return bare.group(0)
    raise ValueError("[ERROR] Failed to find the latest URL")
def get_latest(url: str):
    """Fetch the dl-list text, normalise a few known title variants, save it.

    Exits the whole program (sys.exit(0)) when the fetched text equals the
    previously saved DL_FILE, i.e. there is nothing new to process.
    """
    txt = scrape_page(url)
    lines = txt.split('\n')
    # One flag per rewrite rule so each rule fires on the first matching
    # line only.
    flg = [False] * 4
    for i in range(len(lines)):
        if not flg[0] and lines[i].endswith('_杂志连载版'):
            lines[i] = lines[i].replace('_杂志连载版', '')
            flg[0] = True
        elif not flg[1] and lines[i].endswith('_SS'):
            lines[i] = lines[i].replace('_SS', '')
            flg[1] = True
        elif not flg[2] and lines[i].endswith('-Ordinary_days-'):
            lines[i] = lines[i].replace('-Ordinary_days-', ' 莉可丽丝 Ordinary days')
            flg[2] = True
        elif not flg[3] and lines[i].endswith('君若星辰'):
            lines[i] = lines[i].replace('君若星辰', '宛如星辰的你')
            flg[3] = True
    txt = '\n'.join(lines)
    # if the content has not changed, exit
    if os.path.exists(DL_FILE):
        with open(DL_FILE, 'r', encoding='utf-8') as f:
            old_txt = f.read()
        if old_txt == txt:
            print('[INFO] Exiting, no update found.')
            sys.exit(0)
    with open(DL_FILE, 'w', encoding='utf-8') as f:
        f.write(txt)
def parse_page(page_num: int, latest_post_link: str = None):
    """Parse one page of the review-post listing.

    Returns (entries, stop): entries is a list of
    [post_title, post_link, novel_title, novel_link] rows (titles wrapped
    in double quotes for naive CSV writing); stop is True when
    *latest_post_link* was reached, meaning everything older is already
    known. Page 1 also updates the module-global last_page and triggers
    get_latest() for the newest post.
    """
    params['page'] = page_num
    url = build_url_with_params(BASE_URL, params)
    txt = scrape_page(url)
    # print(txt)
    soup = BeautifulSoup(txt, 'html.parser')
    # The second 'grid' table on the page holds the post list.
    table = soup.find_all('table', class_='grid')[1]
    rows = table.find_all('tr')[1:] # skip header row
    # One flag per hard-coded novel-title override (applied once each).
    flg = [False] * 2
    entries = []
    for (i, tr) in enumerate(rows):
        cols = tr.find_all('td')
        if len(cols) < 2:
            continue
        a_post = cols[0].find('a')
        raw_title = a_post.text.strip()
        # Only posts whose title ends with ' epub' are download threads.
        if not raw_title.endswith(' epub'):
            continue
        post_title = raw_title[:-5] if raw_title.endswith(' epub') else raw_title
        post_link = a_post['href'] if a_post['href'].startswith('http') else urljoin(DOMAIN, a_post['href'])
        # Stop as soon as we reach the newest post already on disk.
        if latest_post_link is not None and post_link == latest_post_link:
            return entries, True # return entries collected so far, flag stop
        a_novel = cols[1].find('a')
        novel_title = a_novel.text.strip()
        novel_link = urljoin(DOMAIN, a_novel['href'])
        # Manual title fixes for two specific novels.
        if not flg[0] and novel_link.endswith('/2751.htm'):
            novel_title = '我们不可能成为恋人!绝对不行。(※似乎可行?)(我怎么可能成为你的恋人,不行不行!)'
            flg[0] = True
        if not flg[1] and novel_link.endswith('/3828.htm'):
            novel_title = 'Tier1姐妹 有名四姐妹没我就活不下去'
            flg[1] = True
        # Quote titles so commas inside them survive the naive CSV join.
        post_title = '"' + post_title + '"'
        novel_title = '"' + novel_title + '"'
        entries.append([post_title, post_link, novel_title, novel_link])
        # The very first entry of page 1 is the newest post: fetch its dl list.
        if page_num == 1 and i == 0:
            get_latest(get_latest_url(post_link))
    if page_num == 1:
        # Discover the total page count from the 'last' pagination link.
        last = soup.find('a', class_='last')
        global last_page
        last_page = int(last.text) if last else 1
    return entries, False
def scrape():
    """Scrape all new posts and prepend them to POST_LIST_FILE.

    Reads the first data row of POST_LIST_FILE (the newest known post) so
    scraping can stop early once already-seen content is reached.
    """
    # Grab the first post_link stored in POST_LIST_FILE, if the file exists.
    latest_post_link = None
    try:
        with open(POST_LIST_FILE, 'r', encoding='utf-8') as f:
            next(f) # skip header
            first_line = next(f, '').strip()
            if first_line:
                latest_post_link = first_line.split(',')[1]
        file_exists = True
    except FileNotFoundError:
        file_exists = False
    all_entries = []
    stop = False
    # Scrape page 1 first (this also sets the module-global last_page).
    print('[INFO] scrape (1)')
    entries, found = parse_page(1, latest_post_link)
    all_entries.extend(entries)
    stop = found
    # Continue with the remaining pages until the known post is reached.
    page = 2
    while not stop and page <= last_page:
        print(f'[INFO] scrape ({page}/{last_page})')
        entries, found = parse_page(page, latest_post_link)
        all_entries.extend(entries)
        stop = found
        if stop:
            break
        page += 1
        # Polite random delay between page fetches.
        time.sleep(random.uniform(1, 3))
    if _scraper == 'steel':
        exit_steel() # close Steel session
    # Newest content goes first; prepend to the existing file.
    # with open(POST_LIST_FILE, 'w', encoding='utf-8', newline='') as f:
    #     f.write('post_title,post_link,novel_title,novel_link\n')
    #     for entry in all_entries:
    #         f.write(','.join(entry) + '\n')
    if not file_exists:
        with open(POST_LIST_FILE, 'w', encoding='utf-8', newline='') as f:
            f.write('post_title,post_link,novel_title,novel_link\n')
            for entry in all_entries:
                f.write(','.join(entry) + '\n')
    else:
        with open(POST_LIST_FILE, 'r+', encoding='utf-8', newline='') as f:
            # insert between header and first line
            lines = f.readlines()
            lines = lines[:1] + [','.join(entry) + '\n' for entry in all_entries] + lines[1:]
            f.seek(0)
            f.writelines(lines)
# ========== Data Processing ==========
def purify(text: str) -> str: # 只保留中文、英文和数字
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
return text
# Digit values for simple Chinese numerals.
CN_NUM = { '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10 }
def chinese_to_arabic(cn: str) -> int:
    """Convert a simple Chinese numeral (0-99, e.g. '二十三') to an int.

    Unknown characters contribute 0. Only single-'十' forms are handled.
    """
    if cn == '十':
        return 10
    if cn.startswith('十'):
        # e.g. '十五' -> 15
        return 10 + CN_NUM.get(cn[1], 0)
    if cn.endswith('十'):
        # e.g. '二十' -> 20
        return CN_NUM.get(cn[0], 0) * 10
    if '十' in cn:
        # e.g. '二十三' -> 23
        tens, units = cn.split('十')[0], cn.split('十')[1]
        return CN_NUM.get(tens, 0) * 10 + CN_NUM.get(units, 0)
    return CN_NUM.get(cn, 0)
def replace_chinese_numerals(s: str) -> str:
    """Rewrite a '第<Chinese numeral>卷' marker as a bare arabic number.

    Example: '第三卷' -> first pass produces '第 3 卷', second pass strips
    the surrounding '第 ' / ' 卷' markers, leaving '3'. Strings without a
    volume marker pass through unchanged.
    """
    cn_match = re.search(r'第([一二三四五六七八九十零]{1,3})卷', s)
    if cn_match:
        numeral = cn_match.group(1)
        s = s.replace(numeral, f' {chinese_to_arabic(numeral)} ')
    if re.search(r'第 (\S+) 卷', s):
        s = s.replace('第 ', '').replace(' 卷', '')
    return s
# Txt titles too generic to match a post reliably; skipped during matching.
IGNORED_TITLES = ['时间', '少女', '再见宣言', '强袭魔女', '秋之回忆', '秋之回忆2', '魔王', '青梅竹马', '弹珠汽水']
def merge():
    """Join the post list, the dl text list and the txt list into MERGED_CSV.

    Matching is done on "purified" titles (purify()): first the main
    title, then the parenthesised alternative title. Each post row is
    consumed at most once (txt_matched flag). Also sets the module-global
    _prefix (the lanzou download host) used by the HTML generators.
    """
    df_post = pd.read_csv(POST_LIST_FILE, encoding='utf-8')
    # Keep only the newest post per novel (file is ordered newest-first).
    df_post.drop_duplicates(subset=['novel_title'], keep='first', inplace=True)
    df_post.reset_index(drop=True, inplace=True)
    # Volume string extracted from the post title, e.g. '第三卷' -> '3'.
    df_post['volume'] = df_post['post_title'].apply(replace_chinese_numerals)
    # df_post['post_main'] = df_post['novel_title'].apply(lambda x: x[:x.rfind('(')] if x[-1] == ')' else x)
    # Alternative title: contents of a trailing parenthesised segment.
    df_post['post_alt'] = df_post['novel_title'].apply(lambda x: x[x.rfind('(')+1:-1] if x[-1] == ')' else "")
    df_post['post_pure'] = df_post['novel_title'].apply(purify)
    df_post['post_alt_pure'] = df_post['post_alt'].apply(purify)
    df_post.drop(columns=['post_title'], inplace=True)
    df_post['dl_label'] = ""
    df_post['dl_pwd'] = ""
    df_post['dl_update'] = ""
    df_post['dl_remark'] = ""
    df_post['txt_matched'] = False
    # merge dl to post
    with open(DL_FILE, 'r', encoding='utf-8') as f:
        global _prefix
        _ = f.readlines()
        # First line looks like:
        # <html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;"> 网址前缀:wenku8.lanzov.com/
        # so the download host prefix is everything after the last ':'.
        _prefix = _[0].split(':')[-1].strip()
        # print(f"[DEBUG] DL prefix: {_prefix}")
        lines = _[2:]
    for line in lines:
        # Expected whitespace-separated columns: date, label, password,
        # [remark], ..., title (title is always the last token).
        parts = line.strip().split()
        if len(parts) < 4:
            continue
        mask = df_post['post_pure'].str.match(purify(parts[-1]))
        if mask.any():
            df_post.loc[mask, 'dl_update'] = parts[0]
            df_post.loc[mask, 'dl_label'] = parts[1]
            df_post.loc[mask, 'dl_pwd'] = parts[2]
            if len(parts) > 4:
                # A '更新'/'补全' prefix marks an update/completion remark.
                if parts[3][:2] == '更新' or parts[3][:2] == '补全':
                    df_post.loc[mask, 'dl_remark'] = parts[3][2:]
        # if mask.sum() > 1:
        #     print(f'[WARN] {mask.sum()} entries matched for {parts[3]}')
        # else:
        #     print(f'[WARN] Failed to match {parts[3]}')
    # merge post to txt
    df_txt = pd.read_csv(TXT_LIST_FILE, encoding='utf-8')
    # Column positions noted for the .iloc writes below.
    df_txt['txt_pure'] = df_txt['title'].apply(purify) # 4
    df_txt['volume'] = '' # 5
    df_txt['dl_label'] = '' # 6
    df_txt['dl_pwd'] = '' # 7
    df_txt['dl_update'] = None # 8
    df_txt['dl_remark'] = '' # 9
    df_txt['novel_title'] = '' # 10
    df_txt['novel_link'] = '' # 11
    for i in range(len(df_txt)):
        _title = df_txt.iloc[i, 0]
        if _title in IGNORED_TITLES:
            continue
        # Match on purified main title first, restricted to unconsumed posts.
        mask = df_post['post_pure'].str.match(df_txt.iloc[i, 4]) & (df_post['txt_matched'] == False)
        match = None
        if mask.any():
            match = mask[mask].index[0]
            # if mask.sum() > 1:
            #     print(f'[WARN] {mask.sum()} entries matched for {_title}')
            #     for j in range(len(df_post)):
            #         if mask[j]:
            #             print(f'    {df_post.iloc[j]["novel_title"]}')
        else:
            # Fall back to the purified alternative title.
            mask = df_post['post_alt_pure'].str.match(df_txt.iloc[i, 4]) & (df_post['txt_matched'] == False)
            if mask.any():
                match = mask[mask].index[0]
                # if mask.sum() > 1:
                #     print(f'[WARN] {mask.sum()} entries matched for {_title}')
                #     for j in range(len(df_post)):
                #         if mask[j]:
                #             print(f'    {df_post.iloc[j]["novel_title"]}')
        if match is not None:
            # Copy the matched post's download metadata onto the txt row.
            df_txt.iloc[i, 5] = df_post.iloc[match]['volume']
            df_txt.iloc[i, 6] = df_post.iloc[match]['dl_label']
            df_txt.iloc[i, 7] = df_post.iloc[match]['dl_pwd']
            df_txt.iloc[i, 8] = df_post.iloc[match]['dl_update']
            df_txt.iloc[i, 9] = df_post.iloc[match]['dl_remark']
            df_txt.iloc[i, 10] = df_post.iloc[match]['novel_title']
            df_txt.iloc[i, 11] = df_post.iloc[match]['novel_link']
            df_post.iloc[match, -1] = True
    # Posts that never matched a txt row but do have a lanzou download
    # get appended as extra rows (no txt download available for them).
    _mask = df_post['txt_matched'] == False
    for y in df_post[_mask].itertuples():
        if y.dl_label == "":
            continue
        df_txt.loc[len(df_txt)] = ["", "", None, "", "", y.volume, y.dl_label, y.dl_pwd, y.dl_update, y.dl_remark, y.novel_title, y.novel_link]
    # Prefer the richer post-derived fields where they exist.
    df_txt['title'] = df_txt.apply(lambda x: x['novel_title'] if x['novel_title'] else x['title'], axis=1)
    df_txt['update'] = df_txt.apply(lambda x: x['dl_update'] if x['dl_update'] else x['date'], axis=1)
    df_txt['main'] = df_txt['title'].apply(lambda x: x[:x.rfind('(')] if x[-1] == ')' else x)
    df_txt['alt'] = df_txt['title'].apply(lambda x: x[x.rfind('(')+1:-1] if x[-1] == ')' else "")
    df_txt.drop(columns=['title', 'date', 'txt_pure', 'novel_title'], inplace=True)
    df_txt.sort_values(by=['update'], ascending=False, inplace=True)
    df_txt.to_csv(MERGED_CSV, index=False, encoding='utf-8-sig')
# ========== HTML Generation ==========
# GitHub "star" button iframe embedded in both generated pages' headers.
starme = '<iframe style="margin-left: 2px; margin-bottom:-5px;" frameborder="0" scrolling="0" width="81px" height="20px" src="https://ghbtns.com/github-btn.html?user=mojimoon&repo=wenku8&type=star&count=true" ></iframe>'
def create_table_merged(df):
    """Render each row of the merged dataframe as an HTML <tr> for index.html.

    NaN fields render as empty cells; the lanzou columns additionally
    depend on the module-global _prefix (set by merge()).
    """
    cells = []
    for _, rec in df.iterrows():
        main_title = rec['main']
        link = None if pd.isna(rec['novel_link']) else rec['novel_link']
        title_html = f'<a href="{link}" target="_blank">{main_title}</a>' if link else main_title
        alt = rec['alt']
        alt_html = f"<span class='at'>{alt}</span>" if not pd.isna(alt) else ''
        txt_url = rec['download_url']
        txt_dl = '' if pd.isna(txt_url) else f"<a href='{txt_url}' target='_blank'>下载</a> <a href='https://ghfast.top/{txt_url}' target='_blank'>镜像</a>"
        vol = rec['volume']
        volume = f'({vol})' if not pd.isna(vol) else ''
        rem = rec['dl_remark']
        remark = f" <span class='bt'>{rem}</span>" if not pd.isna(rem) else ''
        label = rec['dl_label']
        if pd.isna(label):
            lz_dl = ''
            lz_pwd = ''
        else:
            lz_dl = f"<a href='https://{_prefix}/{label}' target='_blank'>{volume}</a>{remark}"
            lz_pwd = rec['dl_pwd']
        upd = rec['update']
        date = '' if pd.isna(upd) else upd
        auth = rec['author']
        author = '' if pd.isna(auth) else auth
        cells.append(
            f"<tr><td>{title_html}{alt_html}</td>"
            f"<td class='au'>{author}</td><td>{lz_dl}</td><td>{lz_pwd}</td>"
            f"<td class='dl'>{txt_dl}</td><td class='yd'>{date}</td></tr>"
        )
    return ''.join(cells)
def create_html_merged():
    """Generate docs/index.html (full EPUB+TXT table) from MERGED_CSV.

    The page is emitted as one single-line HTML document; stylesheet and
    table script load from the project's gh-pages branch via jsDelivr.
    """
    df = pd.read_csv(MERGED_CSV, encoding='utf-8-sig')
    table = create_table_merged(df)
    today = time.strftime('%Y-%m-%d', time.localtime())
    html = (
        '<!DOCTYPE html><html lang="zh-CN"><head><meta charset="UTF-8">'
        '<meta name="viewport"content="width=device-width,initial-scale=1.0">'
        '<meta name="keywords"content="轻小说,sf轻小说,dmzj轻小说,日本轻小说,动漫小说,轻小说电子书,轻小说EPUB下载">'
        '<meta name="description"content="轻小说文库 EPUB 下载,支持搜索关键字、跳转至源站和蓝奏云下载,已进行移动端适配。">'
        '<meta name="author"content="mojimoon"><title>轻小说文库 EPUB 下载+</title>'
        f'<link rel="stylesheet"href="{JSDELIVR_CDN}style.css"></head><body>'
        '<h1 onclick="window.location.reload()">轻小说文库 EPUB 下载+</h1>'
        f'<h4>({today}) <a href="https://github.com/mojimoon">mojimoon</a>/<a href="https://github.com/mojimoon/wenku8">wenku8</a> {starme}</h4>'
        '<span>所有内容均收集于网络,仅供学习交流使用。'
        '特别感谢 <a href="https://www.wenku8.net/modules/article/reviewslist.php?keyword=8691&charset=utf-8">酷儿加冰</a> 和 <a href="https://github.com/ixinzhi">布客新知</a> 整理。</span>'
        '<span class="at">最新为 Calibre 生成 EPUB,括号内为最新卷数;年更为纯文本 EPUB。</span>'
        '<div class="right-controls"><a href="./epub.html">'
        '<button class="btn"id="gotoButton">切换到仅 EPUB 源,加载更快</button></a>'
        '<button class="btn"id="themeToggle">主题</button>'
        '<button class="btn"id="clearInput">清除</button></div>'
        '<div class="search-bar"><input type="text"id="searchInput"placeholder="搜索标题或作者">'
        '<button class="btn"id="randomButton">随机</button></div>'
        '<table><thead><tr><th>标题</th><th>作者</th><th>最新</th><th>密码</th><th>年更</th><th>更新</th></tr>'
        '</thead><tbody id="novelTableBody">'
        f'{table}</tbody></table><script src="{JSDELIVR_CDN}script_merged.js"></script>'
        '</body></html>'
    )
    with open(MERGED_HTML, 'w', encoding='utf-8') as f:
        f.write(html)
def create_table_epub(df):
    """Render each dataframe row as an HTML <tr> for the EPUB-only page.

    NaN fields render empty; the lanzou link uses the module-global
    _prefix (set by merge()). dl_pwd and update are emitted as-is.
    """
    out = []
    for _, rec in df.iterrows():
        main_title = rec['main']
        link = None if pd.isna(rec['novel_link']) else rec['novel_link']
        title_html = f'<a href="{link}" target="_blank">{main_title}</a>' if link else main_title
        alt = rec['alt']
        alt_html = '' if pd.isna(alt) else f"<span class='at'>{alt}</span>"
        vol = rec['volume']
        volume = '' if pd.isna(vol) else f'({vol})'
        rem = rec['dl_remark']
        remark = '' if pd.isna(rem) else f" <span class='bt'>{rem}</span>"
        label = rec['dl_label']
        lz_dl = '' if pd.isna(label) else f"<a href='https://{_prefix}/{label}' target='_blank'>{volume}</a>{remark}"
        auth = rec['author']
        author = '' if pd.isna(auth) else auth
        out.append(
            f"<tr><td>{title_html}{alt_html}</td>"
            f"<td class='au'>{author}</td><td>{lz_dl}</td><td>{rec['dl_pwd']}</td>"
            f"<td class='yd'>{rec['update']}</td></tr>"
        )
    return ''.join(out)
def create_html_epub():
    """Generate docs/epub.html (EPUB-only table) from MERGED_CSV.

    Rows without a lanzou label (dl_label is NaN) are filtered out, since
    this page only lists EPUB downloads.
    """
    df = pd.read_csv(MERGED_CSV, encoding='utf-8-sig')
    df = df[df['dl_label'].notna()]
    table = create_table_epub(df)
    today = time.strftime('%Y-%m-%d', time.localtime())
    html = (
        '<!DOCTYPE html><html lang="zh-CN"><head><meta charset="UTF-8">'
        '<meta name="viewport"content="width=device-width,initial-scale=1.0">'
        '<meta name="keywords"content="轻小说,sf轻小说,dmzj轻小说,日本轻小说,动漫小说,轻小说电子书,轻小说EPUB下载">'
        '<meta name="description"content="轻小说文库 EPUB 下载,支持搜索关键字、跳转至源站和蓝奏云下载,已进行移动端适配。">'
        '<meta name="author"content="mojimoon"><title>轻小说文库 EPUB 下载</title>'
        f'<link rel="stylesheet"href="{JSDELIVR_CDN}style.css"></head><body>'
        '<h1 onclick="window.location.reload()">轻小说文库 EPUB 下载</h1>'
        f'<h4>({today}) <a href="https://github.com/mojimoon">mojimoon</a>/<a href="https://github.com/mojimoon/wenku8">wenku8</a> {starme}</h4>'
        '<span>所有内容均收集于网络,仅供学习交流使用。'
        '特别感谢 <a href="https://www.wenku8.net/modules/article/reviewslist.php?keyword=8691&charset=utf-8">酷儿加冰</a> 整理。括号内为最新卷数。</span>'
        '<div class="right-controls"><a href="./index.html">'
        '<button class="btn"id="gotoButton">切换到 EPUB/TXT 源,内容更全</button></a>'
        '<button class="btn"id="themeToggle">主题</button>'
        '<button class="btn"id="clearInput">清除</button></div>'
        '<div class="search-bar"><input type="text"id="searchInput"placeholder="搜索标题或作者">'
        '<button class="btn"id="randomButton">随机</button></div>'
        '<table><thead><tr><th>标题</th><th>作者</th><th>蓝奏</th><th>密码</th><th>更新</th></tr>'
        '</thead><tbody id="novelTableBody">'
        f'{table}</tbody></table><script src="{JSDELIVR_CDN}script_merged.js"></script>'
        '</body></html>'
    )
    with open(EPUB_HTML, 'w', encoding='utf-8') as f:
        f.write(html)
def main():
    """Run the full pipeline: scrape posts, merge sources, regenerate both pages."""
    for directory in (OUT_DIR, PUBLIC_DIR):
        if not os.path.exists(directory):
            os.mkdir(directory)
    scrape()
    merge()
    create_html_merged()
    create_html_epub()
if __name__ == '__main__':
    # Optional CLI override of the scraper backend
    # ('requests' | 'playwright' | 'steel'); this top-level assignment
    # rebinds the module-global _scraper read by scrape_page().
    if len(sys.argv) > 1:
        _scraper = sys.argv[1]
    print(f'[INFO] Using scraper: {_scraper}')
    main()