-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathmain.py
More file actions
596 lines (543 loc) · 25.9 KB
/
main.py
File metadata and controls
596 lines (543 loc) · 25.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
from urllib.parse import urljoin
import sys
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re
import json
import os
import pandas as pd
import sys
# Forum search endpoint listing review posts; keyword 8691 is the id of
# the poster whose EPUB threads we scrape.
BASE_URL = 'https://www.wenku8.net/modules/article/reviewslist.php'
params = { 'keyword': '8691', 'charset': 'utf-8', 'page': 1 }
# Scraper backend: 'requests' | 'playwright' | 'steel'
_scraper = 'steel'
# Desktop User-Agent pool; one entry is picked at random per run.
# FIX: the original list was missing the commas between the string
# literals, so implicit concatenation collapsed all seven UA strings
# into a single malformed element.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
]
HEADERS = {
    # 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'User-Agent': random.choice(user_agents),
    'Referer': 'https://www.wenku8.net/',
}
DOMAIN = 'https://www.wenku8.net'
# CDN prefix for static assets (CSS/JS) published on the gh-pages branch.
JSDELIVR_CDN = 'https://gcore.jsdelivr.net/gh/mojimoon/wenku8@gh-pages/'
OUT_DIR = 'out'
PUBLIC_DIR = 'docs'
# Input/output paths: saved login cookie line, scraped CSVs, the dl text
# list, and the generated HTML pages.
COOKIE_FILE = os.path.join(os.path.dirname(__file__), 'COOKIE')
POST_LIST_FILE = os.path.join(OUT_DIR, 'post_list.csv')
TXT_LIST_FILE = os.path.join(OUT_DIR, 'txt_list.csv')
DL_FILE = os.path.join(OUT_DIR, 'dl.txt')
MERGED_CSV = os.path.join(OUT_DIR, 'merged.csv')
EPUB_HTML = os.path.join(PUBLIC_DIR, 'epub.html')
MERGED_HTML = os.path.join(PUBLIC_DIR, 'index.html')
# Retry transient 5xx responses with exponential backoff on the shared session.
retry_strategy = Retry(
    total=5,
    status_forcelist=[500, 502, 503, 504],
    backoff_factor=2
)
session = requests.Session()
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount('http://', adapter)
session.mount('https://', adapter)
session.headers.update(HEADERS)
def parse_cookie_line(line: str):
    """Parse a raw Cookie header line ("k1=v1; k2=v2; ...") into a dict.

    Fragments that are empty or contain no '=' are skipped; a blank or
    empty line yields an empty dict. Keys and values are whitespace-trimmed.
    """
    stripped = line.strip()
    if not stripped:
        return {}
    result = {}
    for fragment in stripped.split(';'):
        fragment = fragment.strip()
        if '=' not in fragment:
            continue
        name, _, value = fragment.partition('=')
        result[name.strip()] = value.strip()
    return result
def load_cookie_from_file(sess: requests.Session, filepath: str):
    """Load cookies from the first line of *filepath* into *sess*.

    The file is expected to hold a single "k1=v1; k2=v2; ..." line.
    A missing file or an unparseable line is silently ignored.
    """
    if not os.path.exists(filepath):
        return
    with open(filepath, 'r', encoding='utf-8') as fh:
        first_line = fh.readline()
    parsed = parse_cookie_line(first_line)
    if not parsed:
        return
    sess.cookies.update(requests.utils.cookiejar_from_dict(parsed))
# Attach persisted login cookies (if any) to the shared requests session.
load_cookie_from_file(session, COOKIE_FILE)
# Lazily-initialised Playwright browser handle, shared by the
# 'playwright' and 'steel' backends.
browser = None
# Cookie dict parsed from COOKIE_FILE, injected into each new browser context.
playwright_ctx_cookie_dict = None
# Steel session bookkeeping ({'api_key', 'session_id', 'client'}) so the
# remote session can be released in exit_steel().
steel_dict = None
def init_playwright():
    """Launch a headless Chromium via Playwright (only once) and return it.

    Also pre-parses COOKIE_FILE into the module-global
    playwright_ctx_cookie_dict for later injection into browser contexts.
    """
    from playwright.sync_api import sync_playwright
    global browser, playwright_ctx_cookie_dict
    if browser is None:
        pw = sync_playwright().start()
        browser = pw.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox']
        )
        # Pre-parse COOKIE_FILE for new_context cookie injection.
        cookie_line = ''
        if os.path.exists(COOKIE_FILE):
            with open(COOKIE_FILE, 'r', encoding='utf-8') as fh:
                cookie_line = fh.readline()
        playwright_ctx_cookie_dict = parse_cookie_line(cookie_line)
    return browser
def init_steel():
    """Create a remote Steel browser session and connect Playwright to it over CDP.

    Stores the session handle in the module-global steel_dict so
    exit_steel() can release it later, and pre-parses COOKIE_FILE for
    context cookie injection. Returns the connected browser.
    """
    from steel import Steel
    from dotenv import dotenv_values
    from playwright.sync_api import sync_playwright
    global browser, playwright_ctx_cookie_dict, steel_dict
    # API key comes from a local .env file (empty string if absent).
    steel_api_key = dotenv_values().get('STEEL_API_KEY', '')
    client = Steel(steel_api_key=steel_api_key)
    steel_session = client.sessions.create(api_timeout=40000)
    print(f'[INFO] Running Steel session: {steel_session.id}')
    # Keep everything needed to release the session on exit.
    steel_dict = {
        'api_key': steel_api_key,
        'session_id': steel_session.id,
        'client': client
    }
    if browser is None:
        playwright = sync_playwright().start()
        browser = playwright.chromium.connect_over_cdp(
            f'wss://connect.steel.dev?apiKey={steel_api_key}&sessionId={steel_session.id}'
        )
    # Pre-parse COOKIE_FILE so new browser contexts can inject the cookies.
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, 'r', encoding='utf-8') as f:
            line = f.readline()
        playwright_ctx_cookie_dict = parse_cookie_line(line)
    else:
        playwright_ctx_cookie_dict = {}
    return browser
def exit_steel():
    """Close the CDP browser connection and release the remote Steel session."""
    browser.close()
    steel_dict['client'].sessions.release(steel_dict['session_id'])
def scrape_page_playwright(url: str):
    """Fetch *url* with a real browser (Playwright or Steel) and return its HTML.

    Lazily initialises the browser on first use. Raises ValueError when
    the site redirects to the login page (stale/expired cookies).
    """
    global browser, playwright_ctx_cookie_dict
    if browser is None:
        browser = (init_steel() if _scraper == 'steel' else init_playwright())
    # Create a fresh context per request and inject the saved cookies.
    with browser.new_context() as context:
        if playwright_ctx_cookie_dict:
            cookies = [
                {
                    "name": k,
                    "value": v,
                    "domain": "www.wenku8.net",
                    "path": "/",
                    # "httpOnly" / "secure" / "sameSite" can be added here if needed
                }
                for k, v in playwright_ctx_cookie_dict.items()
            ]
            context.add_cookies(cookies)
        page = context.new_page()
        try:
            page.goto(url, wait_until='domcontentloaded')
        except Exception as e:
            # Timeouts are tolerated; the partially loaded DOM may still be usable.
            print(f"[WARN] Page.goto encountered an error or timeout, attempting to proceed: {e}")
        if "/login.php" in page.url:
            raise ValueError(f"[ERROR] Playwright 模式被重定向到登录页,可能需要更新 COOKIE 文件: {page.url}")
        html_content = page.content()
        page.close()
    return html_content
def scrape_page_requests(url: str):
    """Fetch *url* with the shared requests session and return the HTML text.

    Raises ValueError when redirected to the login page (cookies likely
    need refreshing) and requests.HTTPError for non-2xx responses.
    """
    resp = session.get(url, timeout=10, allow_redirects=True)
    final_url = resp.url
    if '/login.php' in final_url:
        raise ValueError(f"[ERROR] Requests 模式被重定向到登录页,可能需要更新 COOKIE 文件: {final_url}")
    resp.raise_for_status()
    # Override requests' detected encoding so .text decodes as UTF-8
    # (the site is queried with charset=utf-8).
    resp.encoding = 'utf-8'
    # with open('debug.html', 'w', encoding='utf-8') as f:
    #     f.write(resp.text)
    return resp.text
def scrape_page(url: str):
    """Fetch *url* using whichever backend _scraper selects."""
    if _scraper in ('playwright', 'steel'):
        return scrape_page_playwright(url)
    if _scraper == 'requests':
        return scrape_page_requests(url)
    raise ValueError(f"Unknown _scraper: {_scraper}")
def build_url_with_params(base_url: str, params: dict):
    """Return *base_url* with *params* appended as a query string.

    Values are percent-encoded via urllib.parse.urlencode, so keywords
    containing spaces or non-ASCII characters yield a valid URL (the
    previous plain string interpolation did no escaping). An empty or
    None *params* returns *base_url* unchanged.
    """
    if not params:
        return base_url
    from urllib.parse import urlencode
    return f"{base_url}?{urlencode(params)}"
# ========== Scraping ==========
# Total number of result pages; discovered while parsing page 1.
last_page = 1
def get_latest_url(post_link: str):
    """Extract the download-list URL from the newest forum post.

    Tries the paste-site anchor pattern first, then falls back to a bare
    .txt link; raises ValueError when neither pattern is found.
    """
    html = scrape_page(post_link)
    # <a href="https://paste.gentoo.zip" target="_blank">https://paste.gentoo.zip</a>/EsX5Kx8V
    anchor = re.search(r'<a href="([^"]+)" target="_blank">([^<]+)</a>(/[^<]+)', html)
    if anchor:
        return anchor.group(1) + anchor.group(3)
    # <a href="https://0x0.st/8QWZ.txt" target="_blank">https://0x0.st/8QWZ.txt</a><br>
    bare = re.search(r'https:\/\/[^"]+?\.txt(?=")', html)
    if bare:
        return bare.group(0)
    raise ValueError("[ERROR] Failed to find the latest URL")
def get_latest(url: str):
    """Fetch the dl-list text, normalise a few known title variants, save it.

    Exits the whole program (sys.exit(0)) when the fetched text equals the
    previously saved DL_FILE, i.e. there is nothing new to process.
    """
    txt = scrape_page(url)
    lines = txt.split('\n')
    # One flag per rewrite rule so each rule fires on the first matching
    # line only.
    flg = [False] * 4
    for i in range(len(lines)):
        if not flg[0] and lines[i].endswith('_杂志连载版'):
            lines[i] = lines[i].replace('_杂志连载版', '')
            flg[0] = True
        elif not flg[1] and lines[i].endswith('_SS'):
            lines[i] = lines[i].replace('_SS', '')
            flg[1] = True
        elif not flg[2] and lines[i].endswith('-Ordinary_days-'):
            lines[i] = lines[i].replace('-Ordinary_days-', ' 莉可丽丝 Ordinary days')
            flg[2] = True
        elif not flg[3] and lines[i].endswith('君若星辰'):
            lines[i] = lines[i].replace('君若星辰', '宛如星辰的你')
            flg[3] = True
    txt = '\n'.join(lines)
    # if the content has not changed, exit
    if os.path.exists(DL_FILE):
        with open(DL_FILE, 'r', encoding='utf-8') as f:
            old_txt = f.read()
        if old_txt == txt:
            print('[INFO] Exiting, no update found.')
            sys.exit(0)
    with open(DL_FILE, 'w', encoding='utf-8') as f:
        f.write(txt)
def parse_page(page_num: int, latest_post_link: str = None):
    """Parse one page of the review-post listing.

    Returns (entries, stop): entries is a list of
    [post_title, post_link, novel_title, novel_link] rows (titles wrapped
    in double quotes for naive CSV writing); stop is True when
    *latest_post_link* was reached, meaning everything older is already
    known. Page 1 also updates the module-global last_page and triggers
    get_latest() for the newest post.
    """
    params['page'] = page_num
    url = build_url_with_params(BASE_URL, params)
    txt = scrape_page(url)
    # print(txt)
    soup = BeautifulSoup(txt, 'html.parser')
    # The second 'grid' table on the page holds the post list.
    table = soup.find_all('table', class_='grid')[1]
    rows = table.find_all('tr')[1:] # skip header row
    # One flag per hard-coded novel-title override (applied once each).
    flg = [False] * 2
    entries = []
    for (i, tr) in enumerate(rows):
        cols = tr.find_all('td')
        if len(cols) < 2:
            continue
        a_post = cols[0].find('a')
        raw_title = a_post.text.strip()
        # Only posts whose title ends with ' epub' are download threads.
        if not raw_title.endswith(' epub'):
            continue
        post_title = raw_title[:-5] if raw_title.endswith(' epub') else raw_title
        post_link = a_post['href'] if a_post['href'].startswith('http') else urljoin(DOMAIN, a_post['href'])
        # Stop as soon as we reach the newest post already on disk.
        if latest_post_link is not None and post_link == latest_post_link:
            return entries, True # return entries collected so far, flag stop
        a_novel = cols[1].find('a')
        novel_title = a_novel.text.strip()
        novel_link = urljoin(DOMAIN, a_novel['href'])
        # Manual title fixes for two specific novels.
        if not flg[0] and novel_link.endswith('/2751.htm'):
            novel_title = '我们不可能成为恋人!绝对不行。(※似乎可行?)(我怎么可能成为你的恋人,不行不行!)'
            flg[0] = True
        if not flg[1] and novel_link.endswith('/3828.htm'):
            novel_title = 'Tier1姐妹 有名四姐妹没我就活不下去'
            flg[1] = True
        # Quote titles so commas inside them survive the naive CSV join.
        post_title = '"' + post_title + '"'
        novel_title = '"' + novel_title + '"'
        entries.append([post_title, post_link, novel_title, novel_link])
        # The very first entry of page 1 is the newest post: fetch its dl list.
        if page_num == 1 and i == 0:
            get_latest(get_latest_url(post_link))
    if page_num == 1:
        # Discover the total page count from the 'last' pagination link.
        last = soup.find('a', class_='last')
        global last_page
        last_page = int(last.text) if last else 1
    return entries, False
def scrape():
    """Scrape all new posts and prepend them to POST_LIST_FILE.

    Reads the first data row of POST_LIST_FILE (the newest known post) so
    scraping can stop early once already-seen content is reached.
    """
    # Grab the first post_link stored in POST_LIST_FILE, if the file exists.
    latest_post_link = None
    try:
        with open(POST_LIST_FILE, 'r', encoding='utf-8') as f:
            next(f) # skip header
            first_line = next(f, '').strip()
            if first_line:
                latest_post_link = first_line.split(',')[1]
        file_exists = True
    except FileNotFoundError:
        file_exists = False
    all_entries = []
    stop = False
    # Scrape page 1 first (this also sets the module-global last_page).
    print('[INFO] scrape (1)')
    entries, found = parse_page(1, latest_post_link)
    all_entries.extend(entries)
    stop = found
    # Continue with the remaining pages until the known post is reached.
    page = 2
    while not stop and page <= last_page:
        print(f'[INFO] scrape ({page}/{last_page})')
        entries, found = parse_page(page, latest_post_link)
        all_entries.extend(entries)
        stop = found
        if stop:
            break
        page += 1
        # Polite random delay between page fetches.
        time.sleep(random.uniform(1, 3))
    if _scraper == 'steel':
        exit_steel() # close Steel session
    # Newest content goes first; prepend to the existing file.
    # with open(POST_LIST_FILE, 'w', encoding='utf-8', newline='') as f:
    #     f.write('post_title,post_link,novel_title,novel_link\n')
    #     for entry in all_entries:
    #         f.write(','.join(entry) + '\n')
    if not file_exists:
        with open(POST_LIST_FILE, 'w', encoding='utf-8', newline='') as f:
            f.write('post_title,post_link,novel_title,novel_link\n')
            for entry in all_entries:
                f.write(','.join(entry) + '\n')
    else:
        with open(POST_LIST_FILE, 'r+', encoding='utf-8', newline='') as f:
            # insert between header and first line
            lines = f.readlines()
            lines = lines[:1] + [','.join(entry) + '\n' for entry in all_entries] + lines[1:]
            f.seek(0)
            f.writelines(lines)
# ========== Data Processing ==========
def purify(text: str) -> str: # 只保留中文、英文和数字
text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
return text
# Digit values for simple Chinese numerals.
CN_NUM = { '零': 0, '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10 }
def chinese_to_arabic(cn: str) -> int:
    """Convert a simple Chinese numeral (0-99, e.g. '二十三') to an int.

    Unknown characters contribute 0. Only single-'十' forms are handled.
    """
    if cn == '十':
        return 10
    if cn.startswith('十'):
        # e.g. '十五' -> 15
        return 10 + CN_NUM.get(cn[1], 0)
    if cn.endswith('十'):
        # e.g. '二十' -> 20
        return CN_NUM.get(cn[0], 0) * 10
    if '十' in cn:
        # e.g. '二十三' -> 23
        tens, units = cn.split('十')[0], cn.split('十')[1]
        return CN_NUM.get(tens, 0) * 10 + CN_NUM.get(units, 0)
    return CN_NUM.get(cn, 0)
def replace_chinese_numerals(s: str) -> str:
    """Rewrite a '第<Chinese numeral>卷' marker as a bare arabic number.

    Example: '第三卷' -> first pass produces '第 3 卷', second pass strips
    the surrounding '第 ' / ' 卷' markers, leaving '3'. Strings without a
    volume marker pass through unchanged.
    """
    cn_match = re.search(r'第([一二三四五六七八九十零]{1,3})卷', s)
    if cn_match:
        numeral = cn_match.group(1)
        s = s.replace(numeral, f' {chinese_to_arabic(numeral)} ')
    if re.search(r'第 (\S+) 卷', s):
        s = s.replace('第 ', '').replace(' 卷', '')
    return s
# Txt titles too generic to match a post reliably; skipped during matching.
IGNORED_TITLES = ['时间', '少女', '再见宣言', '强袭魔女', '秋之回忆', '秋之回忆2', '魔王', '青梅竹马', '弹珠汽水']
def merge():
    """Join the post list, the dl text list and the txt list into MERGED_CSV.

    Matching is done on "purified" titles (purify()): first the main
    title, then the parenthesised alternative title. Each post row is
    consumed at most once (txt_matched flag). Also sets the module-global
    _prefix (the lanzou download host) used by the HTML generators.
    """
    df_post = pd.read_csv(POST_LIST_FILE, encoding='utf-8')
    # Keep only the newest post per novel (file is ordered newest-first).
    df_post.drop_duplicates(subset=['novel_title'], keep='first', inplace=True)
    df_post.reset_index(drop=True, inplace=True)
    # Volume string extracted from the post title, e.g. '第三卷' -> '3'.
    df_post['volume'] = df_post['post_title'].apply(replace_chinese_numerals)
    # df_post['post_main'] = df_post['novel_title'].apply(lambda x: x[:x.rfind('(')] if x[-1] == ')' else x)
    # Alternative title: contents of a trailing parenthesised segment.
    df_post['post_alt'] = df_post['novel_title'].apply(lambda x: x[x.rfind('(')+1:-1] if x[-1] == ')' else "")
    df_post['post_pure'] = df_post['novel_title'].apply(purify)
    df_post['post_alt_pure'] = df_post['post_alt'].apply(purify)
    df_post.drop(columns=['post_title'], inplace=True)
    df_post['dl_label'] = ""
    df_post['dl_pwd'] = ""
    df_post['dl_update'] = ""
    df_post['dl_remark'] = ""
    df_post['txt_matched'] = False
    # merge dl to post
    with open(DL_FILE, 'r', encoding='utf-8') as f:
        global _prefix
        _ = f.readlines()
        # First line looks like:
        # <html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;"> 网址前缀:wenku8.lanzov.com/
        # so the download host prefix is everything after the last ':'.
        _prefix = _[0].split(':')[-1].strip()
        # print(f"[DEBUG] DL prefix: {_prefix}")
        lines = _[2:]
    for line in lines:
        # Expected whitespace-separated columns: date, label, password,
        # [remark], ..., title (title is always the last token).
        parts = line.strip().split()
        if len(parts) < 4:
            continue
        mask = df_post['post_pure'].str.match(purify(parts[-1]))
        if mask.any():
            df_post.loc[mask, 'dl_update'] = parts[0]
            df_post.loc[mask, 'dl_label'] = parts[1]
            df_post.loc[mask, 'dl_pwd'] = parts[2]
            if len(parts) > 4:
                # A '更新'/'补全' prefix marks an update/completion remark.
                if parts[3][:2] == '更新' or parts[3][:2] == '补全':
                    df_post.loc[mask, 'dl_remark'] = parts[3][2:]
        # if mask.sum() > 1:
        #     print(f'[WARN] {mask.sum()} entries matched for {parts[3]}')
        # else:
        #     print(f'[WARN] Failed to match {parts[3]}')
    # merge post to txt
    df_txt = pd.read_csv(TXT_LIST_FILE, encoding='utf-8')
    # Column positions noted for the .iloc writes below.
    df_txt['txt_pure'] = df_txt['title'].apply(purify) # 4
    df_txt['volume'] = '' # 5
    df_txt['dl_label'] = '' # 6
    df_txt['dl_pwd'] = '' # 7
    df_txt['dl_update'] = None # 8
    df_txt['dl_remark'] = '' # 9
    df_txt['novel_title'] = '' # 10
    df_txt['novel_link'] = '' # 11
    for i in range(len(df_txt)):
        _title = df_txt.iloc[i, 0]
        if _title in IGNORED_TITLES:
            continue
        # Match on purified main title first, restricted to unconsumed posts.
        mask = df_post['post_pure'].str.match(df_txt.iloc[i, 4]) & (df_post['txt_matched'] == False)
        match = None
        if mask.any():
            match = mask[mask].index[0]
            # if mask.sum() > 1:
            #     print(f'[WARN] {mask.sum()} entries matched for {_title}')
            #     for j in range(len(df_post)):
            #         if mask[j]:
            #             print(f'    {df_post.iloc[j]["novel_title"]}')
        else:
            # Fall back to the purified alternative title.
            mask = df_post['post_alt_pure'].str.match(df_txt.iloc[i, 4]) & (df_post['txt_matched'] == False)
            if mask.any():
                match = mask[mask].index[0]
                # if mask.sum() > 1:
                #     print(f'[WARN] {mask.sum()} entries matched for {_title}')
                #     for j in range(len(df_post)):
                #         if mask[j]:
                #             print(f'    {df_post.iloc[j]["novel_title"]}')
        if match is not None:
            # Copy the matched post's download metadata onto the txt row.
            df_txt.iloc[i, 5] = df_post.iloc[match]['volume']
            df_txt.iloc[i, 6] = df_post.iloc[match]['dl_label']
            df_txt.iloc[i, 7] = df_post.iloc[match]['dl_pwd']
            df_txt.iloc[i, 8] = df_post.iloc[match]['dl_update']
            df_txt.iloc[i, 9] = df_post.iloc[match]['dl_remark']
            df_txt.iloc[i, 10] = df_post.iloc[match]['novel_title']
            df_txt.iloc[i, 11] = df_post.iloc[match]['novel_link']
            df_post.iloc[match, -1] = True
    # Posts that never matched a txt row but do have a lanzou download
    # get appended as extra rows (no txt download available for them).
    _mask = df_post['txt_matched'] == False
    for y in df_post[_mask].itertuples():
        if y.dl_label == "":
            continue
        df_txt.loc[len(df_txt)] = ["", "", None, "", "", y.volume, y.dl_label, y.dl_pwd, y.dl_update, y.dl_remark, y.novel_title, y.novel_link]
    # Prefer the richer post-derived fields where they exist.
    df_txt['title'] = df_txt.apply(lambda x: x['novel_title'] if x['novel_title'] else x['title'], axis=1)
    df_txt['update'] = df_txt.apply(lambda x: x['dl_update'] if x['dl_update'] else x['date'], axis=1)
    df_txt['main'] = df_txt['title'].apply(lambda x: x[:x.rfind('(')] if x[-1] == ')' else x)
    df_txt['alt'] = df_txt['title'].apply(lambda x: x[x.rfind('(')+1:-1] if x[-1] == ')' else "")
    df_txt.drop(columns=['title', 'date', 'txt_pure', 'novel_title'], inplace=True)
    df_txt.sort_values(by=['update'], ascending=False, inplace=True)
    df_txt.to_csv(MERGED_CSV, index=False, encoding='utf-8-sig')
# ========== HTML Generation ==========
# GitHub "star" button iframe embedded in both generated pages' headers.
starme = '<iframe style="margin-left: 2px; margin-bottom:-5px;" frameborder="0" scrolling="0" width="81px" height="20px" src="https://ghbtns.com/github-btn.html?user=mojimoon&repo=wenku8&type=star&count=true" ></iframe>'
def create_table_merged(df):
    """Render each row of the merged dataframe as an HTML <tr> for index.html.

    NaN fields render as empty cells; the lanzou columns additionally
    depend on the module-global _prefix (set by merge()).
    """
    cells = []
    for _, rec in df.iterrows():
        main_title = rec['main']
        link = None if pd.isna(rec['novel_link']) else rec['novel_link']
        title_html = f'<a href="{link}" target="_blank">{main_title}</a>' if link else main_title
        alt = rec['alt']
        alt_html = f"<span class='at'>{alt}</span>" if not pd.isna(alt) else ''
        txt_url = rec['download_url']
        txt_dl = '' if pd.isna(txt_url) else f"<a href='{txt_url}' target='_blank'>下载</a> <a href='https://ghfast.top/{txt_url}' target='_blank'>镜像</a>"
        vol = rec['volume']
        volume = f'({vol})' if not pd.isna(vol) else ''
        rem = rec['dl_remark']
        remark = f" <span class='bt'>{rem}</span>" if not pd.isna(rem) else ''
        label = rec['dl_label']
        if pd.isna(label):
            lz_dl = ''
            lz_pwd = ''
        else:
            lz_dl = f"<a href='https://{_prefix}/{label}' target='_blank'>{volume}</a>{remark}"
            lz_pwd = rec['dl_pwd']
        upd = rec['update']
        date = '' if pd.isna(upd) else upd
        auth = rec['author']
        author = '' if pd.isna(auth) else auth
        cells.append(
            f"<tr><td>{title_html}{alt_html}</td>"
            f"<td class='au'>{author}</td><td>{lz_dl}</td><td>{lz_pwd}</td>"
            f"<td class='dl'>{txt_dl}</td><td class='yd'>{date}</td></tr>"
        )
    return ''.join(cells)
def create_html_merged():
    """Generate docs/index.html (full EPUB+TXT table) from MERGED_CSV.

    The page is emitted as one single-line HTML document; stylesheet and
    table script load from the project's gh-pages branch via jsDelivr.
    """
    df = pd.read_csv(MERGED_CSV, encoding='utf-8-sig')
    table = create_table_merged(df)
    today = time.strftime('%Y-%m-%d', time.localtime())
    html = (
        '<!DOCTYPE html><html lang="zh-CN"><head><meta charset="UTF-8">'
        '<meta name="viewport"content="width=device-width,initial-scale=1.0">'
        '<meta name="keywords"content="轻小说,sf轻小说,dmzj轻小说,日本轻小说,动漫小说,轻小说电子书,轻小说EPUB下载">'
        '<meta name="description"content="轻小说文库 EPUB 下载,支持搜索关键字、跳转至源站和蓝奏云下载,已进行移动端适配。">'
        '<meta name="author"content="mojimoon"><title>轻小说文库 EPUB 下载+</title>'
        f'<link rel="stylesheet"href="{JSDELIVR_CDN}style.css"></head><body>'
        '<h1 onclick="window.location.reload()">轻小说文库 EPUB 下载+</h1>'
        f'<h4>({today}) <a href="https://github.com/mojimoon">mojimoon</a>/<a href="https://github.com/mojimoon/wenku8">wenku8</a> {starme}</h4>'
        '<span>所有内容均收集于网络,仅供学习交流使用。'
        '特别感谢 <a href="https://www.wenku8.net/modules/article/reviewslist.php?keyword=8691&charset=utf-8">酷儿加冰</a> 和 <a href="https://github.com/ixinzhi">布客新知</a> 整理。</span>'
        '<span class="at">最新为 Calibre 生成 EPUB,括号内为最新卷数;年更为纯文本 EPUB。</span>'
        '<div class="right-controls"><a href="./epub.html">'
        '<button class="btn"id="gotoButton">切换到仅 EPUB 源,加载更快</button></a>'
        '<button class="btn"id="themeToggle">主题</button>'
        '<button class="btn"id="clearInput">清除</button></div>'
        '<div class="search-bar"><input type="text"id="searchInput"placeholder="搜索标题或作者">'
        '<button class="btn"id="randomButton">随机</button></div>'
        '<table><thead><tr><th>标题</th><th>作者</th><th>最新</th><th>密码</th><th>年更</th><th>更新</th></tr>'
        '</thead><tbody id="novelTableBody">'
        f'{table}</tbody></table><script src="{JSDELIVR_CDN}script_merged.js"></script>'
        '</body></html>'
    )
    with open(MERGED_HTML, 'w', encoding='utf-8') as f:
        f.write(html)
def create_table_epub(df):
    """Render each dataframe row as an HTML <tr> for the EPUB-only page.

    NaN fields render empty; the lanzou link uses the module-global
    _prefix (set by merge()). dl_pwd and update are emitted as-is.
    """
    out = []
    for _, rec in df.iterrows():
        main_title = rec['main']
        link = None if pd.isna(rec['novel_link']) else rec['novel_link']
        title_html = f'<a href="{link}" target="_blank">{main_title}</a>' if link else main_title
        alt = rec['alt']
        alt_html = '' if pd.isna(alt) else f"<span class='at'>{alt}</span>"
        vol = rec['volume']
        volume = '' if pd.isna(vol) else f'({vol})'
        rem = rec['dl_remark']
        remark = '' if pd.isna(rem) else f" <span class='bt'>{rem}</span>"
        label = rec['dl_label']
        lz_dl = '' if pd.isna(label) else f"<a href='https://{_prefix}/{label}' target='_blank'>{volume}</a>{remark}"
        auth = rec['author']
        author = '' if pd.isna(auth) else auth
        out.append(
            f"<tr><td>{title_html}{alt_html}</td>"
            f"<td class='au'>{author}</td><td>{lz_dl}</td><td>{rec['dl_pwd']}</td>"
            f"<td class='yd'>{rec['update']}</td></tr>"
        )
    return ''.join(out)
def create_html_epub():
    """Generate docs/epub.html (EPUB-only table) from MERGED_CSV.

    Rows without a lanzou label (dl_label is NaN) are filtered out, since
    this page only lists EPUB downloads.
    """
    df = pd.read_csv(MERGED_CSV, encoding='utf-8-sig')
    df = df[df['dl_label'].notna()]
    table = create_table_epub(df)
    today = time.strftime('%Y-%m-%d', time.localtime())
    html = (
        '<!DOCTYPE html><html lang="zh-CN"><head><meta charset="UTF-8">'
        '<meta name="viewport"content="width=device-width,initial-scale=1.0">'
        '<meta name="keywords"content="轻小说,sf轻小说,dmzj轻小说,日本轻小说,动漫小说,轻小说电子书,轻小说EPUB下载">'
        '<meta name="description"content="轻小说文库 EPUB 下载,支持搜索关键字、跳转至源站和蓝奏云下载,已进行移动端适配。">'
        '<meta name="author"content="mojimoon"><title>轻小说文库 EPUB 下载</title>'
        f'<link rel="stylesheet"href="{JSDELIVR_CDN}style.css"></head><body>'
        '<h1 onclick="window.location.reload()">轻小说文库 EPUB 下载</h1>'
        f'<h4>({today}) <a href="https://github.com/mojimoon">mojimoon</a>/<a href="https://github.com/mojimoon/wenku8">wenku8</a> {starme}</h4>'
        '<span>所有内容均收集于网络,仅供学习交流使用。'
        '特别感谢 <a href="https://www.wenku8.net/modules/article/reviewslist.php?keyword=8691&charset=utf-8">酷儿加冰</a> 整理。括号内为最新卷数。</span>'
        '<div class="right-controls"><a href="./index.html">'
        '<button class="btn"id="gotoButton">切换到 EPUB/TXT 源,内容更全</button></a>'
        '<button class="btn"id="themeToggle">主题</button>'
        '<button class="btn"id="clearInput">清除</button></div>'
        '<div class="search-bar"><input type="text"id="searchInput"placeholder="搜索标题或作者">'
        '<button class="btn"id="randomButton">随机</button></div>'
        '<table><thead><tr><th>标题</th><th>作者</th><th>蓝奏</th><th>密码</th><th>更新</th></tr>'
        '</thead><tbody id="novelTableBody">'
        f'{table}</tbody></table><script src="{JSDELIVR_CDN}script_merged.js"></script>'
        '</body></html>'
    )
    with open(EPUB_HTML, 'w', encoding='utf-8') as f:
        f.write(html)
def main():
    """Run the full pipeline: scrape posts, merge sources, regenerate both pages."""
    for directory in (OUT_DIR, PUBLIC_DIR):
        if not os.path.exists(directory):
            os.mkdir(directory)
    scrape()
    merge()
    create_html_merged()
    create_html_epub()
if __name__ == '__main__':
    # Optional CLI override of the scraper backend
    # ('requests' | 'playwright' | 'steel'); this top-level assignment
    # rebinds the module-global _scraper read by scrape_page().
    if len(sys.argv) > 1:
        _scraper = sys.argv[1]
    print(f'[INFO] Using scraper: {_scraper}')
    main()