conf/rule.toml

[[websites]]
    netloc = "www.xbookcn.com"

    # 空 就是 None
    encoding = "big5"

    # body_page 是正文获取规则
    [websites.body_page]
        loader = 'aiohttp'

    [websites.body_page.title]
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'div[class=title_page]'
        kwargs.attributes = [['_get_text'],]  # 详见 parse_rule/lower_rule.LxmlCssSelectorRule

    [[websites.body_page.content]]
        type = 'BodyPageContentLxmlCssElementRule'
        kwargs.str_pattern = "div[class=content] > p"
        kwargs.attributes = [['_get_content'],]

    [websites.body_page.content_other_settings]
        traditional2simplified_chinese = true

    [[websites.body_page.next_page_url]]  # 原因是有的网站不光有下一章，还有下一页（更细粒度）
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'a[href=index\.htm] + a'
        kwargs.attributes = [['_get_attribute', 'href']]


    # index_page 是目录获取规则
    [websites.index_page]
        loader = 'aiohttp'

    [[websites.index_page.content]]  # 与正文页面统一
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = "div[class=TitleLinks] div[class=content] > a"
        kwargs.attributes = [['_get_attribute', 'href'], ['_get_text']]

    [[websites.index_page.next_page_url]]  # 与正文页面统一
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'none[class=none]'  # 随便写的，这里没有下一页
        kwargs.attributes = []


[[websites]]
    netloc = "m.haohengwx.com"

    [websites.body_page]
        loader = 'playwright'

    [websites.body_page.title]
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = 'h1[id=chaptertitle]'
        kwargs.attributes = [['_get_text'],]

    [[websites.body_page.content]]
        type = 'BodyPageContentPlaywrightElementRule'
        kwargs.str_pattern = "div[class=articlecontent] > div > p"
        kwargs.attributes = [['_get_content_with_img', 'img', 'src'],]

    [websites.body_page.content_other_settings]
        try_combine_fake_paragraphs = true  # 不少小说网站乱切分自然段，这里根据每个“自然段”最后一个 word 尝试进行合并

    [[websites.body_page.next_page_url]]  # 原因是有的网站不光有下一章，还有下一页（更细粒度）
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = '//ul/li[position()=4]/a[substring(@href, string-length(@href) - 3) = "html"][contains(text(), "下一页")]'
        kwargs.attributes = [['_get_attribute', 'href']]
    [[websites.body_page.next_page_url]]  # 原因是有的网站不光有下一章，还有下一页（更细粒度）
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = '//ul/li[position()=4]/a[substring(@href, string-length(@href) - 3) = "html"][contains(text(), "下一章")]'
        kwargs.attributes = [['_get_attribute', 'href']]


    [websites.index_page]
        loader = 'aiohttp'

    [[websites.index_page.content]]  # 与正文页面统一
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = "ul[class=chapters] > li > a"
        kwargs.attributes = [['_get_attribute', 'href'], ['_get_text']]

    [[websites.index_page.next_page_url]]  # 与正文页面统一
        type = 'LxmlXpathElementRule'
        kwargs.str_pattern = '//div[@class="page"]/a[contains(text(), "下一")]'
        kwargs.attributes = [['_get_attribute', 'href']]


[[websites]]
    netloc = "www.hejixs.com"
    encoding = "utf8"

    [websites.body_page]
        loader = 'playwright'

    [websites.body_page.title]
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = 'h1[class=page-title]'
        kwargs.attributes = [['_get_text'],]

    [[websites.body_page.content]]
        type = 'BodyPageContentPlaywrightElementRule'
        kwargs.str_pattern = "div[id=content] > div > p"
        kwargs.attributes = [['_get_content_with_img', 'img', 'src'],]

    [websites.body_page.content_other_settings]
        try_combine_fake_paragraphs = true  # 不少小说网站乱切分自然段，这里根据每个“自然段”最后一个 word 尝试进行合并

    [[websites.body_page.next_page_url]]  # 原因是有的网站不光有下一章，还有下一页（更细粒度）
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = 'center[class=chapterPages] > a[class=curr] + a'
        kwargs.attributes = [['_get_attribute', 'href']]
    [[websites.body_page.next_page_url]]  # 原因是有的网站不光有下一章，还有下一页（更细粒度）
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = 'a[class=next]'
        kwargs.attributes = [['_get_attribute', 'href']]


    [websites.index_page]
        loader = 'playwright'

    [[websites.index_page.content]]  # 与正文页面统一
        type = 'PlaywrightElementRule'
        kwargs.str_pattern = "div[class=container] > div:nth-child(9)  li > a"
        kwargs.attributes = [['_get_attribute', 'href'], ['_get_text']]

    [[websites.index_page.next_page_url]]  # 与正文页面统一
        type = 'PlaywrightClickRule'
        kwargs.str_pattern = 'a[class=nextPage]'


[[websites]]
    netloc = "www.kunnu.com"
    encoding = "utf8"

    [websites.body_page]
        loader = 'aiohttp'

    [websites.body_page.title]
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'h1[id=nr_title]'
        kwargs.attributes = [['_get_text'],]

    [[websites.body_page.content]]
        type = 'BodyPageContentLxmlCssElementRule'
        kwargs.str_pattern = "div[id=nr1]"
        kwargs.attributes = [['_get_content'],]

    [[websites.body_page.next_page_url]]
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'li[class=next] > a'
        kwargs.attributes = [['_get_attribute', 'href']]


    [websites.index_page]
        loader = 'aiohttp'

    [[websites.index_page.content]]  # 与正文页面统一
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = "div[class^=book-list]  li > a"
        kwargs.attributes = [['_get_attribute', 'href'], ['_get_text']]

    [[websites.index_page.next_page_url]]  # 与正文页面统一
        type = 'LxmlCssElementRule'
        kwargs.str_pattern = 'none[class=none]'  # 随便写的，这里没有下一页
        kwargs.attributes = []