-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path1ecb20d1.html
More file actions
369 lines (261 loc) · 28.9 KB
/
1ecb20d1.html
File metadata and controls
369 lines (261 loc) · 28.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222"><meta name="generator" content="Hexo 6.3.0">
<link rel="apple-touch-icon" sizes="180x180" href="/images/apple-touch-icon-next.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon.png">
<link rel="mask-icon" href="/images/logo.svg" color="#222">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css" integrity="sha256-HtsXJanqjKTc8vVQjO4YMhiqFoXkfBsjBWcX91T1jr8=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
<script class="next-config" data-name="main" type="application/json">{"hostname":"tallate.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":false,"version":"8.18.0","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"prism":false,"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果:${query}","hits_time":"找到 ${hits} 个搜索结果(用时 ${time} 毫秒)","hits":"找到 ${hits} 个搜索结果"}}</script><script src="/js/config.js"></script>
<meta name="description" content="经常想搜某个文件夹、站点内的所有内容,而网页会套网页、文件夹会套文件夹,没想到有什么特别好的搜索工具能递归地找出所有内容,于是想到要不把数据都爬到es再提供全文查询,形成自己的网页知识库。">
<meta property="og:type" content="article">
<meta property="og:title" content="建立网页索引">
<meta property="og:url" content="https://tallate.github.io/1ecb20d1.html">
<meta property="og:site_name" content="Tallate">
<meta property="og:description" content="经常想搜某个文件夹、站点内的所有内容,而网页会套网页、文件夹会套文件夹,没想到有什么特别好的搜索工具能递归地找出所有内容,于是想到要不把数据都爬到es再提供全文查询,形成自己的网页知识库。">
<meta property="og:locale" content="zh_CN">
<meta property="article:published_time" content="2020-12-25T03:12:18.000Z">
<meta property="article:modified_time" content="2025-07-06T17:56:20.908Z">
<meta property="article:author" content="tallate">
<meta property="article:tag" content="爬虫">
<meta property="article:tag" content="Elasticsearch">
<meta name="twitter:card" content="summary">
<link rel="canonical" href="https://tallate.github.io/1ecb20d1.html">
<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://tallate.github.io/1ecb20d1.html","path":"/1ecb20d1.html","title":"建立网页索引"}</script>
<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>建立网页索引 | Tallate</title>
<noscript>
<link rel="stylesheet" href="/css/noscript.css">
</noscript>
</head>
<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
<div class="headband"></div>
<main class="main">
<div class="column">
<header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
<div class="site-nav-toggle">
<div class="toggle" aria-label="切换导航栏" role="button">
<span class="toggle-line"></span>
<span class="toggle-line"></span>
<span class="toggle-line"></span>
</div>
</div>
<div class="site-meta">
<a href="/" class="brand" rel="start">
<i class="logo-line"></i>
<p class="site-title">Tallate</p>
<i class="logo-line"></i>
</a>
<p class="site-subtitle" itemprop="description">该吃吃该喝喝 啥事别往心里搁</p>
</div>
<div class="site-nav-right">
<div class="toggle popup-trigger" aria-label="搜索" role="button">
<i class="fa fa-search fa-fw fa-lg"></i>
</div>
</div>
</div>
<nav class="site-nav">
<ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li><li class="menu-item menu-item-about"><a href="/about/" rel="section"><i class="fa fa-user fa-fw"></i>关于</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签<span class="badge">84</span></a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类<span class="badge">25</span></a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档<span class="badge">192</span></a></li>
<li class="menu-item menu-item-search">
<a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
</a>
</li>
</ul>
</nav>
<div class="search-pop-overlay">
<div class="popup search-popup"><div class="search-header">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<div class="search-input-container">
<input autocomplete="off" autocapitalize="off" maxlength="80"
placeholder="搜索..." spellcheck="false"
type="search" class="search-input" aria-label="搜索">
</div>
<span class="popup-btn-close" role="button">
<i class="fa fa-times-circle"></i>
</span>
</div>
<div class="search-result-container no-result">
<div class="search-result-icon">
<i class="fa fa-spinner fa-pulse fa-5x"></i>
</div>
</div>
</div>
</div>
</header>
<aside class="sidebar">
<div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
<ul class="sidebar-nav">
<li class="sidebar-nav-toc">
文章目录
</li>
<li class="sidebar-nav-overview">
站点概览
</li>
</ul>
<div class="sidebar-panel-container">
<!--noindex-->
<div class="post-toc-wrap sidebar-panel">
<div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#%E4%BD%BF%E7%94%A8"><span class="nav-number">1.</span> <span class="nav-text">使用</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#HTTP"><span class="nav-number">1.1.</span> <span class="nav-text">HTTP</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#postman"><span class="nav-number">1.2.</span> <span class="nav-text">postman</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E7%88%AC%E8%99%AB%E4%BB%A3%E7%A0%81"><span class="nav-number">2.</span> <span class="nav-text">爬虫代码</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E7%B4%A2%E5%BC%95%E5%BB%BA%E7%AB%8B%E4%BB%A3%E7%A0%81"><span class="nav-number">3.</span> <span class="nav-text">索引建立代码</span></a></li></ol></div>
</div>
<!--/noindex-->
<div class="site-overview-wrap sidebar-panel">
<div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
<p class="site-author-name" itemprop="name">tallate</p>
<div class="site-description" itemprop="description"></div>
</div>
<div class="site-state-wrap animated">
<nav class="site-state">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">192</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/">
<span class="site-state-item-count">25</span>
<span class="site-state-item-name">分类</span></a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/">
<span class="site-state-item-count">84</span>
<span class="site-state-item-name">标签</span></a>
</div>
</nav>
</div>
</div>
</div>
</div>
</aside>
</div>
<div class="main-inner post posts-expand">
<div class="post-block">
<article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="https://tallate.github.io/1ecb20d1.html">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar.gif">
<meta itemprop="name" content="tallate">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Tallate">
<meta itemprop="description" content="">
</span>
<span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
<meta itemprop="name" content="建立网页索引 | Tallate">
<meta itemprop="description" content="">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
建立网页索引
</h1>
<div class="post-meta-container">
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2020-12-25 11:12:18" itemprop="dateCreated datePublished" datetime="2020-12-25T11:12:18+08:00">2020-12-25</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar-check"></i>
</span>
<span class="post-meta-item-text">更新于</span>
<time title="修改时间:2025-07-07 01:56:20" itemprop="dateModified" datetime="2025-07-07T01:56:20+08:00">2025-07-07</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-folder"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/%E8%AE%BE%E8%AE%A1/" itemprop="url" rel="index"><span itemprop="name">设计</span></a>
</span>
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody"><p>经常想搜索某个文件夹或站点内的所有内容,但网页会套网页、文件夹会套文件夹,一时想不到有什么特别好的搜索工具能递归地找出所有内容,于是想到不如把数据都爬取到 ES 中,再提供全文查询,形成自己的网页知识库。</p>
<span id="more"></span>
<p><strong>项目:<a target="_blank" rel="noopener" href="https://github.com/tallate/docindex">tallate / docindex</a></strong></p>
<h1 id="使用"><a href="#使用" class="headerlink" title="使用"></a>使用</h1><p>需要先搭建ES环境,可以参考我记录的ES使用方法:<a href="https://tallate.github.io/c395b48b.html">使用 ES</a>。</p>
<h2 id="HTTP"><a href="#HTTP" class="headerlink" title="HTTP"></a>HTTP</h2><p>提供HTTP接口:PageCrawlController<br>抓取POST: localhost:8080/page/crawl</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br></pre></td><td class="code"><pre><span class="line">{</span><br><span class="line"> "url": "https://tallate.github.io/c395b48b.html",</span><br><span class="line"> "depth": 2</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>查询GET: localhost:8080/page/crawl</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">{</span><br><span class="line"> "searchKey": "使用 ES"</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h2 id="postman"><a href="#postman" class="headerlink" title="postman"></a>postman</h2><p>localhost:9200/page/page/_search</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">{</span><br><span class="line"> "from": 0,</span><br><span class="line"> "size": 5,</span><br><span class="line"> "query": {</span><br><span class="line"> "match": {</span><br><span class="line"> "content": "使用 ES"</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<h1 id="爬虫代码"><a href="#爬虫代码" class="headerlink" title="爬虫代码"></a>爬虫代码</h1><ol>
<li>读取网页内容<br>网页是html格式的,为了方便我直接选择用jsoup解析,网页中有很多标签,我需要做的就是读取这些标签中的内容,期间注意过滤无效的标签,比如style、script这种;<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br></pre></td><td class="code"><pre><span class="line">Elements allElements = doc.getAllElements();</span><br><span class="line">StringBuilder contentBuilder = new StringBuilder();</span><br><span class="line">for (Element e : allElements) {</span><br><span class="line"> // 一些特殊标签忽略,比如style这种</span><br><span class="line"> if (excludeTags.contains(e.tag().getName())) {</span><br><span class="line"> continue;</span><br><span class="line"> }</span><br><span class="line"> contentBuilder.append(" ").append(e.ownText());</span><br><span class="line">}</span><br><span class="line">page.setContent(contentBuilder.toString());</span><br></pre></td></tr></table></figure></li>
<li>递归读取<br>网页通过链接互相关联,读取网页后解析出其中的<code>&lt;a&gt;</code>、<code>&lt;link&gt;</code>即可得到关联网页的内容。<br>我这波忙活的主要目的其实也是尽量读取更多的数据供查询,不然和直接打开网页查询没有区别。<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br></pre></td><td class="code"><pre><span class="line">// 递归访问子链接</span><br><span class="line">Set&lt;String&gt; childUrls = Sets.newHashSet();</span><br><span class="line">Elements elements = doc.getAllElements();</span><br><span class="line">for (Element element : elements) {</span><br><span class="line"> List&lt;String&gt; as = getChildUrls(element, "a");</span><br><span class="line"> List&lt;String&gt; links = getChildUrls(element, "link");</span><br><span class="line"> childUrls.addAll(as);</span><br><span class="line"> childUrls.addAll(links);</span><br><span class="line">}</span><br><span class="line">// 过滤无效的</span><br><span class="line">childUrls = childUrls.stream()</span><br><span class="line"> .filter(u -&gt; isUrlValid(crawlId, u))</span><br><span class="line"> .collect(Collectors.toSet());</span><br><span class="line">log.info("找出的子链接, childUrls:{}", childUrls);</span><br><span class="line">page.getAs().addAll(childUrls);</span><br><span class="line">for (String childUrl : childUrls) {</span><br><span class="line"> // 递归爬</span><br><span class="line"> threadPool.execute(new CrawlTask(crawlId, childUrl, depth - 1));</span><br><span
class="line">}</span><br></pre></td></tr></table></figure></li>
</ol>
<h1 id="索引建立代码"><a href="#索引建立代码" class="headerlink" title="索引建立代码"></a>索引建立代码</h1><p>使用ES的代码就不多说了,这里只列一下用到的指令。<br>创建文档POST: localhost:9200/page/page/123</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br></pre></td><td class="code"><pre><span class="line">{</span><br><span class="line"> "as": [</span><br><span class="line"> "a",</span><br><span class="line"> "b"</span><br><span class="line"> ],</span><br><span class="line"> "title": "SSL证书选购",</span><br><span class="line"> "url": "https://buy.cloud.tencent.com/ssl?fromSource=ssl",</span><br><span class="line"> "content": " SSL证书选购 - 腾讯云 控制台 中国站 中国站 International tencent 腾讯开放平台 腾讯会议 DNSPod Discuz! 微信公众平台 腾讯优图 腾讯蓝鲸 腾讯企点 腾讯微云 腾讯文档 友情链接 Copyright © 2013-2020 Tencent Cloud. All Rights Reserved. 腾讯云 版权所有 京公网安备 11010802017518 粤B2-20090059-1 域名注册服务机构批复文号:京信信管发〔2018〕156号 鲁通管〔2019〕83号 粤通业函〔2018〕268号 代理域名注册服务机构:烟台帝思普网络科技有限公司(DNSPod) 新网数码 广州云讯信息科技有限公司 中国站 中国站 International"</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<p>搜索文档GET: localhost:9200/page/page/_search</p>
<figure class="highlight plaintext"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br><span class="line">4</span><br><span class="line">5</span><br><span class="line">6</span><br><span class="line">7</span><br><span class="line">8</span><br><span class="line">9</span><br><span class="line">10</span><br><span class="line">11</span><br><span class="line">12</span><br><span class="line">13</span><br><span class="line">14</span><br><span class="line">15</span><br><span class="line">16</span><br><span class="line">17</span><br><span class="line">18</span><br><span class="line">19</span><br><span class="line">20</span><br><span class="line">21</span><br><span class="line">22</span><br><span class="line">23</span><br><span class="line">24</span><br><span class="line">25</span><br><span class="line">26</span><br><span class="line">27</span><br><span class="line">28</span><br><span class="line">29</span><br><span class="line">30</span><br><span class="line">31</span><br><span class="line">32</span><br><span class="line">33</span><br><span class="line">34</span><br><span class="line">35</span><br><span class="line">36</span><br><span class="line">37</span><br><span class="line">38</span><br><span class="line">39</span><br><span class="line">40</span><br><span class="line">41</span><br><span class="line">42</span><br><span class="line">43</span><br><span class="line">44</span><br><span class="line">45</span><br><span class="line">46</span><br><span class="line">47</span><br><span class="line">48</span><br><span class="line">49</span><br><span class="line">50</span><br></pre></td><td class="code"><pre><span class="line">{</span><br><span class="line"> "from": 0,</span><br><span class="line"> "size": 100,</span><br><span class="line"> "query": {</span><br><span class="line"> "bool": {</span><br><span class="line"> "should": [</span><br><span class="line"> {</span><br><span class="line"> "multi_match": 
{</span><br><span class="line"> "query": "建立网页索引",</span><br><span class="line"> "fields": [</span><br><span class="line"> "title"</span><br><span class="line"> ],</span><br><span class="line"> "type": "best_fields",</span><br><span class="line"> "operator": "OR",</span><br><span class="line"> "slop": 0,</span><br><span class="line"> "prefix_length": 0,</span><br><span class="line"> "max_expansions": 50,</span><br><span class="line"> "zero_terms_query": "NONE",</span><br><span class="line"> "auto_generate_synonyms_phrase_query": true,</span><br><span class="line"> "fuzzy_transpositions": true,</span><br><span class="line"> "boost": 3</span><br><span class="line"> }</span><br><span class="line"> },</span><br><span class="line"> {</span><br><span class="line"> "multi_match": {</span><br><span class="line"> "query": "建立网页索引",</span><br><span class="line"> "fields": [</span><br><span class="line"> "content"</span><br><span class="line"> ],</span><br><span class="line"> "type": "best_fields",</span><br><span class="line"> "operator": "OR",</span><br><span class="line"> "slop": 0,</span><br><span class="line"> "prefix_length": 0,</span><br><span class="line"> "max_expansions": 50,</span><br><span class="line"> "zero_terms_query": "NONE",</span><br><span class="line"> "auto_generate_synonyms_phrase_query": true,</span><br><span class="line"> "fuzzy_transpositions": true,</span><br><span class="line"> "boost": 1</span><br><span class="line"> }</span><br><span class="line"> }</span><br><span class="line"> ],</span><br><span class="line"> "adjust_pure_negative": true,</span><br><span class="line"> "boost": 1</span><br><span class="line"> }</span><br><span class="line"> },</span><br><span class="line"> "_source": {</span><br><span class="line"> "includes": [],</span><br><span class="line"> "excludes": []</span><br><span class="line"> }</span><br><span class="line">}</span><br></pre></td></tr></table></figure>
<ul>
<li>搜索时,因为我更倾向于匹配title,因此用bool+multi_match查询多个字段,并用boost指定title的权重更高。</li>
<li>需要注意的是elasticsearch客户端的版本要和服务器的版本保持一致,不然可能发生各种问题,比如某些参数不支持之类的。</li>
</ul>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/kity@2.0.4/dist/kity.min.js"></script><script type="text/javascript" src="https://cdn.jsdelivr.net/npm/kityminder-core@1.4.50/dist/kityminder.core.min.js"></script><script defer="true" type="text/javascript" src="https://cdn.jsdelivr.net/npm/hexo-simple-mindmap@0.6.0/dist/mindmap.min.js"></script><link rel="stylesheet" type="text/css" href="https://cdn.jsdelivr.net/npm/hexo-simple-mindmap@0.6.0/dist/mindmap.min.css">
</div>
<footer class="post-footer">
<div class="post-tags">
<a href="/tags/%E7%88%AC%E8%99%AB/" rel="tag"># 爬虫</a>
<a href="/tags/Elasticsearch/" rel="tag"># Elasticsearch</a>
</div>
<div class="post-nav">
<div class="post-nav-item">
<a href="/86e377c4.html" rel="prev" title="ES4_1集群原理">
<i class="fa fa-angle-left"></i> ES4_1集群原理
</a>
</div>
<div class="post-nav-item">
<a href="/e06b1d87.html" rel="next" title="设计秒杀系统">
设计秒杀系统 <i class="fa fa-angle-right"></i>
</a>
</div>
</div>
</footer>
</article>
</div>
</div>
</main>
<footer class="footer">
<div class="footer-inner">
<div class="copyright">
©
<span itemprop="copyrightYear">2025</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">tallate</span>
</div>
<div class="powered-by">由 <a href="https://hexo.io/" rel="noopener" target="_blank">Hexo</a> &amp; <a href="https://theme-next.js.org/" rel="noopener" target="_blank">NexT.Gemini</a> 强力驱动
</div>
</div>
</footer>
<div class="back-to-top" role="button" aria-label="返回顶部">
<i class="fa fa-arrow-up fa-lg"></i>
<span>0%</span>
</div>
<a href="https://github.com/tallate" class="github-corner" title="在 GitHub 上关注我" aria-label="在 GitHub 上关注我" rel="noopener" target="_blank"><svg width="80" height="80" viewBox="0 0 250 250" aria-hidden="true"><path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path><path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2" fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path><path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z" fill="currentColor" class="octo-body"></path></svg></a>
<noscript>
<div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>
<script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/hexo-generator-searchdb/1.4.1/search.js" integrity="sha256-1kfA5uHPf65M5cphT2dvymhkuyHPQp5A53EGZOnOLmc=" crossorigin="anonymous"></script>
<script src="/js/third-party/search/local-search.js"></script>
<script class="next-config" data-name="mermaid" type="application/json">{"enable":true,"version":"7.1.2","options":null,"js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mermaid/10.3.0/mermaid.min.js","integrity":"sha256-9y71g5Lz/KLsHjB8uXwnkuWDtAMDSzD/HdIbqhJfTAI="}}</script>
<script src="/js/third-party/tags/mermaid.js"></script>
</body>
</html>