Skip to content

Commit 1a99ce6

Browse files
authored
src/search refactor + new endpoint: AI Search Autocomplete (#52822)
1 parent c0c5b6a commit 1a99ce6

File tree

92 files changed

+3697
-2454
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

92 files changed

+3697
-2454
lines changed

.github/workflows/index-autocomplete-elasticsearch.yml .github/workflows/index-autocomplete-search.yml

+11-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
name: Index autocomplete Elasticsearch
1+
name: Index autocomplete search in Elasticsearch
22

3-
# **What it does**: Indexes autocomplete data into Elasticsearch.
4-
# **Why we have it**: So we can power the API for autocomplete.
3+
# **What it does**: Indexes autocomplete data (general and AI search) into Elasticsearch.
4+
# **Why we have it**: So we can power the APIs for autocomplete.
55
# **Who does it impact**: docs-engineering
66

77
on:
@@ -10,7 +10,7 @@ on:
1010
- cron: '20 16 * * *' # Run every day at 16:20 UTC / 8:20 PST
1111
pull_request:
1212
paths:
13-
- .github/workflows/index-autocomplete-elasticsearch.yml
13+
- .github/workflows/index-autocomplete-search.yml
1414
- 'src/search/scripts/index/**'
1515
- 'package*.json'
1616

@@ -40,10 +40,15 @@ jobs:
4040
if: ${{ github.event_name == 'pull_request' }}
4141
run: curl --fail --retry-connrefused --retry 5 -I http://localhost:9200
4242

43-
- name: Run indexing
43+
- name: Run general auto-complete indexing
4444
env:
4545
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
46-
run: npm run index -- autocomplete docs-internal-data
46+
run: npm run index-general-autocomplete -- docs-internal-data
47+
48+
- name: Run AI search auto-complete indexing
49+
env:
50+
ELASTICSEARCH_URL: ${{ github.event_name == 'pull_request' && 'http://localhost:9200' || secrets.ELASTICSEARCH_URL }}
51+
run: npm run index-ai-search-autocomplete -- docs-internal-data
4752

4853
- uses: ./.github/actions/slack-alert
4954
if: ${{ failure() && github.event_name == 'schedule' }}

.github/workflows/sync-search-pr.yml .github/workflows/index-general-search-pr.yml

+9-14
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
name: Sync search - PR
1+
name: Index general search in Elasticsearch on PR
22

3-
# **What it does**: This does what `sync-sarch-elasticsearch.yml` does but
3+
# **What it does**: This does what `index-general-search-elasticsearch.yml` does but
44
# with a localhost Elasticsearch and only for English.
55
# **Why we have it**: To test that the script works and the popular pages JSON is valid.
66
# **Who does it impact**: Docs engineering
@@ -11,8 +11,8 @@ on:
1111
paths:
1212
- 'src/search/**'
1313
- 'package*.json'
14-
# Ultimately, for debugging this workflow itself
15-
- .github/workflows/sync-search-pr.yml
14+
# For debugging this workflow
15+
- .github/workflows/index-general-search-pr.yml
1616
# Make sure we run this if the composite action changes
1717
- .github/actions/setup-elasticsearch/action.yml
1818

@@ -25,9 +25,6 @@ concurrency:
2525
cancel-in-progress: true
2626

2727
env:
28-
# Yes, it's hardcoded but it makes all the steps look exactly the same
29-
# as they do in `sync-search-elasticsearch.yml` where it uses
30-
# that `${{ env.ELASTICSEARCH_URL }}`
3128
ELASTICSEARCH_URL: http://localhost:9200
3229
# Since we'll run in NODE_ENV=production, we need to be explicit that
3330
# we don't want Hydro configured.
@@ -63,7 +60,7 @@ jobs:
6360
env:
6461
ENABLE_DEV_LOGGING: false
6562
run: |
66-
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
63+
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
6764
6865
# first sleep to give it a chance to start
6966
sleep 6
@@ -88,15 +85,13 @@ jobs:
8885
# let's just accept an empty string instead.
8986
THROW_ON_EMPTY: false
9087

91-
# The sync-search-index recognizes this env var if you don't
92-
# use the `--docs-internal-data <PATH>` option.
9388
DOCS_INTERNAL_DATA: docs-internal-data
9489

9590
run: |
9691
mkdir /tmp/records
97-
npm run sync-search-indices -- /tmp/records \
92+
npm run general-search-scrape -- /tmp/records \
9893
--language en \
99-
--version dotcom
94+
--version fpt
10095
10196
ls -lh /tmp/records
10297
@@ -106,9 +101,9 @@ jobs:
106101
107102
- name: Index into Elasticsearch
108103
run: |
109-
npm run index-elasticsearch -- /tmp/records \
104+
npm run index-general-search -- /tmp/records \
110105
--language en \
111-
--version dotcom
106+
--version fpt
112107
113108
- name: Check created indexes and aliases
114109
run: |

.github/workflows/sync-search-elasticsearch.yml .github/workflows/index-general-search.yml

+5-7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Sync search Elasticsearch
1+
name: Index general search in Elasticsearch
22

33
# **What it does**: It scrapes the whole site and dumps the records in a
44
# temp directory. Then it indexes that into Elasticsearch.
@@ -140,7 +140,7 @@ jobs:
140140
env:
141141
ENABLE_DEV_LOGGING: false
142142
run: |
143-
npm run sync-search-server > /tmp/stdout.log 2> /tmp/stderr.log &
143+
npm run general-search-scrape-server > /tmp/stdout.log 2> /tmp/stderr.log &
144144
145145
# first sleep to give it a chance to start
146146
sleep 6
@@ -169,13 +169,11 @@ jobs:
169169
# the same as not set within the script.
170170
VERSION: ${{ inputs.version }}
171171

172-
# The sync-search-index recognizes this env var if you don't
173-
# use the `--docs-internal-data <PATH>` option.
174172
DOCS_INTERNAL_DATA: docs-internal-data
175173

176174
run: |
177175
mkdir /tmp/records
178-
npm run sync-search-indices -- /tmp/records \
176+
npm run general-search-scrape -- /tmp/records \
179177
--language ${{ matrix.language }}
180178
181179
ls -lh /tmp/records
@@ -186,12 +184,12 @@ jobs:
186184
187185
- name: Index into Elasticsearch
188186
env:
189-
# Must match what we used when scraping (npm run sync-search-indices)
187+
# Must match what we used when scraping (npm run general-search-scrape)
190188
# otherwise the script will seek other versions from disk that might
191189
# not exist.
192190
VERSION: ${{ inputs.version }}
193191
run: |
194-
npm run index-elasticsearch -- /tmp/records \
192+
npm run index-general-search -- /tmp/records \
195193
--language ${{ matrix.language }} \
196194
--stagger-seconds 5 \
197195
--retries 5

.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,9 @@ assets/images/help/writing/unordered-list-rendered (1).png
5151

5252
# Used by precompute-pageinfo
5353
.pageinfo-cache.json.br
54+
55+
# Cloned and used for indexing Elasticsearch data
56+
docs-internal-data/
57+
58+
# For intermediate data (like scraping for Elasticsearch indexing)
59+
tmp/

package-lock.json

+27
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

+13-7
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"exports": "./src/frame/server.ts",
1818
"scripts": {
1919
"all-documents": "tsx src/content-render/scripts/all-documents/cli.ts",
20-
"analyze-text": "node src/search/scripts/analyze-text.js",
20+
"analyze-text": "tsx src/search/scripts/analyze-text.ts",
2121
"analyze-comment": "tsx src/events/scripts/analyze-comment-cli.ts",
2222
"archive-version": "tsx --max-old-space-size=16384 src/ghes-releases/scripts/archive-version.ts",
2323
"audit-log-sync": "tsx src/audit-logs/scripts/sync.ts",
@@ -39,8 +39,14 @@
3939
"find-unused-variables": "tsx src/content-linter/scripts/find-unsed-variables.ts",
4040
"fixture-dev": "cross-env ROOT=src/fixtures/fixtures npm start",
4141
"fixture-test": "cross-env ROOT=src/fixtures/fixtures npm test -- src/fixtures/tests",
42-
"index": "tsx src/search/scripts/index/index.ts",
43-
"index-elasticsearch": "node src/search/scripts/index-elasticsearch.js",
42+
"general-search-scrape": "tsx src/search/scripts/scrape/scrape-cli.ts",
43+
"general-search-scrape-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
44+
"ghes-release-scrape-with-server": "cross-env GHES_RELEASE=1 start-server-and-test general-search-scrape-server 4002 general-search-scrape",
45+
"general-search-scrape-with-server": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test general-search-scrape-server 4002 general-search-scrape",
46+
"index": "tsx src/search/scripts/index/index-cli autocomplete docs-internal-data",
47+
"index-ai-search-autocomplete": "tsx src/search/scripts/index/index-cli ai-search-autocomplete",
48+
"index-general-autocomplete": "tsx src/search/scripts/index/index-cli general-autocomplete",
49+
"index-general-search": "tsx src/search/scripts/index/index-cli general-search",
4450
"index-test-fixtures": "./src/search/scripts/index-test-fixtures.sh",
4551
"lint": "eslint '**/*.{js,mjs,ts,tsx}'",
4652
"lint-content": "node src/content-linter/scripts/lint-content.js",
@@ -70,10 +76,6 @@
7076
"start-for-playwright": "cross-env ROOT=src/fixtures/fixtures TRANSLATIONS_FIXTURE_ROOT=src/fixtures/fixtures/translations ENABLED_LANGUAGES=en,ja NODE_ENV=test tsx src/frame/server.ts",
7177
"symlink-from-local-repo": "node src/early-access/scripts/symlink-from-local-repo.js",
7278
"sync-rest": "tsx src/rest/scripts/update-files.ts",
73-
"sync-search": "cross-env NODE_OPTIONS='--max_old_space_size=8192' start-server-and-test sync-search-server 4002 sync-search-indices",
74-
"sync-search-ghes-release": "cross-env GHES_RELEASE=1 start-server-and-test sync-search-server 4002 sync-search-indices",
75-
"sync-search-indices": "node src/search/scripts/sync-search-indices.js",
76-
"sync-search-server": "cross-env NODE_ENV=production PORT=4002 MINIMAL_RENDER=true CHANGELOG_DISABLED=true tsx src/frame/server.ts",
7779
"sync-secret-scanning": "tsx src/secret-scanning/scripts/sync.ts",
7880
"sync-webhooks": "npx tsx src/rest/scripts/update-files.ts -o webhooks",
7981
"test": "vitest",
@@ -222,6 +224,7 @@
222224
"src/open-source/scripts/add-pr-links.js",
223225
"src/open-source/scripts/pr-link-source.js",
224226
"rest-api-description/",
227+
"docs-internal-data/",
225228
"src/code-scanning/scripts/generate-code-scanning-query-list.ts"
226229
]
227230
},
@@ -327,10 +330,13 @@
327330
"@octokit/rest": "21.0.2",
328331
"@playwright/test": "^1.48.1",
329332
"@types/accept-language-parser": "1.5.6",
333+
"@types/cheerio": "^0.22.35",
330334
"@types/connect-datadog": "0.0.10",
331335
"@types/connect-timeout": "0.0.39",
332336
"@types/cookie": "0.6.0",
333337
"@types/cookie-parser": "1.4.7",
338+
"@types/elasticsearch": "^5.0.43",
339+
"@types/event-to-promise": "^0.7.5",
334340
"@types/express": "4.17.21",
335341
"@types/imurmurhash": "^0.1.4",
336342
"@types/js-cookie": "^3.0.6",

src/fixtures/tests/breadcrumbs.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ describe('breadcrumbs', () => {
6868

6969
expect($breadcrumbTitles.length).toBe(0)
7070
expect($breadcrumbLinks.length).toBe(2)
71-
expect($breadcrumbLinks[0].attribs.title).toBe('Deeper secrets')
72-
expect($breadcrumbLinks[1].attribs.title).toBe('Mariana Trench')
71+
expect(($breadcrumbLinks[0] as cheerio.TagElement).attribs.title).toBe('Deeper secrets')
72+
expect(($breadcrumbLinks[1] as cheerio.TagElement).attribs.title).toBe('Mariana Trench')
7373
})
7474
})

src/frame/middleware/api.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { createProxyMiddleware } from 'http-proxy-middleware'
33

44
import events from '@/events/middleware.js'
55
import anchorRedirect from '@/rest/api/anchor-redirect.js'
6-
import search from '@/search/middleware/search.js'
6+
import search from '@/search/middleware/search-routes.js'
77
import pageInfo from '@/pageinfo/middleware'
88
import pageList from '@/pagelist/middleware'
99
import webhooks from '@/webhooks/middleware/webhooks.js'

src/frame/middleware/index.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ import fastlyCacheTest from './fastly-cache-test'
6161
import trailingSlashes from './trailing-slashes'
6262
import mockVaPortal from './mock-va-portal'
6363
import dynamicAssets from '@/assets/middleware/dynamic-assets'
64-
import contextualizeSearch from '@/search/middleware/contextualize.js'
64+
import generalSearchMiddleware from '@/search/middleware/general-search-middleware'
6565
import shielding from '@/shielding/middleware'
6666
import tracking from '@/tracking/middleware'
6767
import { MAX_REQUEST_TIMEOUT } from '@/frame/lib/constants.js'
@@ -275,7 +275,7 @@ export default function (app: Express) {
275275
app.use(asyncMiddleware(productExamples))
276276
app.use(asyncMiddleware(productGroups))
277277
app.use(asyncMiddleware(glossaries))
278-
app.use(asyncMiddleware(contextualizeSearch))
278+
app.use(asyncMiddleware(generalSearchMiddleware))
279279
app.use(asyncMiddleware(featuredLinks))
280280
app.use(asyncMiddleware(learningTrack))
281281

src/frame/tests/favicons.ts

+9-3
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@ describe('favicon assets', () => {
1515
expect(res.headers['cache-control']).toContain('public')
1616
expect(res.headers['cache-control']).toContain('immutable')
1717
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
18-
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
18+
const maxAgeSeconds = parseInt(
19+
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
20+
10,
21+
)
1922
// Let's not be too specific in the tests, just as long as it's testing
2023
// that it's a reasonably large number of seconds.
2124
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)
@@ -25,13 +28,16 @@ describe('favicon assets', () => {
2528
test('should serve a valid and aggressively caching /apple-touch-icon.png', async () => {
2629
const res = await get('/apple-touch-icon.png')
2730
expect(res.statusCode).toBe(200)
28-
expect(parseInt(res.headers['content-length'], 10)).toBeGreaterThan(0)
31+
expect(parseInt(res.headers['content-length'] || '', 10)).toBeGreaterThan(0)
2932
expect(res.headers['content-type']).toBe('image/png')
3033
expect(res.headers['set-cookie']).toBeUndefined()
3134
expect(res.headers['cache-control']).toContain('public')
3235
expect(res.headers['cache-control']).toContain('immutable')
3336
expect(res.headers['cache-control']).toMatch(/max-age=\d+/)
34-
const maxAgeSeconds = parseInt(res.headers['cache-control'].match(/max-age=(\d+)/)[1], 10)
37+
const maxAgeSeconds = parseInt(
38+
(res.headers['cache-control'] || '').match(/max-age=(\d+)/)?.[1] || '',
39+
10,
40+
)
3541
// Let's not be too specific in the tests, just as long as it's testing
3642
// that it's a reasonably large number of seconds.
3743
expect(maxAgeSeconds).toBeGreaterThanOrEqual(60 * 60)

src/frame/tests/manifest.ts

+3
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ describe('manifest', () => {
2020
test('download manifest from HTML and check content', async () => {
2121
const $ = await getDOM('/')
2222
const url = $('link[rel="manifest"]').attr('href')
23+
if (!url) {
24+
throw new Error('No manifest URL found')
25+
}
2326
const res = await get(url)
2427
expect(res.statusCode).toBe(200)
2528

src/ghes-releases/lib/release-templates/release-steps-1.md

+4-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ labels:
1717
- [Prerequisites](#prerequisites)
1818
- [Create publication branch for a new version of GHES](#creation)
1919
- [Resolve check failures](#check-failures)
20-
- [Sync the search indices](#sync-search-indices)
20+
- [Scrape the search indices](#scrape-search-indices)
2121
- [Maintain the publication branch](#maintenance)
2222
- [Complete preparation for the RC and publish the docset](#publication)
2323

@@ -110,11 +110,11 @@ For content from the OpenAPI schema, note the affected content with broken links
110110

111111
<br/>
112112

113-
<a name="sync-search-indices">
113+
<a name="scrape-search-indices">
114114

115-
### [🔎](#sync-search-indices) Sync the search indices
115+
### [🔎](#scrape-search-indices) Scrape the search indices
116116

117-
1. Go to the [`sync-search-elasticsearch` workflow](https://github.com/github/docs-internal/actions/workflows/sync-search-elasticsearch.yml) ([permalink](https://github.com/github/docs-internal/blob/f8ca45703c48c7d1976a278337bc3391fb14fe9e/.github/workflows/sync-search-elasticsearch.yml) in case it moves)
117+
1. Go to the [`index-general-search.yml` workflow](https://github.com/github/docs-internal/actions/workflows/index-general-search.yml)
118118
1. Click on the **Run workflow** drop down and set the following parameters:
119119
- `Branch:` set to the name of the publication branch
120120
- `Version` set to the version you're publishing (e.g., `ghes-3.12` if you're publishing GHES 3.12)

0 commit comments

Comments
 (0)