
Commit a074391

Merge pull request #13 from mbelsky/playwright-draft

Implement Playwright scraper

2 parents f2448fd + 057165d, commit a074391
13 files changed: +384 −6 lines

.dockerignore

+1

@@ -33,3 +33,4 @@ Makefile
 **/cert.json
 **/.env*
 **/.nvmrc
+seccomp_profile.json

.github/workflows/build-push-docker.yml

+9 −1

@@ -9,10 +9,18 @@ jobs:
     steps:
       - name: Check out the repo
         uses: actions/checkout@v2
-      - name: Push to Docker Hub
+      - name: Push hltv-featured to Docker Hub
         uses: docker/[email protected]
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           repository: ${{ secrets.DOCKER_USERNAME }}/hltv-featured
           tag_with_ref: true
+      - name: Push hltv-featured-playwright to Docker Hub
+        uses: docker/[email protected]
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+          repository: ${{ secrets.DOCKER_USERNAME }}/hltv-featured-playwright
+          build_args: NODEIMG=mcr.microsoft.com/playwright:v1.31.0-focal
+          tag_with_ref: true
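The new step presumably reuses the same Dockerfile and only swaps the base image: build_args sets the NODEIMG build argument to Microsoft's Playwright image (which bundles Chromium and its system dependencies), so the published hltv-featured-playwright image can run the browser while the original hltv-featured image keeps the default Node base.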

.vscode/launch.json

+6

@@ -22,6 +22,12 @@
       "name": "Launch Scraper",
       "program": "${workspaceFolder}/src/scraper/scraper.js"
     },
+    {
+      "type": "node",
+      "request": "launch",
+      "name": "Launch Playwright Scraper",
+      "program": "${workspaceFolder}/src/playwright-scraper/playwright-scraper.js"
+    },
     {
       "type": "node",
       "request": "launch",

Makefile

+9

@@ -2,6 +2,7 @@ ROOT_DIR:=$(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
 COMMON_RUN_ARGS:=-it --env-file ./.env --env FIREBASE_CONFIG=/data/cert.json --mount type=bind,src=$(ROOT_DIR)/cert.json,dst=/data/cert.json,ro
 IMGV?=latest
 IMG:=mbelsky/hltv-featured:$(IMGV)
+PLAYWRIGHT_IMG:=mbelsky/hltv-featured-playwright:$(IMGV)
 
 .PHONY: build
 build:
@@ -22,3 +23,11 @@ bot:
 .PHONY: cron
 cron:
 	docker run --name=hltv-featured-cron $(COMMON_RUN_ARGS) --env CRON=true -d --restart on-failure:3 $(IMG)
+
+.PHONY: build-playwright
+build-playwright:
+	docker build --build-arg NODEIMG=mcr.microsoft.com/playwright:v1.31.0-focal -t $(PLAYWRIGHT_IMG) .
+
+.PHONY: playwright-scraper
+playwright-scraper:
+	docker run $(COMMON_RUN_ARGS) --rm --ipc=host --user pwuser --security-opt seccomp=seccomp_profile.json $(PLAYWRIGHT_IMG) node /app/src/playwright-scraper/playwright-scraper.js
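The playwright-scraper run flags follow Playwright's general guidance for running Chromium in Docker: --ipc=host avoids the renderer running out of shared memory, --user pwuser drops root inside the Playwright base image, and the custom seccomp profile (added below) opens up the syscalls the browser sandbox needs.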

README.md

+1

@@ -7,6 +7,7 @@ This is a set of nodejs applications to get data about upcoming CSGO matches and
 - `src/backend` handles subscribers requests
 - `src/notifier` deliveries data to subscribers
 - `src/scraper` parses HLTV.org/matches
+- `src/playwright-scraper` same as scraper but uses playwright
 
 ## Configure server
 

crond/crontab

+1 −1

@@ -1,2 +1,2 @@
 0 5 * * * docker start hltv-featured-notifier
-55 4-21 * * * docker start hltv-featured-scraper
+55 4-21 * * * docker start hltv-featured-playwright-scraper
(new shell script, filename not shown)

+10

@@ -0,0 +1,10 @@
+#! /bin/sh
+
+docker create \
+  --name=hltv-featured-playwright-scraper \
+  --env-file /???/.env \
+  --env FIREBASE_CONFIG=/data/cert.json \
+  --mount type=bind,src=/???/cert.json,dst=/data/cert.json,ro \
+  --ipc=host --user pwuser --security-opt seccomp=/???/seccomp_profile.json \
+  mbelsky/hltv-featured-playwright:v??? \
+  node /app/src/playwright-scraper/playwright-scraper.js

seccomp_profile.json

+8

@@ -0,0 +1,8 @@
+{
+  "comment": "Allow create user namespaces",
+  "names": ["clone", "setns", "unshare"],
+  "action": "SCMP_ACT_ALLOW",
+  "args": [],
+  "includes": {},
+  "excludes": {}
+}
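This appears to be the user-namespace rule from the seccomp profile Playwright recommends for Chromium in containers: it whitelists clone, setns and unshare so the browser sandbox can create user namespaces. The Makefile target and the docker create script above pass it with --security-opt seccomp=….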

src/playwright-scraper/get-html.js

+78

@@ -0,0 +1,78 @@
+const alerter = require('@hltvf/monitoring/alerter')
+const playwright = require('playwright-chromium')
+
+async function launchChromium() {
+  return await playwright.chromium.launch()
+}
+
+async function scrap(browser, url) {
+  const context = await browser.newContext({
+    acceptDownloads: false,
+    javaScriptEnabled: false,
+    locale: 'en-GB',
+  })
+
+  const page = await context.newPage()
+
+  page.setExtraHTTPHeaders({
+    accept:
+      'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'accept-language': 'en-US,en;q=0.9',
+    'cache-control': 'no-cache',
+    pragma: 'no-cache',
+    'sec-ch-ua':
+      '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"macOS"',
+    'sec-fetch-dest': 'document',
+    'sec-fetch-mode': 'navigate',
+    'sec-fetch-site': 'none',
+    'sec-fetch-user': '?1',
+    'upgrade-insecure-requests': '1',
+    'user-agent':
+      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
+  })
+
+  // Too lazy to remove these comments
+  // const requestPromise = page.waitForRequest(url, {
+  //   timeout: 5000,
+  // })
+  // const responsePromise = page.waitForResponse(url, {
+  //   timeout: 5000,
+  // })
+
+  // const [req, res] = await Promise.all([
+  //   requestPromise,
+  //   responsePromise,
+  //   page.goto(url),
+  // ])
+
+  // const resH = await res.allHeaders()
+  // const reqH = await req.allHeaders()
+  // const text = await res.text()
+
+  await page.goto(url)
+
+  const innerHtml = await page.content()
+  return innerHtml
+}
+
+async function getHtml(url) {
+  const browser = await launchChromium()
+
+  try {
+    const result = await scrap(browser, url)
+
+    return result
+  } finally {
+    try {
+      await browser.close()
+    } catch (e) {
+      alerter.error('Failed to close browser', e)
+    }
+  }
+}
+
+module.exports = {
+  getHtml,
+}
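A minimal usage sketch of the exported helper; the URL is just an illustrative value:

const { getHtml } = require('./get-html')

getHtml('https://www.hltv.org/matches')
  .then((html) => {
    // html is the serialized DOM returned by page.content()
    console.log(`fetched ${html.length} characters`)
  })
  .catch((error) => {
    console.error('failed to fetch page', error)
  })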

src/playwright-scraper/package.json

+10

@@ -0,0 +1,10 @@
+{
+  "name": "playwright-scraper",
+  "version": "0.1.0",
+  "main": "playwright-scraper.js",
+  "license": "GNU GPLv3",
+  "private": true,
+  "optionalDependencies": {
+    "playwright-chromium": "1.18.0"
+  }
+}
src/playwright-scraper/playwright-scraper.js

+69

@@ -0,0 +1,69 @@
+const alerter = require('@hltvf/monitoring/alerter')
+const log = require('@hltvf/monitoring/logger').logFabric(
+  'requests-playwright-scraper',
+)
+const htmlToMatches = require('@hltvf/scraper/parseHtml')
+
+const { getTeams } = require('common/getTeams')
+const {
+  removeOutdatedMatches,
+  saveFeaturedMatches,
+} = require('common/manageMatches')
+const { addTeams } = require('common/manageTeams')
+const { getHtml } = require('./get-html')
+
+const root = process.env.ROOT_URL || 'https://www.hltv.org'
+const url = process.env.PAGE_URL || 'https://www.hltv.org/matches'
+
+const logResult = (data, matches) => {
+  if (!matches.length) {
+    alerter.warn('Zero matches scraped. HTML:\n\n' + data)
+  }
+
+  const message = `Scraped & saved ${matches.length} matches`
+  console.log(new Date().toUTCString(), message)
+
+  if ('production' === process.env.NODE_ENV) {
+    log({
+      level: 'info',
+      data: {
+        message,
+      },
+    })
+  }
+}
+
+function scrap() {
+  log({
+    level: 'info',
+    data: {
+      message: 'Playwright-scraper has started',
+    },
+  })
+
+  return removeOutdatedMatches()
+    .catch(alerter.error)
+    .then(() => getHtml(url))
+    .then(
+      (data) => {
+        const matches = htmlToMatches(data, { root })
+        const teams = getTeams({ alerter, matches })
+
+        const addTeamsPromise = addTeams(teams).catch(alerter.error)
+        const saveMatchesPromise = saveFeaturedMatches(matches)
+          .then(() => logResult(data, matches))
+          .catch(alerter.error)
+
+        return Promise.all([addTeamsPromise, saveMatchesPromise])
+      },
+      (error) => {
+        alerter.error('Failed to get html', error)
+      },
+    )
+}
+
+if (require.main === module) {
+  scrap()
+} else {
+  module.exports = scrap
+}
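Because of the require.main check the entry point can be executed directly (as the Makefile target and the crontab entry do) or required as a module; a small sketch of the latter, with the path adjusted to the caller's location:

const scrap = require('./src/playwright-scraper/playwright-scraper')

// scrap() returns the promise chain, so callers can wait for completion
scrap().then(() => console.log('playwright scrape finished'))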

src/scraper/package.json

+1 −1

@@ -1,5 +1,5 @@
 {
-  "name": "scraper",
+  "name": "@hltvf/scraper",
   "version": "0.1.0",
   "main": "scraper.js",
   "license": "GNU GPLv3",
