|
| 1 | +const alerter = require('@hltvf/monitoring/alerter') |
| 2 | +const playwright = require('playwright-chromium') |
| 3 | + |
/**
 * Starts a Chromium instance through Playwright using the default launch options.
 *
 * @returns {Promise<object>} Whatever `playwright.chromium.launch()` resolves to.
 */
async function launchChromium() {
  const { chromium } = playwright
  const browser = await chromium.launch()
  return browser
}
| 7 | + |
/**
 * Loads `url` in a fresh browser context and returns the resulting page HTML.
 *
 * The context is created with downloads and JavaScript disabled and the
 * `en-GB` locale. The page sends a fixed set of desktop-Chrome-style request
 * headers. The context is always closed before this function settles; a
 * failure to close it is reported through `alerter` instead of being thrown.
 *
 * @param {object} browser - Browser instance exposing `newContext()`.
 * @param {string} url - Address handed to `page.goto`.
 * @returns {Promise<string>} The value resolved by `page.content()` after navigation.
 * @throws Rejects with whatever `newContext`, `newPage`, `setExtraHTTPHeaders`,
 *   `goto` or `content` rejects with.
 */
async function scrap(browser, url) {
  const context = await browser.newContext({
    acceptDownloads: false,
    javaScriptEnabled: false,
    locale: 'en-GB',
  })

  try {
    const page = await context.newPage()

    // setExtraHTTPHeaders is asynchronous: await it so a failure surfaces here
    // (not as an unhandled rejection) and navigation starts only afterwards.
    await page.setExtraHTTPHeaders({
      accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
      'accept-language': 'en-US,en;q=0.9',
      'cache-control': 'no-cache',
      pragma: 'no-cache',
      'sec-ch-ua':
        '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
      'sec-ch-ua-mobile': '?0',
      'sec-ch-ua-platform': '"macOS"',
      'sec-fetch-dest': 'document',
      'sec-fetch-mode': 'navigate',
      'sec-fetch-site': 'none',
      'sec-fetch-user': '?1',
      'upgrade-insecure-requests': '1',
      'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    })

    await page.goto(url)

    // `return await` keeps the context open until the HTML has been read.
    return await page.content()
  } finally {
    // Best-effort cleanup: a close failure must not mask the scrape outcome.
    try {
      await context.close()
    } catch (e) {
      alerter.error('Failed to close browser context', e)
    }
  }
}
| 59 | + |
/**
 * Fetches the HTML of `url` with a Chromium instance that lives only for the
 * duration of this call.
 *
 * The browser is closed whether scraping succeeds or fails. An error raised
 * while closing is reported through `alerter` and otherwise ignored, so it
 * never replaces the scrape result or the scrape error.
 *
 * @param {string} url - Address to scrape.
 * @returns {Promise<string>} The HTML produced by `scrap`.
 */
async function getHtml(url) {
  const chromium = await launchChromium()

  // Best-effort shutdown: report a close failure instead of throwing it.
  const shutDown = async () => {
    try {
      await chromium.close()
    } catch (closeError) {
      alerter.error('Failed to close browser', closeError)
    }
  }

  try {
    return await scrap(chromium, url)
  } finally {
    await shutDown()
  }
}
| 75 | + |
// Public API of this module: only the high-level HTML fetcher is exported.
module.exports = { getHtml: getHtml }
0 commit comments