Skip to content

Commit b737bf9

Browse files
committed
add cheerio support
1 parent 690ec37 commit b737bf9

15 files changed

+489
-92
lines changed

.eslintrc.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"es6": true,
55
"node": true
66
},
7-
"ignorePatterns": ["**/*.d.ts"],
7+
"ignorePatterns": ["**/*.d.ts", "examples/**/*"],
88
"parser": "@typescript-eslint/parser",
99
"parserOptions": {
1010
"project": "tsconfig.json",

README.md

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,18 @@
33
`npm install scrapfly-sdk`
44

55
TypeScript/Node.js SDK for the [Scrapfly.io](https://scrapfly.io/) web scraping API, which allows you to:
6-
- Scrape the web without being blocked.
7-
- Use headless browsers to access Javascript-powered page data.
8-
- Scale up web scraping.
9-
- ... and [much more](https://scrapfly.io/docs/scrape-api/getting-started)!
6+
7+
- Scrape the web without being blocked.
8+
- Use headless browsers to access Javascript-powered page data.
9+
- Scale up web scraping.
10+
- ... and [much more](https://scrapfly.io/docs/scrape-api/getting-started)!
1011

1112
For web scraping guides, see [our blog](https://scrapfly.io/blog/) and the [#scrapeguide](https://scrapfly.io/blog/tag/scrapeguide/) tag for how to scrape specific targets.
1213

1314
## Quick Intro
1415

1516
1. Register a [Scrapfly account for free](https://scrapfly.io/register)
16-
2. Get your API Key on [scrapfly.io/dashboard](https://scrapfly.io/dashboard)
17+
2. Get your API Key on [scrapfly.io/dashboard](https://scrapfly.io/dashboard)
1718
3. Start scraping: 🚀
1819

1920
```javascript
@@ -43,6 +44,24 @@ For more see [/examples](/examples/) directory.
4344
For more on the Scrapfly API, see our [getting started documentation](https://scrapfly.io/docs/scrape-api/getting-started)
4445
For Python, see the [Scrapfly Python SDK](https://github.com/scrapfly/python-scrapfly)
4546

47+
## Debugging
48+
49+
To enable debug logs, set Scrapfly's log level to `"DEBUG"`:
50+
51+
```javascript
52+
log.setLevel('DEBUG');
53+
```
54+
55+
Additionally, set `debug: true` in `ScrapeConfig` to access debug information in the [Scrapfly web dashboard](https://scrapfly.io/dashboard):
56+
57+
```typescript
58+
new ScrapeConfig({
59+
url: 'https://web-scraping.dev/product/1',
60+
debug: true,
61+
// ^ enable debug information
62+
});
63+
```
64+
4665
## Development
4766

4867
Install and setup environment:

__tests__/client.test.ts

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import axios from 'axios';
2+
import { AxiosRequestConfig } from 'axios';
23
import { ScrapflyClient } from '../src/client.js';
34
import * as errors from '../src/errors.js';
45
import { ScrapeConfig } from '../src/scrapeconfig.js';
6+
import { describe, it, expect, jest, beforeEach } from '@jest/globals';
57

68
jest.mock('axios');
79

@@ -37,7 +39,7 @@ describe('concurrent scrape', () => {
3739
// mock axios to return /account data and 2 types of results:
3840
// - success for /success endpoints
3941
// - ASP failure for /failure endpoints
40-
mockedAxios.request.mockImplementation(async (config) => {
42+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
4143
if (config.url.includes('/account')) {
4244
return {
4345
status: 200,
@@ -123,7 +125,7 @@ describe('scrape', () => {
123125

124126
it('GET success', async () => {
125127
const url = 'https://httpbin.dev/json';
126-
mockedAxios.request.mockImplementation(async (config) => {
128+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
127129
// Ensure the URL matches the pattern
128130
expect(config.url).toMatch(client.HOST + '/scrape');
129131
expect(config.method).toEqual('GET');
@@ -147,7 +149,7 @@ describe('scrape', () => {
147149

148150
it('GET complex urls', async () => {
149151
const url = 'https://httpbin.dev/anything/?website=https://httpbin.dev/anything';
150-
mockedAxios.request.mockImplementation(async (config) => {
152+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
151153
// Ensure the URL matches the pattern
152154
expect(config.url).toMatch(client.HOST + '/scrape');
153155
expect(config.method).toEqual('GET');
@@ -171,7 +173,7 @@ describe('scrape', () => {
171173

172174
it('POST success', async () => {
173175
const url = 'https://httpbin.dev/json';
174-
mockedAxios.request.mockImplementation(async (config) => {
176+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
175177
// Ensure the URL matches the pattern
176178
expect(config.url).toMatch(client.HOST + '/scrape');
177179
expect(config.method).toEqual('POST');
@@ -233,7 +235,7 @@ describe('client errors', () => {
233235

234236
it('raises ApiHttpServerError on 500 and success', async () => {
235237
const url = 'https://httpbin.dev/json';
236-
mockedAxios.request.mockImplementation(async (config) => {
238+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
237239
return resultFactory({
238240
url: config.url,
239241
status_code: 500,
@@ -247,7 +249,7 @@ describe('client errors', () => {
247249

248250
it('raises BadApiKeyError on 401', async () => {
249251
const url = 'https://httpbin.dev/json';
250-
mockedAxios.request.mockImplementation(async (config) => {
252+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
251253
return resultFactory({
252254
url: config.url,
253255
status_code: 401,
@@ -260,7 +262,7 @@ describe('client errors', () => {
260262
});
261263
it('raises TooManyRequests on 429 and success', async () => {
262264
const url = 'https://httpbin.dev/json';
263-
mockedAxios.request.mockImplementation(async (config) => {
265+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
264266
return resultFactory({
265267
url: config.url,
266268
status_code: 429,
@@ -271,7 +273,7 @@ describe('client errors', () => {
271273
await expect(client.scrape(new ScrapeConfig({ url }))).rejects.toThrow(errors.TooManyRequests);
272274
});
273275
it('raises ScrapflyScrapeError on ::SCRAPE:: resource and success', async () => {
274-
mockedAxios.request.mockImplementation(async (config) => {
276+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
275277
return resultFactory({
276278
url: config.url,
277279
status: 'ERR::SCRAPE::BAD_PROTOCOL',
@@ -284,7 +286,7 @@ describe('client errors', () => {
284286
});
285287

286288
it('raises ScrapflyWebhookError on ::WEBHOOK:: resource and success', async () => {
287-
mockedAxios.request.mockImplementation(async (config) => {
289+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
288290
return resultFactory({
289291
url: config.url,
290292
status: 'ERR::WEBHOOK::DISABLED ',
@@ -296,7 +298,7 @@ describe('client errors', () => {
296298
);
297299
});
298300
it('raises ScrapflyProxyError on ERR::PROXY::POOL_NOT_FOUND resource and success', async () => {
299-
mockedAxios.request.mockImplementation(async (config) => {
301+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
300302
return resultFactory({
301303
url: config.url,
302304
status: 'ERR::PROXY::POOL_NOT_FOUND ',
@@ -309,7 +311,7 @@ describe('client errors', () => {
309311
});
310312

311313
it('raises ScrapflyScheduleError on ERR::SCHEDULE::DISABLED resource and success', async () => {
312-
mockedAxios.request.mockImplementation(async (config) => {
314+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
313315
return resultFactory({
314316
url: config.url,
315317
status: 'ERR::SCHEDULE::DISABLED',
@@ -322,7 +324,7 @@ describe('client errors', () => {
322324
});
323325

324326
it('raises ScrapflyAspError on ERR::ASP::SHIELD_ERROR resource and success', async () => {
325-
mockedAxios.request.mockImplementation(async (config) => {
327+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
326328
return resultFactory({
327329
url: config.url,
328330
status: 'ERR::ASP::SHIELD_ERROR',
@@ -335,7 +337,7 @@ describe('client errors', () => {
335337
});
336338

337339
it('raises ScrapflySessionError on ERR::SESSION::CONCURRENT_ACCESS resource and success', async () => {
338-
mockedAxios.request.mockImplementation(async (config) => {
340+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
339341
return resultFactory({
340342
url: config.url,
341343
status: 'ERR::SESSION::CONCURRENT_ACCESS',
@@ -348,7 +350,7 @@ describe('client errors', () => {
348350
});
349351

350352
it('raises ApiHttpClientError on success and unknown status', async () => {
351-
mockedAxios.request.mockImplementation(async (config) => {
353+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
352354
return resultFactory({
353355
url: config.url,
354356
status: 'ERR::NEW',
@@ -360,7 +362,7 @@ describe('client errors', () => {
360362
);
361363
});
362364
it('raises UpstreamHttpServerError on failure, ERR::SCRAPE::BAD_UPSTREAM_RESPONSE and >=500', async () => {
363-
mockedAxios.request.mockImplementation(async (config) => {
365+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
364366
return resultFactory({
365367
url: config.url,
366368
success: false,
@@ -373,7 +375,7 @@ describe('client errors', () => {
373375
);
374376
});
375377
it('raises UpstreamHttpClientError on failure, ERR::SCRAPE::BAD_UPSTREAM_RESPONSE and 4xx status', async () => {
376-
mockedAxios.request.mockImplementation(async (config) => {
378+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
377379
return resultFactory({
378380
url: config.url,
379381
success: false,
@@ -396,7 +398,7 @@ describe('client errors', () => {
396398
SESSION: errors.ScrapflySessionError,
397399
};
398400
for (const [resource, err] of Object.entries(resourceErrMap)) {
399-
mockedAxios.request.mockImplementation(async (config) => {
401+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
400402
return resultFactory({
401403
url: config.url,
402404
success: false,
@@ -408,7 +410,7 @@ describe('client errors', () => {
408410
});
409411

410412
it('raises ScrapflyError on unhandled failure', async () => {
411-
mockedAxios.request.mockImplementation(async (config) => {
413+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
412414
return resultFactory({
413415
url: config.url,
414416
success: false,
@@ -421,7 +423,7 @@ describe('client errors', () => {
421423
);
422424
});
423425
it('raises on unhandled failure', async () => {
424-
mockedAxios.request.mockImplementation(async (config) => {
426+
mockedAxios.request.mockImplementation(async (config: AxiosRequestConfig): Promise<any> => {
425427
return resultFactory({
426428
url: config.url,
427429
success: false,

__tests__/result.test.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import * as fs from 'fs';
2+
import { ScrapeResult } from '../src/result.js';
3+
import * as errors from '../src/errors.js';
4+
import { describe, it, expect } from '@jest/globals';
5+
6+
describe('cheerio selector', () => {
7+
it('lazy loads and caches itself', () => {
8+
const response = JSON.parse(fs.readFileSync('__tests__/data/response_html_success.json', 'utf8'));
9+
const result = new ScrapeResult(response);
10+
expect(result.selector('h1').text()).toEqual('Herman Melville - Moby-Dick');
11+
// make sure a second call returns the same result (the selector is cached after the first access)
12+
expect(result.selector('h1').text()).toEqual('Herman Melville - Moby-Dick');
13+
});
14+
it('throws ContentTypeError when accessing .selector on JSON data', () => {
15+
const response = JSON.parse(fs.readFileSync('__tests__/data/response_json_success.json', 'utf8'));
16+
const result = new ScrapeResult(response);
17+
expect(() => {
18+
result.selector('h1').text();
19+
}).toThrow(errors.ContentTypeError);
20+
});
21+
});

__tests__/scrapeconfig.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { ScrapeConfig } from '../src/scrapeconfig.js';
22
import { HttpMethod } from '../src/types.js';
33
import { ScrapeConfigError } from '../src/errors.js';
4+
import { describe, it, expect } from '@jest/globals';
45

56
describe('scrapeconfig', () => {
67
it('loads', () => {

__tests__/utils.test.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { urlsafe_b64encode } from '../src/utils.js';
2+
import { describe, it, expect } from '@jest/globals';
23

34
describe('urlsafe_b64encode', () => {
45
it('should encode a string to base64', () => {

0 commit comments

Comments
 (0)