Skip to content

Commit 82153ca

Browse files
committed
update scrape config param types and add examples
1 parent dd709bb commit 82153ca

File tree

4 files changed

+60
-8
lines changed

4 files changed

+60
-8
lines changed

__tests__/scrapeconfig.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,13 @@ describe('url param generation', () => {
198198
const config = new ScrapeConfig({
199199
url: 'http://httpbin.dev/get',
200200
screenshots: { everything: 'fullpage' },
201-
screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format",
201+
screenshot_flags: [
202+
"load_images",
203+
"dark_mode",
204+
"block_banners",
205+
"high_quality",
206+
"print_media_format"
207+
],
202208
render_js: true,
203209
});
204210
expect(config.toApiParams({ key: '1234' })).toEqual({

examples/scrape-as-markdown.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
2+
This example shows how to scrape page content as markdown format in scrapfly
3+
*/
4+
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
5+
6+
const key = 'YOUR SCRAPFLY KEY';
7+
const client = new ScrapflyClient({ key });
8+
const result = await client.scrape(
9+
new ScrapeConfig({
10+
url: 'https://web-scraping.dev/products/',
11+
// scrape the page data as markdown format supported by LLMs.
12+
// None=raw(unchanged), other supported formats are: json, text, clean_html
13+
format: "markdown"
14+
}),
15+
);
16+
console.log(result.result.content);
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
This example shows how to capture page screenshots with images and additional configuration in scrapfly
3+
*/
4+
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
5+
6+
const key = 'YOUR SCRAPFLY KEY';
7+
const client = new ScrapflyClient({ key });
8+
const result = await client.scrape(
9+
new ScrapeConfig({
10+
url: 'https://web-scraping.dev/products/',
11+
// enable headless browsers for screenshots
12+
render_js: true,
13+
// optional: you can wait for page to load before capturing
14+
screenshots: {
15+
everything: 'fullpage',
16+
reviews: '#reviews',
17+
},
18+
screenshot_flags: [
19+
"load_images", // Enable image rendering with the request, adds extra usage for the bandwidth consumed
20+
"dark_mode", // Enable dark mode display
21+
"block_banners", // Block cookies banners and overlay that cover the screen
22+
"high_quality", // No compression on the output image
23+
"print_media_format" // Render the page in the print mode
24+
]
25+
}),
26+
);
27+
console.log(result.result.screenshots);

src/scrapeconfig.ts

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ import { log } from './logger.js';
33
import { Rec, HttpMethod } from './types.js';
44
import { ScrapeConfigError } from './errors.js';
55

6+
type ScreenshotFlags = "load_images" | "dark_mode" | "block_banners" | "high_quality" | "print_media_format";
7+
type Format = "raw" | "json" | "text" | "markdown" | "clean_html";
8+
69
export class ScrapeConfig {
710
static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool';
811
static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool';
9-
12+
1013
url: string;
1114
retry = true;
1215
method: HttpMethod = 'GET';
@@ -24,7 +27,7 @@ export class ScrapeConfig {
2427
proxy_pool?: string = null;
2528
session?: string = null;
2629
tags: Set<string> = new Set<string>();
27-
format?: string = null; // raw(unchanged)
30+
format?: Format = null; // raw(unchanged)
2831
correlation_id?: string = null;
2932
cookies?: Rec<string> = null;
3033
body?: string = null;
@@ -35,7 +38,7 @@ export class ScrapeConfig {
3538
wait_for_selector?: string = null;
3639
session_sticky_proxy = false;
3740
screenshots?: Rec<any> = null;
38-
screenshot_flags?: string = null;
41+
screenshot_flags?: ScreenshotFlags[] = null;
3942
webhook?: string = null;
4043
timeout?: number = null; // in milliseconds
4144
js_scenario?: Rec<any> = null;
@@ -62,7 +65,7 @@ export class ScrapeConfig {
6265
proxy_pool?: string;
6366
session?: string;
6467
tags?: Array<string>;
65-
format?: string;
68+
format?: Format;
6669
correlation_id?: string;
6770
cookies?: Rec<string>;
6871
body?: string;
@@ -72,7 +75,7 @@ export class ScrapeConfig {
7275
rendering_wait?: number;
7376
wait_for_selector?: string;
7477
screenshots?: Rec<any>;
75-
screenshot_flags?: string;
78+
screenshot_flags?: ScreenshotFlags[];
7679
session_sticky_proxy?: boolean;
7780
webhook?: string;
7881
timeout?: number; // in milliseconds
@@ -201,7 +204,7 @@ export class ScrapeConfig {
201204
params[`screenshots[${key}]`] = this.screenshots[key];
202205
});
203206
if (this.screenshot_flags) {
204-
params.screenshot_flags = this.screenshot_flags;
207+
params.screenshot_flags = this.screenshot_flags.join(',');
205208
}
206209
} else {
207210
if (this.screenshot_flags) {
@@ -261,7 +264,7 @@ export class ScrapeConfig {
261264
params.tags = Array.from(this.tags).join(',');
262265
}
263266
if (this.format) {
264-
params.format = this.format;
267+
params.format = this.format.valueOf();
265268
}
266269
if (this.correlation_id) {
267270
params.correlation_id = this.correlation_id;

0 commit comments

Comments
 (0)