Skip to content

Commit cb2ab48

Browse files
Merge pull request #1 from mazen-r/main
Add screenshot_flags and format api params support
2 parents d39eae8 + 82153ca commit cb2ab48

File tree

4 files changed

+95
-1
lines changed

4 files changed

+95
-1
lines changed

__tests__/scrapeconfig.test.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,27 @@ describe('url param generation', () => {
194194
'screenshots[everything]': 'fullpage',
195195
});
196196
});
197+
it('screenshot flags converted to params', () => {
198+
const config = new ScrapeConfig({
199+
url: 'http://httpbin.dev/get',
200+
screenshots: { everything: 'fullpage' },
201+
screenshot_flags: [
202+
"load_images",
203+
"dark_mode",
204+
"block_banners",
205+
"high_quality",
206+
"print_media_format"
207+
],
208+
render_js: true,
209+
});
210+
expect(config.toApiParams({ key: '1234' })).toEqual({
211+
key: '1234',
212+
url: 'http://httpbin.dev/get',
213+
'screenshots[everything]': 'fullpage',
214+
screenshot_flags: "load_images,dark_mode,block_banners,high_quality,print_media_format",
215+
render_js: true,
216+
});
217+
});
197218
it('asp enables', () => {
198219
const config = new ScrapeConfig({
199220
url: 'http://httpbin.dev/get',
@@ -238,6 +259,17 @@ describe('url param generation', () => {
238259
tags: 'foo,bar,gaz',
239260
});
240261
});
262+
it('format set', () => {
263+
const config = new ScrapeConfig({
264+
url: 'http://httpbin.dev/get',
265+
format: "markdown",
266+
});
267+
expect(config.toApiParams({ key: '1234' })).toEqual({
268+
key: '1234',
269+
url: 'http://httpbin.dev/get',
270+
format: "markdown",
271+
});
272+
});
241273
it('debug sets', () => {
242274
const config = new ScrapeConfig({
243275
url: 'http://httpbin.dev/get',

examples/scrape-as-markdown.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
/*
2+
This example shows how to scrape a page and receive its content converted to markdown format in scrapfly
3+
*/
4+
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
5+
6+
const key = 'YOUR SCRAPFLY KEY';
7+
const client = new ScrapflyClient({ key });
8+
const result = await client.scrape(
9+
new ScrapeConfig({
10+
url: 'https://web-scraping.dev/products/',
11+
// scrape the page data in markdown format, which is well supported by LLMs.
12+
// when format is not set the content is returned raw (unchanged); other supported formats are: json, text, clean_html
13+
format: "markdown"
14+
}),
15+
);
16+
console.log(result.result.content);
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
This example shows how to capture page screenshots with images and additional configuration in scrapfly
3+
*/
4+
import { ScrapflyClient, ScrapeConfig } from 'scrapfly-sdk';
5+
6+
const key = 'YOUR SCRAPFLY KEY';
7+
const client = new ScrapflyClient({ key });
8+
const result = await client.scrape(
9+
new ScrapeConfig({
10+
url: 'https://web-scraping.dev/products/',
11+
// enable headless browsers for screenshots
12+
render_js: true,
13+
// optional: you can wait for page to load before capturing
14+
screenshots: {
15+
everything: 'fullpage',
16+
reviews: '#reviews',
17+
},
18+
screenshot_flags: [
19+
"load_images", // Enable image rendering with the request, adds extra usage for the bandwidth consumed
20+
"dark_mode", // Enable dark mode display
21+
"block_banners", // Block cookies banners and overlay that cover the screen
22+
"high_quality", // No compression on the output image
23+
"print_media_format" // Render the page in the print mode
24+
]
25+
}),
26+
);
27+
console.log(result.result.screenshots);

src/scrapeconfig.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,13 @@ import { log } from './logger.js';
33
import { Rec, HttpMethod } from './types.js';
44
import { ScrapeConfigError } from './errors.js';
55

6+
/**
 * Options that adjust how screenshots are captured.
 *
 * - `load_images`: enable image rendering with the request (adds extra usage
 *   for the bandwidth consumed)
 * - `dark_mode`: enable dark mode display
 * - `block_banners`: block cookie banners and overlays that cover the screen
 * - `high_quality`: no compression on the output image
 * - `print_media_format`: render the page in print mode
 */
type ScreenshotFlags =
  | 'load_images'
  | 'dark_mode'
  | 'block_banners'
  | 'high_quality'
  | 'print_media_format';
7+
/**
 * Content formats accepted by the `format` scrape option.
 *
 * Leaving `format` unset keeps the content raw (unchanged).
 */
type Format =
  | 'raw'
  | 'json'
  | 'text'
  | 'markdown'
  | 'clean_html';
8+
69
export class ScrapeConfig {
710
static PUBLIC_DATACENTER_POOL = 'public_datacenter_pool';
811
static PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool';
9-
12+
1013
url: string;
1114
retry = true;
1215
method: HttpMethod = 'GET';
@@ -24,6 +27,7 @@ export class ScrapeConfig {
2427
proxy_pool?: string = null;
2528
session?: string = null;
2629
tags: Set<string> = new Set<string>();
30+
format?: Format = null; // raw(unchanged)
2731
correlation_id?: string = null;
2832
cookies?: Rec<string> = null;
2933
body?: string = null;
@@ -34,6 +38,7 @@ export class ScrapeConfig {
3438
wait_for_selector?: string = null;
3539
session_sticky_proxy = false;
3640
screenshots?: Rec<any> = null;
41+
screenshot_flags?: ScreenshotFlags[] = null;
3742
webhook?: string = null;
3843
timeout?: number = null; // in milliseconds
3944
js_scenario?: Rec<any> = null;
@@ -60,6 +65,7 @@ export class ScrapeConfig {
6065
proxy_pool?: string;
6166
session?: string;
6267
tags?: Array<string>;
68+
format?: Format;
6369
correlation_id?: string;
6470
cookies?: Rec<string>;
6571
body?: string;
@@ -69,6 +75,7 @@ export class ScrapeConfig {
6975
rendering_wait?: number;
7076
wait_for_selector?: string;
7177
screenshots?: Rec<any>;
78+
screenshot_flags?: ScreenshotFlags[];
7279
session_sticky_proxy?: boolean;
7380
webhook?: string;
7481
timeout?: number; // in milliseconds
@@ -96,6 +103,7 @@ export class ScrapeConfig {
96103
this.proxy_pool = options.proxy_pool ?? this.proxy_pool;
97104
this.session = options.session ?? this.session;
98105
this.tags = new Set(options.tags) ?? this.tags;
106+
this.format = options.format ?? this.format;
99107
this.correlation_id = options.correlation_id ?? this.correlation_id;
100108
this.cookies = options.cookies
101109
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
@@ -106,6 +114,7 @@ export class ScrapeConfig {
106114
this.rendering_wait = options.rendering_wait ?? this.rendering_wait;
107115
this.wait_for_selector = options.wait_for_selector ?? this.wait_for_selector;
108116
this.screenshots = options.screenshots ?? this.screenshots;
117+
this.screenshot_flags = options.screenshot_flags ?? this.screenshot_flags;
109118
this.webhook = options.webhook ?? this.webhook;
110119
this.timeout = options.timeout ?? this.timeout;
111120
this.js_scenario = options.js_scenario ?? this.js_scenario;
@@ -194,6 +203,13 @@ export class ScrapeConfig {
194203
Object.keys(this.screenshots).forEach((key) => {
195204
params[`screenshots[${key}]`] = this.screenshots[key];
196205
});
206+
if (this.screenshot_flags) {
207+
params.screenshot_flags = this.screenshot_flags.join(',');
208+
}
209+
} else {
210+
if (this.screenshot_flags) {
211+
log.warn('Params "screenshot_flags" is ignored. Works only if screenshots is enabled');
212+
}
197213
}
198214
if (this.auto_scroll !== null) {
199215
params.auto_scroll = this.auto_scroll;
@@ -247,6 +263,9 @@ export class ScrapeConfig {
247263
if (this.tags.size > 0) {
248264
params.tags = Array.from(this.tags).join(',');
249265
}
266+
if (this.format) {
267+
params.format = this.format.valueOf();
268+
}
250269
if (this.correlation_id) {
251270
params.correlation_id = this.correlation_id;
252271
}

0 commit comments

Comments
 (0)