diff --git a/.env.example b/.env.example index bed3c58..d3c451b 100644 --- a/.env.example +++ b/.env.example @@ -5,4 +5,5 @@ FIRECRAWL_API_KEY= SCRAPINGBEE_API_KEY= SCRAPERAPI_API_KEY= TAVILY_API_KEY= -ZYTE_API_KEY= \ No newline at end of file +ZYTE_API_KEY= +OXYLABS_AI_STUDIO_API_KEY= \ No newline at end of file diff --git a/README.md b/README.md index d971b91..2bd7c20 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,22 @@ This framework supports APIs for Firecrawl, Apify, ScraperAPI, ScrapingBee, Zyte Below are evaluation results across different engines. -| Engine | Coverage (Success Rate) (%) | Quality (F1) | -|-----------------|-----------------------------|--------------| -| Firecrawl | 80.9 | 0.68 | -| Exa | 76.3 | 0.53 | -| Tavily | 67.6 | 0.50 | -| ScraperAPI | 63.5 | 0.45 | -| Zyte | 62.9 | 0.47 | -| ScrapingBee | 60.6 | 0.45 | -| Apify | 60.2 | 0.42 | -| Crawl4ai | 58.0 | 0.45 | -| Selenium | 55.0 | 0.40 | -| Scrapy | 54.0 | 0.43 | -| Puppeteer | 53.7 | 0.41 | -| Rest (requests) | 50.6 | 0.36 | -| Playwright | 39.5 | 0.34 | +| Engine | Coverage (Success Rate) (%) | Quality (F1) | +|-------------------|-----------------------------|--------------| +| Firecrawl | 80.9 | 0.68 | +| Oxylabs AI Studio | 80.9 | 0.61 | +| Exa | 76.3 | 0.53 | +| Tavily | 67.6 | 0.50 | +| ScraperAPI | 63.5 | 0.45 | +| Zyte | 62.9 | 0.47 | +| ScrapingBee | 60.6 | 0.45 | +| Apify | 60.2 | 0.42 | +| Crawl4ai | 58.0 | 0.45 | +| Selenium | 55.0 | 0.40 | +| Scrapy | 54.0 | 0.43 | +| Puppeteer | 53.7 | 0.41 | +| Rest (requests) | 50.6 | 0.36 | +| Playwright | 39.5 | 0.34 | ## Install diff --git a/analysis.ipynb b/analysis.ipynb index ec0dbcd..538fabd 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -36,7 +36,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "found: 13 files\n" + "found: 14 files\n" ] } ], @@ -61,7 +61,7 @@ "outputs": [], "source": [ "df['success_perc'] = df['success_rate'] * 100\n", - "df['engine_name'] = df['engine'].str.replace('_api', 
'').str.replace('_scraper', '').str.capitalize().str.replace('Scrapingbee','ScrapingBee')\n" + "df['engine_name'] = df['engine'].str.replace('_api', '').str.replace('_scraper', '').str.capitalize().str.replace('Scrapingbee','ScrapingBee').str.replace('Oxylabs_ai', 'Oxylabs AI Studio')\n" ] }, { @@ -230,6 +230,16 @@ " 60.640732\n", " ScrapingBee\n", " \n", + " \n", + " 13\n", + " 0.608424\n", + " 0.616571\n", + " 0.607602\n", + " 0.809000\n", + " oxylabs_ai_scraper\n", + " 80.900000\n", + " Oxylabs AI Studio\n", + " \n", " \n", "\n", "" @@ -249,24 +259,26 @@ "10 0.442768 0.472855 0.453349 0.580000 crawl4ai_scraper \n", "11 0.396912 0.419732 0.404574 0.550403 selenium_scraper \n", "12 0.443617 0.466697 0.450543 0.606407 scrapingbee_api \n", + "13 0.608424 0.616571 0.607602 0.809000 oxylabs_ai_scraper \n", "\n", - " success_perc engine_name \n", - "0 54.000000 Scrapy \n", - "1 53.665284 Puppeteer \n", - "2 63.513514 Scraperapi \n", - "3 80.900000 Firecrawl \n", - "4 62.878788 Zyte \n", - "5 76.300000 Exa \n", - "6 39.500000 Playwright \n", - "7 50.600000 Rest \n", - "8 67.600000 Tavily \n", - "9 60.215054 Apify \n", - "10 58.000000 Crawl4ai \n", - "11 55.040323 Selenium \n", - "12 60.640732 ScrapingBee " + " success_perc engine_name \n", + "0 54.000000 Scrapy \n", + "1 53.665284 Puppeteer \n", + "2 63.513514 Scraperapi \n", + "3 80.900000 Firecrawl \n", + "4 62.878788 Zyte \n", + "5 76.300000 Exa \n", + "6 39.500000 Playwright \n", + "7 50.600000 Rest \n", + "8 67.600000 Tavily \n", + "9 60.215054 Apify \n", + "10 58.000000 Crawl4ai \n", + "11 55.040323 Selenium \n", + "12 60.640732 ScrapingBee \n", + "13 80.900000 Oxylabs AI Studio " ] }, - "execution_count": 41, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -318,6 +330,30 @@ }, "yaxis": "y" }, + { + "hovertemplate": "engine_name=%{x}
success_perc=%{y}", + "legendgroup": "Oxylabs AI Studio", + "marker": { + "color": "#EDEDED", + "pattern": { + "shape": "" + } + }, + "name": "Oxylabs AI Studio", + "orientation": "v", + "showlegend": true, + "textposition": "auto", + "type": "bar", + "x": [ + "Oxylabs AI Studio" + ], + "xaxis": "x", + "y": { + "bdata": "mpmZmZk5VEA=", + "dtype": "f8" + }, + "yaxis": "y" + }, { "hovertemplate": "engine_name=%{x}
success_perc=%{y}", "legendgroup": "Exa", @@ -1407,6 +1443,7 @@ "anchor": "y", "categoryarray": [ "Firecrawl", + "Oxylabs AI Studio", "Exa", "Tavily", "Scraperapi", @@ -1531,6 +1568,30 @@ }, "yaxis": "y" }, + { + "hovertemplate": "engine_name=%{x}
avg_f1=%{y}", + "legendgroup": "Oxylabs AI Studio", + "marker": { + "color": "#EDEDED", + "pattern": { + "shape": "" + } + }, + "name": "Oxylabs AI Studio", + "orientation": "v", + "showlegend": true, + "textposition": "auto", + "type": "bar", + "x": [ + "Oxylabs AI Studio" + ], + "xaxis": "x", + "y": { + "bdata": "tgn/6Hhx4z8=", + "dtype": "f8" + }, + "yaxis": "y" + }, { "hovertemplate": "engine_name=%{x}
avg_f1=%{y}", "legendgroup": "Exa", @@ -2620,6 +2681,7 @@ "anchor": "y", "categoryarray": [ "Firecrawl", + "Oxylabs AI Studio", "Exa", "Tavily", "Zyte", @@ -2757,6 +2819,14 @@ " 0.675905\n", " \n", " \n", + " 13\n", + " Oxylabs AI Studio\n", + " 80.900000\n", + " 0.607602\n", + " 0.608424\n", + " 0.616571\n", + " \n", + " \n", " 5\n", " Exa\n", " 76.300000\n", @@ -2857,23 +2927,40 @@ "" ], "text/plain": [ - " Engine Coverage (Success Rate %) Quality (F1) Recall Precision\n", - "3 Firecrawl 80.900000 0.675777 0.678596 0.675905\n", - "5 Exa 76.300000 0.526801 0.506211 0.561197\n", - "8 Tavily 67.600000 0.501143 0.498865 0.508261\n", - "2 Scraperapi 63.513514 0.449829 0.442166 0.466340\n", - "4 Zyte 62.878788 0.468174 0.457769 0.487554\n", - "12 ScrapingBee 60.640732 0.450543 0.443617 0.466697\n", - "9 Apify 60.215054 0.416620 0.408899 0.430915\n", - "10 Crawl4ai 58.000000 0.453349 0.442768 0.472855\n", - "11 Selenium 55.040323 0.404574 0.396912 0.419732\n", - "0 Scrapy 54.000000 0.428988 0.418981 0.448316\n", - "1 Puppeteer 53.665284 0.408257 0.400046 0.423888\n", - "7 Rest 50.600000 0.354953 0.346279 0.377394\n", - "6 Playwright 39.500000 0.338666 0.331453 0.352773" + " Engine Coverage (Success Rate %) Quality (F1) Recall \\\n", + "3 Firecrawl 80.900000 0.675777 0.678596 \n", + "13 Oxylabs AI Studio 80.900000 0.607602 0.608424 \n", + "5 Exa 76.300000 0.526801 0.506211 \n", + "8 Tavily 67.600000 0.501143 0.498865 \n", + "2 Scraperapi 63.513514 0.449829 0.442166 \n", + "4 Zyte 62.878788 0.468174 0.457769 \n", + "12 ScrapingBee 60.640732 0.450543 0.443617 \n", + "9 Apify 60.215054 0.416620 0.408899 \n", + "10 Crawl4ai 58.000000 0.453349 0.442768 \n", + "11 Selenium 55.040323 0.404574 0.396912 \n", + "0 Scrapy 54.000000 0.428988 0.418981 \n", + "1 Puppeteer 53.665284 0.408257 0.400046 \n", + "7 Rest 50.600000 0.354953 0.346279 \n", + "6 Playwright 39.500000 0.338666 0.331453 \n", + "\n", + " Precision \n", + "3 0.675905 \n", + "13 0.616571 \n", + "5 0.561197 \n", + "8 
class OxylabsAiScraper(Scraper):
    """Scrape web pages as markdown through the Oxylabs AI Studio scraper.

    The API key is read from the ``OXYLABS_AI_STUDIO_API_KEY`` environment
    variable when the scraper is constructed.
    """

    # Identifier written to the ``scraper`` field of every result.
    _ENGINE_NAME = "oxylabs_ai_scraper"
    # Error reported when the API answered without any usable payload and
    # without a message of its own.
    _NO_CONTENT_ERROR = "Oxylabs AI Studio returned no content."

    def __init__(self):
        """Create the underlying ``AiScraper`` client.

        Raises:
            ValueError: if ``OXYLABS_AI_STUDIO_API_KEY`` is unset or empty.
        """
        self.api_key = os.getenv("OXYLABS_AI_STUDIO_API_KEY")
        if not self.api_key:
            raise ValueError("OXYLABS_AI_STUDIO_API_KEY not set in environment.")
        self.scraper = AiScraper(api_key=self.api_key)

    @staticmethod
    def _extract_content(data) -> Optional[str]:
        """Return the scraped text carried by ``data``, or ``None`` if absent.

        A string payload is returned as is. For a dict payload the
        ``"content"`` key is preferred, then ``"markdown"``, and as a last
        resort the string form of a non-empty dict. Any other payload type,
        and an empty dict, yields ``None``.
        """
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            # An empty dict carries nothing: do not turn it into the literal
            # text "{}" and report that as scraped content.
            return (
                data.get("content")
                or data.get("markdown")
                or (str(data) if data else None)
            )
        return None

    def _build_result(
        self,
        url: str,
        run_id: str,
        status_code: int,
        error: Optional[str],
        content: Optional[str],
    ) -> ScrapeResult:
        """Assemble a ``ScrapeResult`` with the fields shared by all outcomes."""
        content_size = len(content.encode("utf-8")) if content else 0
        return ScrapeResult(
            run_id=run_id,
            scraper=self._ENGINE_NAME,
            url=url,
            status_code=status_code,
            error=error,
            content_size=content_size,
            format="markdown",
            created_at=datetime.now().isoformat(),
            content=content,
        )

    async def scrape(self, url: str, run_id: str) -> ScrapeResult:
        """Scrape ``url`` and return the outcome as a ``ScrapeResult``.

        Args:
            url: Address of the page to scrape.
            run_id: Identifier of the benchmark run, copied into the result.

        Returns:
            A result with ``status_code`` 200 and the extracted markdown when
            content was obtained. When the call raises, or completes without
            any usable content, ``status_code`` is 500 and ``error`` explains
            why. This method does not raise for scraping failures.
        """
        try:
            result = await self.scraper.scrape_async(
                url=url,
                output_format="markdown",
                render_javascript=False,
            )
            content = self._extract_content(result.data)

            if content:
                # The SDK exposes no HTTP status here; a completed call that
                # produced content is treated as a 200.
                return self._build_result(url, run_id, 200, None, content)

            # No exception, but nothing usable either: surface it as a
            # failure instead of a silent "200 with error=None".
            error = result.message or self._NO_CONTENT_ERROR
            return self._build_result(url, run_id, 500, error, content)

        except Exception as e:
            # Boundary handler: any failure becomes an error result so one
            # bad URL is recorded rather than raised to the caller.
            return self._build_result(url, run_id, 500, str(e), None)