Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ FIRECRAWL_API_KEY=<your Firecrawl api key here>
SCRAPINGBEE_API_KEY=<your ScrapingBee api key here>
SCRAPERAPI_API_KEY=<your ScraperAPI api key here>
TAVILY_API_KEY=<your Tavily api key here>
ZYTE_API_KEY=<your ZYTE api key>
ZYTE_API_KEY=<your ZYTE api key>
OXYLABS_AI_STUDIO_API_KEY=<your Oxylabs AI Studio api key here>
31 changes: 16 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,22 @@ This framework supports APIs for Firecrawl, Apify, ScraperAPI, ScrapingBee, Zyte

Below are evaluation results across different engines.

| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-----------------|-----------------------------|--------------|
| Firecrawl | 80.9 | 0.68 |
| Exa | 76.3 | 0.53 |
| Tavily | 67.6 | 0.50 |
| ScraperAPI | 63.5 | 0.45 |
| Zyte | 62.9 | 0.47 |
| ScrapingBee | 60.6 | 0.45 |
| Apify | 60.2 | 0.42 |
| Crawl4ai | 58.0 | 0.45 |
| Selenium | 55.0 | 0.40 |
| Scrapy | 54.0 | 0.43 |
| Puppeteer | 53.7 | 0.41 |
| Rest (requests) | 50.6 | 0.36 |
| Playwright | 39.5 | 0.34 |
| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-------------------|-----------------------------|--------------|
| Firecrawl | 80.9 | 0.68 |
| Oxylabs AI Studio | 80.9 | 0.61 |
| Exa | 76.3 | 0.53 |
| Tavily | 67.6 | 0.50 |
| ScraperAPI | 63.5 | 0.45 |
| Zyte | 62.9 | 0.47 |
| ScrapingBee | 60.6 | 0.45 |
| Apify | 60.2 | 0.42 |
| Crawl4ai | 58.0 | 0.45 |
| Selenium | 55.0 | 0.40 |
| Scrapy | 54.0 | 0.43 |
| Puppeteer | 53.7 | 0.41 |
| Rest (requests) | 50.6 | 0.36 |
| Playwright | 39.5 | 0.34 |

## Install

Expand Down
151 changes: 119 additions & 32 deletions analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"found: 13 files\n"
"found: 14 files\n"
]
}
],
Expand All @@ -61,7 +61,7 @@
"outputs": [],
"source": [
"df['success_perc'] = df['success_rate'] * 100\n",
"df['engine_name'] = df['engine'].str.replace('_api', '').str.replace('_scraper', '').str.capitalize().str.replace('Scrapingbee','ScrapingBee')\n"
"df['engine_name'] = df['engine'].str.replace('_api', '').str.replace('_scraper', '').str.capitalize().str.replace('Scrapingbee','ScrapingBee').str.replace('Oxylabs_ai', 'Oxylabs AI Studio')\n"
]
},
{
Expand Down Expand Up @@ -230,6 +230,16 @@
" <td>60.640732</td>\n",
" <td>ScrapingBee</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0.608424</td>\n",
" <td>0.616571</td>\n",
" <td>0.607602</td>\n",
" <td>0.809000</td>\n",
" <td>oxylabs_ai_scraper</td>\n",
" <td>80.900000</td>\n",
" <td>Oxylabs AI Studio</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
Expand All @@ -249,24 +259,26 @@
"10 0.442768 0.472855 0.453349 0.580000 crawl4ai_scraper \n",
"11 0.396912 0.419732 0.404574 0.550403 selenium_scraper \n",
"12 0.443617 0.466697 0.450543 0.606407 scrapingbee_api \n",
"13 0.608424 0.616571 0.607602 0.809000 oxylabs_ai_scraper \n",
"\n",
" success_perc engine_name \n",
"0 54.000000 Scrapy \n",
"1 53.665284 Puppeteer \n",
"2 63.513514 Scraperapi \n",
"3 80.900000 Firecrawl \n",
"4 62.878788 Zyte \n",
"5 76.300000 Exa \n",
"6 39.500000 Playwright \n",
"7 50.600000 Rest \n",
"8 67.600000 Tavily \n",
"9 60.215054 Apify \n",
"10 58.000000 Crawl4ai \n",
"11 55.040323 Selenium \n",
"12 60.640732 ScrapingBee "
" success_perc engine_name \n",
"0 54.000000 Scrapy \n",
"1 53.665284 Puppeteer \n",
"2 63.513514 Scraperapi \n",
"3 80.900000 Firecrawl \n",
"4 62.878788 Zyte \n",
"5 76.300000 Exa \n",
"6 39.500000 Playwright \n",
"7 50.600000 Rest \n",
"8 67.600000 Tavily \n",
"9 60.215054 Apify \n",
"10 58.000000 Crawl4ai \n",
"11 55.040323 Selenium \n",
"12 60.640732 ScrapingBee \n",
"13 80.900000 Oxylabs AI Studio "
]
},
"execution_count": 41,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -318,6 +330,30 @@
},
"yaxis": "y"
},
{
"hovertemplate": "engine_name=%{x}<br>success_perc=%{y}<extra></extra>",
"legendgroup": "Oxylabs AI Studio",
"marker": {
"color": "#EDEDED",
"pattern": {
"shape": ""
}
},
"name": "Oxylabs AI Studio",
"orientation": "v",
"showlegend": true,
"textposition": "auto",
"type": "bar",
"x": [
"Oxylabs AI Studio"
],
"xaxis": "x",
"y": {
"bdata": "mpmZmZk5VEA=",
"dtype": "f8"
},
"yaxis": "y"
},
{
"hovertemplate": "engine_name=%{x}<br>success_perc=%{y}<extra></extra>",
"legendgroup": "Exa",
Expand Down Expand Up @@ -1407,6 +1443,7 @@
"anchor": "y",
"categoryarray": [
"Firecrawl",
"Oxylabs AI Studio",
"Exa",
"Tavily",
"Scraperapi",
Expand Down Expand Up @@ -1531,6 +1568,30 @@
},
"yaxis": "y"
},
{
"hovertemplate": "engine_name=%{x}<br>avg_f1=%{y}<extra></extra>",
"legendgroup": "Oxylabs AI Studio",
"marker": {
"color": "#EDEDED",
"pattern": {
"shape": ""
}
},
"name": "Oxylabs AI Studio",
"orientation": "v",
"showlegend": true,
"textposition": "auto",
"type": "bar",
"x": [
"Oxylabs AI Studio"
],
"xaxis": "x",
"y": {
"bdata": "tgn/6Hhx4z8=",
"dtype": "f8"
},
"yaxis": "y"
},
{
"hovertemplate": "engine_name=%{x}<br>avg_f1=%{y}<extra></extra>",
"legendgroup": "Exa",
Expand Down Expand Up @@ -2620,6 +2681,7 @@
"anchor": "y",
"categoryarray": [
"Firecrawl",
"Oxylabs AI Studio",
"Exa",
"Tavily",
"Zyte",
Expand Down Expand Up @@ -2757,6 +2819,14 @@
" <td>0.675905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Oxylabs AI Studio</td>\n",
" <td>80.900000</td>\n",
" <td>0.607602</td>\n",
" <td>0.608424</td>\n",
" <td>0.616571</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Exa</td>\n",
" <td>76.300000</td>\n",
Expand Down Expand Up @@ -2857,23 +2927,40 @@
"</div>"
],
"text/plain": [
" Engine Coverage (Success Rate %) Quality (F1) Recall Precision\n",
"3 Firecrawl 80.900000 0.675777 0.678596 0.675905\n",
"5 Exa 76.300000 0.526801 0.506211 0.561197\n",
"8 Tavily 67.600000 0.501143 0.498865 0.508261\n",
"2 Scraperapi 63.513514 0.449829 0.442166 0.466340\n",
"4 Zyte 62.878788 0.468174 0.457769 0.487554\n",
"12 ScrapingBee 60.640732 0.450543 0.443617 0.466697\n",
"9 Apify 60.215054 0.416620 0.408899 0.430915\n",
"10 Crawl4ai 58.000000 0.453349 0.442768 0.472855\n",
"11 Selenium 55.040323 0.404574 0.396912 0.419732\n",
"0 Scrapy 54.000000 0.428988 0.418981 0.448316\n",
"1 Puppeteer 53.665284 0.408257 0.400046 0.423888\n",
"7 Rest 50.600000 0.354953 0.346279 0.377394\n",
"6 Playwright 39.500000 0.338666 0.331453 0.352773"
" Engine Coverage (Success Rate %) Quality (F1) Recall \\\n",
"3 Firecrawl 80.900000 0.675777 0.678596 \n",
"13 Oxylabs AI Studio 80.900000 0.607602 0.608424 \n",
"5 Exa 76.300000 0.526801 0.506211 \n",
"8 Tavily 67.600000 0.501143 0.498865 \n",
"2 Scraperapi 63.513514 0.449829 0.442166 \n",
"4 Zyte 62.878788 0.468174 0.457769 \n",
"12 ScrapingBee 60.640732 0.450543 0.443617 \n",
"9 Apify 60.215054 0.416620 0.408899 \n",
"10 Crawl4ai 58.000000 0.453349 0.442768 \n",
"11 Selenium 55.040323 0.404574 0.396912 \n",
"0 Scrapy 54.000000 0.428988 0.418981 \n",
"1 Puppeteer 53.665284 0.408257 0.400046 \n",
"7 Rest 50.600000 0.354953 0.346279 \n",
"6 Playwright 39.500000 0.338666 0.331453 \n",
"\n",
" Precision \n",
"3 0.675905 \n",
"13 0.616571 \n",
"5 0.561197 \n",
"8 0.508261 \n",
"2 0.466340 \n",
"4 0.487554 \n",
"12 0.466697 \n",
"9 0.430915 \n",
"10 0.472855 \n",
"11 0.419732 \n",
"0 0.448316 \n",
"1 0.423888 \n",
"7 0.377394 \n",
"6 0.352773 "
]
},
"execution_count": 50,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
3 changes: 3 additions & 0 deletions engines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class ScrapeResult(TypedDict, total=False):
# Tavily
"tavily_api",

# Oxylabs AI Scraper
"oxylabs_ai_scraper",

# Zyte
"zyte_api",
]
Expand Down
63 changes: 63 additions & 0 deletions engines/oxylabs_ai_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import os
from datetime import datetime
from typing import Optional

from dotenv import load_dotenv
from oxylabs_ai_studio.apps.ai_scraper import AiScraper

from .base import Scraper, ScrapeResult

load_dotenv()

class OxylabsAiScraper(Scraper):
    """Scrapes web pages using the Oxylabs AI Scraper asynchronously.

    Pages are requested as markdown with JavaScript rendering disabled. The
    API key is read from the ``OXYLABS_AI_STUDIO_API_KEY`` environment
    variable when the scraper is constructed.
    """

    def __init__(self):
        """Create the underlying ``AiScraper`` client.

        Raises:
            ValueError: If ``OXYLABS_AI_STUDIO_API_KEY`` is unset or empty.
        """
        self.api_key = os.getenv("OXYLABS_AI_STUDIO_API_KEY")
        if not self.api_key:
            raise ValueError("OXYLABS_AI_STUDIO_API_KEY not set in environment.")
        self.scraper = AiScraper(api_key=self.api_key)

    @staticmethod
    def _extract_content(data: object) -> Optional[str]:
        """Return the text carried by ``data``, or ``None`` if there is none.

        A string is returned as-is. For a dict, the ``"content"`` key is
        preferred, then ``"markdown"``, and finally the dict's string form.
        Any other type (including ``None``) yields ``None``.
        """
        if isinstance(data, str):
            return data
        if isinstance(data, dict):
            return data.get("content") or data.get("markdown") or str(data)
        return None

    async def scrape(self, url: str, run_id: str) -> ScrapeResult:
        """Scrape ``url`` and return the outcome as a ``ScrapeResult``.

        This method does not raise: any exception from the client call or
        from reading the result is converted into a record with
        ``status_code=500`` and the exception text in ``error``.

        Args:
            url: Address of the page to scrape.
            run_id: Identifier of the benchmark run, copied into the result.

        Returns:
            A ``ScrapeResult`` whose ``content`` is the extracted markdown
            (or ``None``) and whose ``error`` is always set when no content
            was obtained.
        """
        try:
            result = await self.scraper.scrape_async(
                url=url,
                output_format="markdown",
                render_javascript=False,
            )

            content = self._extract_content(result.data)
            content_size = len(content.encode("utf-8")) if content else 0

            # Never report an empty scrape silently: prefer the message
            # supplied with the result, otherwise fall back to a generic
            # explanation so the record is distinguishable from a success.
            error = None
            if not content:
                error = result.message or "Oxylabs AI Scraper returned no content."

            return ScrapeResult(
                run_id=run_id,
                scraper="oxylabs_ai_scraper",
                url=url,
                # The call returned without raising and no HTTP status is
                # read from the result here, so success is assumed.
                status_code=200,
                error=error,
                content_size=content_size,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=content,
            )

        except Exception as e:
            # Deliberate catch-all boundary: a failed scrape is recorded as
            # a result row rather than propagated to the caller.
            return ScrapeResult(
                run_id=run_id,
                scraper="oxylabs_ai_scraper",
                url=url,
                status_code=500,
                error=str(e),
                content_size=0,
                format="markdown",
                created_at=datetime.now().isoformat(),
                content=None,
            )
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ selenium
exa-py
tavily-python
plotly
nbformat
nbformat
oxylabs-ai-studio
6 changes: 6 additions & 0 deletions runs/results/oxylabs_ai_scraper_quality.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"success_rate": 0.809,
"avg_recall": 0.6084242261193343,
"avg_precision": 0.6165712309014437,
"avg_f1": 0.6076015997608042
}