Pre/beta - Unit Tests #964

Merged: 1 commit, merged on Apr 14, 2025

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -192,3 +192,6 @@ cython_debug/
.DS_Store

dev.ipynb
+
+# CodeBeaver reports and artifacts
+.codebeaver
1,805 changes: 902 additions & 903 deletions examples/ScrapegraphAI_cookbook.ipynb

Large diffs are not rendered by default.

@@ -2,7 +2,6 @@
Basic example of scraping pipeline using Code Generator with schema
"""

-import json
from typing import List

from dotenv import load_dotenv
3 changes: 0 additions & 3 deletions examples/custom_graph/ollama/custom_graph_ollama.py
@@ -2,16 +2,13 @@
Example of custom graph using existing nodes
"""

-import os
-
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import (
FetchNode,
GenerateAnswerNode,
ParseNode,
-RAGNode,
RobotsNode,
)

1 change: 0 additions & 1 deletion examples/extras/chromium_selenium.py
@@ -9,7 +9,6 @@
ChromiumLoader,
)
from scrapegraphai.graphs import SmartScraperGraph
-from scrapegraphai.utils import prettify_exec_info

# Load environment variables for API keys
load_dotenv()
1 change: 0 additions & 1 deletion examples/extras/no_cut.py
@@ -3,7 +3,6 @@
"""

import json
-import os

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
2 changes: 1 addition & 1 deletion examples/extras/serch_graph_scehma.py
@@ -40,7 +40,7 @@ class Ceos(BaseModel):
# ************************************************

search_graph = SearchGraph(
-prompt=f"Who is the ceo of Appke?",
+prompt="Who is the ceo of Appke?",
schema=Ceos,
config=graph_config,
)
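This hunk drops a needless `f` prefix: the literal contains no replacement field, so the prefix is inert and linters flag it (Ruff's rule F541 is the usual one; which tool ran here is an assumption, the PR doesn't say). A minimal sketch of the distinction:

```python
# Hypothetical example; only the second string needs the f prefix.
company = "Appke"
plain = "Who is the ceo of Appke?"            # no {...} field: plain literal
formatted = f"Who is the ceo of {company}?"   # {...} field: f-string required
```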
@@ -2,8 +2,6 @@
Basic example of scraping pipeline using ScriptCreatorGraph
"""

-import os
-
from dotenv import load_dotenv

from scrapegraphai.graphs import ScriptCreatorMultiGraph
4 changes: 1 addition & 3 deletions scrapegraphai/builders/graph_builder.py
@@ -113,9 +113,7 @@ def _create_extraction_chain(self):
{nodes_description}

Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
""".format(
nodes_description=self.nodes_description, input="{input}"
)
""".format(nodes_description=self.nodes_description, input="{input}")
extraction_prompt = ChatPromptTemplate.from_template(
create_graph_prompt_template
)
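Beyond the reflow, the `.format(...)` call above uses a small trick: `nodes_description` is substituted immediately, while passing `input="{input}"` replaces the `{input}` placeholder with itself, so it survives formatting and is left for `ChatPromptTemplate` to fill at run time. A self-contained sketch (values are illustrative):

```python
# Pre-format one placeholder, keep the other for a prompt-template engine.
template = """Nodes available: {nodes_description}
Based on the user's input: "{input}", suggest a graph configuration.
""".format(nodes_description="FetchNode, ParseNode", input="{input}")

assert "FetchNode" in template   # resolved now
assert "{input}" in template     # deliberately left unresolved
```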
2 changes: 1 addition & 1 deletion scrapegraphai/docloaders/scrape_do.py
@@ -2,10 +2,10 @@
Scrape_do module
"""

+import os
import urllib.parse

import requests
-import os
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6 changes: 3 additions & 3 deletions scrapegraphai/graphs/abstract_graph.py
@@ -177,7 +177,7 @@ def _create_llm(self, llm_config: dict) -> object:
]
if len(possible_providers) <= 0:
raise ValueError(
f"""Provider {llm_params['model_provider']} is not supported.
f"""Provider {llm_params["model_provider"]} is not supported.
If possible, try to use a model instance instead."""
)
llm_params["model_provider"] = possible_providers[0]
@@ -190,7 +190,7 @@ def _create_llm(self, llm_config: dict) -> object:

if llm_params["model_provider"] not in known_providers:
raise ValueError(
f"""Provider {llm_params['model_provider']} is not supported.
f"""Provider {llm_params["model_provider"]} is not supported.
If possible, try to use a model instance instead."""
)

@@ -201,7 +201,7 @@ def _create_llm(self, llm_config: dict) -> object:
]
except KeyError:
print(
f"""Max input tokens for model {llm_params['model_provider']}/{llm_params['model']} not found,
f"""Max input tokens for model {llm_params["model_provider"]}/{llm_params["model"]} not found,
please specify the model_tokens parameter in the llm section of the graph configuration.
Using default token size: 8192"""
)
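The three hunks above only change quote style inside f-string subscripts. Since the literals are triple-quoted, a single `"` inside a replacement field can't close the string, so `llm_params['model_provider']` and `llm_params["model_provider"]` render identically; this reads as formatter-driven quote normalization (the formatter itself is an assumption, the PR doesn't name it). A quick equivalence check:

```python
llm_params = {"model_provider": "openai"}
old_style = f"""Provider {llm_params['model_provider']} is not supported."""
new_style = f"""Provider {llm_params["model_provider"]} is not supported."""
assert old_style == new_style  # identical output; only the source text differs
```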
1 change: 0 additions & 1 deletion scrapegraphai/graphs/csv_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

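This hunk, like the near-identical ones in the other multi-graph classes below, only deletes the blank line between the `__init__` signature and its first statement, which Black/Ruff-style formatters strip (again an assumption about the tooling). Schematically:

```python
from copy import deepcopy as safe_deepcopy  # stand-in for the repo's helper

# Before: blank line straight after the signature
def init_before(prompt, config, schema=None):

    return safe_deepcopy(config)

# After: the body starts immediately
def init_after(prompt, config, schema=None):
    return safe_deepcopy(config)
```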
1 change: 0 additions & 1 deletion scrapegraphai/graphs/json_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

1 change: 0 additions & 1 deletion scrapegraphai/graphs/omni_search_graph.py
@@ -44,7 +44,6 @@ class OmniSearchGraph(AbstractGraph):
def __init__(
self, prompt: str, config: dict, schema: Optional[Type[BaseModel]] = None
):
-
self.max_results = config.get("max_results", 3)

self.copy_config = safe_deepcopy(config)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/script_creator_multi_graph.py
@@ -48,7 +48,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
@@ -53,7 +53,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_graph.py
@@ -55,7 +55,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.max_results = config.get("max_results", 3)
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_lite_graph.py
@@ -55,7 +55,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/xml_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/models/openai_tts.py
@@ -19,7 +19,6 @@ class OpenAITextToSpeech:
"""

def __init__(self, tts_config: dict):
-
self.client = OpenAI(
api_key=tts_config.get("api_key"), base_url=tts_config.get("base_url", None)
)
4 changes: 1 addition & 3 deletions scrapegraphai/nodes/base_node.py
@@ -54,7 +54,6 @@ def __init__(
min_input_len: int = 1,
node_config: Optional[dict] = None,
):
-
self.node_name = node_name
self.input = input
self.output = output
@@ -197,7 +196,6 @@ def evaluate_simple_expression(exp: str) -> List[str]:
"""Evaluate an expression without parentheses."""

for or_segment in exp.split("|"):
-
and_segment = or_segment.split("&")
if all(elem.strip() in state for elem in and_segment):
return [
@@ -226,7 +224,7 @@ def evaluate_expression(expression: str) -> List[str]:
raise ValueError(
f"""No state keys matched the expression.
Expression was {expression}.
-State contains keys: {', '.join(state.keys())}"""
+State contains keys: {", ".join(state.keys())}"""
)

final_result = []
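For context on `evaluate_simple_expression` touched above: `BaseNode` resolves input specifiers such as `"user_prompt & url | doc"` against the state dict, where `&` requires every key and `|` tries alternatives left to right. A standalone sketch of the parenthesis-free case, mirroring the split-on-`|`-then-`&` logic in the diff (the real method also handles parentheses and nesting):

```python
from typing import List

def evaluate_simple_expression(exp: str, state: dict) -> List[str]:
    """Return the first OR-alternative whose AND-ed keys all exist in state."""
    for or_segment in exp.split("|"):        # try each alternative in turn
        and_segment = or_segment.split("&")  # every key here must be present
        if all(elem.strip() in state for elem in and_segment):
            return [elem.strip() for elem in and_segment]
    return []

state = {"url": "https://example.com", "doc": "..."}
print(evaluate_simple_expression("user_prompt & url | doc", state))  # ['doc']
```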
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/concat_answers_node.py
@@ -36,8 +36,7 @@ def __init__(
)

def _merge_dict(self, items):
-
-return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}}
+return {"products": {f"item_{i + 1}": item for i, item in enumerate(items)}}

def execute(self, state: dict) -> dict:
"""
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/description_node.py
@@ -58,7 +58,7 @@ def execute(self, state: dict) -> dict:
template=DESCRIPTION_NODE_PROMPT,
partial_variables={"content": chunk.get("document")},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model

async_runner = RunnableParallel(**chains_dict)
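The `f"chunk{i+1}"` → `f"chunk{i + 1}"` edits here and in the generate-answer nodes below are whitespace-only; with PEP 701 (Python 3.12), f-string replacement fields are ordinary expressions that formatters can normalize, and the rendered string is unchanged. The surrounding fan-out pattern, one chain per chunk gathered by `RunnableParallel`, looks roughly like this sketch (assumes langchain-core; prompt text and inputs are placeholders):

```python
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel

chunks = ["first chunk of the page", "second chunk of the page"]
chains_dict = {}
for i, chunk in enumerate(chunks):
    prompt = PromptTemplate(
        template="Describe this content:\n{content}",
        input_variables=[],
        partial_variables={"content": chunk},
    )
    chains_dict[f"chunk{i + 1}"] = prompt  # in the node: prompt | self.llm_model

# Runs every per-chunk chain concurrently; the result is keyed by chain name,
# e.g. {"chunk1": ..., "chunk2": ...}
async_runner = RunnableParallel(**chains_dict)
answers = async_runner.invoke({})
```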
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/generate_answer_csv_node.py
@@ -96,7 +96,6 @@ def execute(self, state):
doc = input_data[1]

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state):
},
)

-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model | output_parser

async_runner = RunnableParallel(**chains_dict)
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_from_image_node.py
@@ -85,7 +85,7 @@ async def execute_async(self, state: dict) -> dict:
raise ValueError(
f"""The model provided
is not supported. Supported models are:
-{', '.join(supported_models)}."""
+{", ".join(supported_models)}."""
)

api_key = self.node_config.get("config", {}).get("llm", {}).get("api_key", "")
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_node.py
@@ -221,7 +221,7 @@ def execute(self, state: dict) -> dict:
"format_instructions": format_instructions,
},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model
if output_parser:
chains_dict[chain_name] = chains_dict[chain_name] | output_parser
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -155,7 +155,7 @@ def execute(self, state: dict) -> dict:
"chunk_id": i + 1,
},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model

async_runner = RunnableParallel(**chains_dict)
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/generate_answer_omni_node.py
@@ -89,7 +89,6 @@ def execute(self, state: dict) -> dict:
imag_desc = input_data[2]

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state: dict) -> dict:
},
)

-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model | output_parser

async_runner = RunnableParallel(**chains_dict)
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/merge_answers_node.py
@@ -82,10 +82,9 @@ def execute(self, state: dict) -> dict:

answers_str = ""
for i, answer in enumerate(answers):
-answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
+answers_str += f"CONTENT WEBSITE {i + 1}: {answer}\n"

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/merge_generated_scripts_node.py
@@ -64,7 +64,7 @@ def execute(self, state: dict) -> dict:
scripts_str = ""
for i, script in enumerate(scripts):
scripts_str += "-----------------------------------\n"
-scripts_str += f"SCRIPT URL {i+1}\n"
+scripts_str += f"SCRIPT URL {i + 1}\n"
scripts_str += "-----------------------------------\n"
scripts_str += script

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/parse_node.py
@@ -122,7 +122,7 @@ def execute(self, state: dict) -> dict:
state.update({self.output[0]: chunks})
state.update({"parsed_doc": chunks})
state.update({"content": chunks})

if self.parse_urls:
state.update({self.output[1]: link_urls})
state.update({self.output[2]: img_urls})
1 change: 0 additions & 1 deletion scrapegraphai/nodes/search_link_node.py
@@ -122,7 +122,6 @@ def execute(self, state: dict) -> dict:
)
):
try:
-
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))

if not self.filter_links:
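The `try:` tidied above wraps the node's link extraction, a plain regex scan of each chunk: `https?://` followed by any run of characters that can't end a URL in markup (`[^\s"<>\]]+`). A self-contained sketch:

```python
import re

page_content = 'See <a href="https://example.com/docs">docs</a> or http://example.org/a?b=1'
links = re.findall(r'https?://[^\s"<>\]]+', page_content)
print(links)  # ['https://example.com/docs', 'http://example.org/a?b=1']
```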
2 changes: 1 addition & 1 deletion scrapegraphai/utils/__init__.py
@@ -1,5 +1,5 @@
"""
-__init__.py file for utils folder
+__init__.py file for utils folder
"""

from .cleanup_code import extract_code