Pre/beta - Unit Tests #964

Merged: 1 commit, merged on Apr 14, 2025

Changes from all commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -192,3 +192,6 @@ cython_debug/
.DS_Store

dev.ipynb
+
+# CodeBeaver reports and artifacts
+.codebeaver
1,805 changes: 902 additions & 903 deletions examples/ScrapegraphAI_cookbook.ipynb

Large diffs are not rendered by default.

@@ -2,7 +2,6 @@
Basic example of scraping pipeline using Code Generator with schema
"""

-import json
from typing import List

from dotenv import load_dotenv
3 changes: 0 additions & 3 deletions examples/custom_graph/ollama/custom_graph_ollama.py
@@ -2,16 +2,13 @@
Example of custom graph using existing nodes
"""

-import os
-
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from scrapegraphai.graphs import BaseGraph
from scrapegraphai.nodes import (
FetchNode,
GenerateAnswerNode,
ParseNode,
-RAGNode,
RobotsNode,
)

1 change: 0 additions & 1 deletion examples/extras/chromium_selenium.py
@@ -9,7 +9,6 @@
ChromiumLoader,
)
from scrapegraphai.graphs import SmartScraperGraph
-from scrapegraphai.utils import prettify_exec_info

# Load environment variables for API keys
load_dotenv()
1 change: 0 additions & 1 deletion examples/extras/no_cut.py
@@ -3,7 +3,6 @@
"""

import json
-import os

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
2 changes: 1 addition & 1 deletion examples/extras/serch_graph_scehma.py
@@ -40,7 +40,7 @@ class Ceos(BaseModel):
# ************************************************

search_graph = SearchGraph(
-prompt=f"Who is the ceo of Appke?",
+prompt="Who is the ceo of Appke?",
schema=Ceos,
config=graph_config,
)
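This hunk drops a needless `f` prefix: the literal contains no replacement field, so the prefix is inert and linters flag it (Ruff's rule F541 is the usual one; which tool ran here is an assumption, the PR doesn't say). A minimal sketch of the distinction:

```python
# Hypothetical example; only the second string needs the f prefix.
company = "Appke"
plain = "Who is the ceo of Appke?"            # no {...} field: plain literal
formatted = f"Who is the ceo of {company}?"   # {...} field: f-string required
```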
@@ -2,8 +2,6 @@
Basic example of scraping pipeline using ScriptCreatorGraph
"""

-import os
-
from dotenv import load_dotenv

from scrapegraphai.graphs import ScriptCreatorMultiGraph
4 changes: 1 addition & 3 deletions scrapegraphai/builders/graph_builder.py
@@ -113,9 +113,7 @@ def _create_extraction_chain(self):
{nodes_description}

Based on the user's input: "{input}", identify the essential nodes required for the task and suggest a graph configuration that outlines the flow between the chosen nodes.
""".format(
nodes_description=self.nodes_description, input="{input}"
)
""".format(nodes_description=self.nodes_description, input="{input}")
extraction_prompt = ChatPromptTemplate.from_template(
create_graph_prompt_template
)
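Beyond the reflow, the `.format(...)` call above uses a small trick: `nodes_description` is substituted immediately, while passing `input="{input}"` replaces the `{input}` placeholder with itself, so it survives formatting and is left for `ChatPromptTemplate` to fill at run time. A self-contained sketch (values are illustrative):

```python
# Pre-format one placeholder, keep the other for a prompt-template engine.
template = """Nodes available: {nodes_description}
Based on the user's input: "{input}", suggest a graph configuration.
""".format(nodes_description="FetchNode, ParseNode", input="{input}")

assert "FetchNode" in template   # resolved now
assert "{input}" in template     # deliberately left unresolved
```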
2 changes: 1 addition & 1 deletion scrapegraphai/docloaders/scrape_do.py
@@ -2,10 +2,10 @@
Scrape_do module
"""

+import os
import urllib.parse

import requests
-import os
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
6 changes: 3 additions & 3 deletions scrapegraphai/graphs/abstract_graph.py
@@ -177,7 +177,7 @@ def _create_llm(self, llm_config: dict) -> object:
]
if len(possible_providers) <= 0:
raise ValueError(
f"""Provider {llm_params['model_provider']} is not supported.
f"""Provider {llm_params["model_provider"]} is not supported.
If possible, try to use a model instance instead."""
)
llm_params["model_provider"] = possible_providers[0]
@@ -190,7 +190,7 @@ def _create_llm(self, llm_config: dict) -> object:

if llm_params["model_provider"] not in known_providers:
raise ValueError(
f"""Provider {llm_params['model_provider']} is not supported.
f"""Provider {llm_params["model_provider"]} is not supported.
If possible, try to use a model instance instead."""
)

@@ -201,7 +201,7 @@ def _create_llm(self, llm_config: dict) -> object:
]
except KeyError:
print(
f"""Max input tokens for model {llm_params['model_provider']}/{llm_params['model']} not found,
f"""Max input tokens for model {llm_params["model_provider"]}/{llm_params["model"]} not found,
please specify the model_tokens parameter in the llm section of the graph configuration.
Using default token size: 8192"""
)
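The three hunks above only change quote style inside f-string subscripts. Since the literals are triple-quoted, a single `"` inside a replacement field can't close the string, so `llm_params['model_provider']` and `llm_params["model_provider"]` render identically; this reads as formatter-driven quote normalization (the formatter itself is an assumption, the PR doesn't name it). A quick equivalence check:

```python
llm_params = {"model_provider": "openai"}
old_style = f"""Provider {llm_params['model_provider']} is not supported."""
new_style = f"""Provider {llm_params["model_provider"]} is not supported."""
assert old_style == new_style  # identical output; only the source text differs
```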
1 change: 0 additions & 1 deletion scrapegraphai/graphs/csv_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

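This hunk, like the near-identical ones in the other multi-graph classes below, only deletes the blank line between the `__init__` signature and its first statement, which Black/Ruff-style formatters strip (again an assumption about the tooling). Schematically:

```python
from copy import deepcopy as safe_deepcopy  # stand-in for the repo's helper

# Before: blank line straight after the signature
def init_before(prompt, config, schema=None):

    return safe_deepcopy(config)

# After: the body starts immediately
def init_after(prompt, config, schema=None):
    return safe_deepcopy(config)
```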
1 change: 0 additions & 1 deletion scrapegraphai/graphs/json_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

1 change: 0 additions & 1 deletion scrapegraphai/graphs/omni_search_graph.py
@@ -44,7 +44,6 @@ class OmniSearchGraph(AbstractGraph):
def __init__(
self, prompt: str, config: dict, schema: Optional[Type[BaseModel]] = None
):
-
self.max_results = config.get("max_results", 3)

self.copy_config = safe_deepcopy(config)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/script_creator_multi_graph.py
@@ -48,7 +48,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_concat_graph.py
@@ -53,7 +53,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)

1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_graph.py
@@ -55,7 +55,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.max_results = config.get("max_results", 3)
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/smart_scraper_multi_lite_graph.py
@@ -55,7 +55,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/graphs/xml_scraper_multi_graph.py
@@ -49,7 +49,6 @@ def __init__(
config: dict,
schema: Optional[Type[BaseModel]] = None,
):
-
self.copy_config = safe_deepcopy(config)
self.copy_schema = deepcopy(schema)
super().__init__(prompt, config, source, schema)
1 change: 0 additions & 1 deletion scrapegraphai/models/openai_tts.py
@@ -19,7 +19,6 @@ class OpenAITextToSpeech:
"""

def __init__(self, tts_config: dict):
-
self.client = OpenAI(
api_key=tts_config.get("api_key"), base_url=tts_config.get("base_url", None)
)
4 changes: 1 addition & 3 deletions scrapegraphai/nodes/base_node.py
@@ -54,7 +54,6 @@ def __init__(
min_input_len: int = 1,
node_config: Optional[dict] = None,
):
-
self.node_name = node_name
self.input = input
self.output = output
@@ -197,7 +196,6 @@ def evaluate_simple_expression(exp: str) -> List[str]:
"""Evaluate an expression without parentheses."""

for or_segment in exp.split("|"):
-
and_segment = or_segment.split("&")
if all(elem.strip() in state for elem in and_segment):
return [
@@ -226,7 +224,7 @@ def evaluate_expression(expression: str) -> List[str]:
raise ValueError(
f"""No state keys matched the expression.
Expression was {expression}.
-State contains keys: {', '.join(state.keys())}"""
+State contains keys: {", ".join(state.keys())}"""
)

final_result = []
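For context on `evaluate_simple_expression` touched above: `BaseNode` resolves input specifiers such as `"user_prompt & url | doc"` against the state dict, where `&` requires every key and `|` tries alternatives left to right. A standalone sketch of the parenthesis-free case, mirroring the split-on-`|`-then-`&` logic in the diff (the real method also handles parentheses and nesting):

```python
from typing import List

def evaluate_simple_expression(exp: str, state: dict) -> List[str]:
    """Return the first OR-alternative whose AND-ed keys all exist in state."""
    for or_segment in exp.split("|"):        # try each alternative in turn
        and_segment = or_segment.split("&")  # every key here must be present
        if all(elem.strip() in state for elem in and_segment):
            return [elem.strip() for elem in and_segment]
    return []

state = {"url": "https://example.com", "doc": "..."}
print(evaluate_simple_expression("user_prompt & url | doc", state))  # ['doc']
```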
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/concat_answers_node.py
@@ -36,8 +36,7 @@ def __init__(
)

def _merge_dict(self, items):
-
-return {"products": {f"item_{i+1}": item for i, item in enumerate(items)}}
+return {"products": {f"item_{i + 1}": item for i, item in enumerate(items)}}

def execute(self, state: dict) -> dict:
"""
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/description_node.py
@@ -58,7 +58,7 @@ def execute(self, state: dict) -> dict:
template=DESCRIPTION_NODE_PROMPT,
partial_variables={"content": chunk.get("document")},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model

async_runner = RunnableParallel(**chains_dict)
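The `f"chunk{i+1}"` → `f"chunk{i + 1}"` edits here and in the generate-answer nodes below are whitespace-only; with PEP 701 (Python 3.12), f-string replacement fields are ordinary expressions that formatters can normalize, and the rendered string is unchanged. The surrounding fan-out pattern, one chain per chunk gathered by `RunnableParallel`, looks roughly like this sketch (assumes langchain-core; prompt text and inputs are placeholders):

```python
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableParallel

chunks = ["first chunk of the page", "second chunk of the page"]
chains_dict = {}
for i, chunk in enumerate(chunks):
    prompt = PromptTemplate(
        template="Describe this content:\n{content}",
        input_variables=[],
        partial_variables={"content": chunk},
    )
    chains_dict[f"chunk{i + 1}"] = prompt  # in the node: prompt | self.llm_model

# Runs every per-chunk chain concurrently; the result is keyed by chain name,
# e.g. {"chunk1": ..., "chunk2": ...}
async_runner = RunnableParallel(**chains_dict)
answers = async_runner.invoke({})
```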
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/generate_answer_csv_node.py
@@ -96,7 +96,6 @@ def execute(self, state):
doc = input_data[1]

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state):
},
)

-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model | output_parser

async_runner = RunnableParallel(**chains_dict)
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_from_image_node.py
@@ -85,7 +85,7 @@ async def execute_async(self, state: dict) -> dict:
raise ValueError(
f"""The model provided
is not supported. Supported models are:
-{', '.join(supported_models)}."""
+{", ".join(supported_models)}."""
)

api_key = self.node_config.get("config", {}).get("llm", {}).get("api_key", "")
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_node.py
@@ -221,7 +221,7 @@ def execute(self, state: dict) -> dict:
"format_instructions": format_instructions,
},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model
if output_parser:
chains_dict[chain_name] = chains_dict[chain_name] | output_parser
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -155,7 +155,7 @@ def execute(self, state: dict) -> dict:
"chunk_id": i + 1,
},
)
-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model

async_runner = RunnableParallel(**chains_dict)
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/generate_answer_omni_node.py
@@ -89,7 +89,6 @@ def execute(self, state: dict) -> dict:
imag_desc = input_data[2]

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
@@ -151,7 +150,7 @@ def execute(self, state: dict) -> dict:
},
)

-chain_name = f"chunk{i+1}"
+chain_name = f"chunk{i + 1}"
chains_dict[chain_name] = prompt | self.llm_model | output_parser

async_runner = RunnableParallel(**chains_dict)
3 changes: 1 addition & 2 deletions scrapegraphai/nodes/merge_answers_node.py
@@ -82,10 +82,9 @@ def execute(self, state: dict) -> dict:

answers_str = ""
for i, answer in enumerate(answers):
-answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
+answers_str += f"CONTENT WEBSITE {i + 1}: {answer}\n"

if self.node_config.get("schema", None) is not None:
-
if isinstance(self.llm_model, (ChatOpenAI, ChatMistralAI)):
self.llm_model = self.llm_model.with_structured_output(
schema=self.node_config["schema"]
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/merge_generated_scripts_node.py
@@ -64,7 +64,7 @@ def execute(self, state: dict) -> dict:
scripts_str = ""
for i, script in enumerate(scripts):
scripts_str += "-----------------------------------\n"
-scripts_str += f"SCRIPT URL {i+1}\n"
+scripts_str += f"SCRIPT URL {i + 1}\n"
scripts_str += "-----------------------------------\n"
scripts_str += script

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/parse_node.py
@@ -122,7 +122,7 @@ def execute(self, state: dict) -> dict:
state.update({self.output[0]: chunks})
state.update({"parsed_doc": chunks})
state.update({"content": chunks})

if self.parse_urls:
state.update({self.output[1]: link_urls})
state.update({self.output[2]: img_urls})
1 change: 0 additions & 1 deletion scrapegraphai/nodes/search_link_node.py
@@ -122,7 +122,6 @@ def execute(self, state: dict) -> dict:
)
):
try:
-
links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))

if not self.filter_links:
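The `try:` tidied above wraps the node's link extraction, a plain regex scan of each chunk: `https?://` followed by any run of characters that can't end a URL in markup (`[^\s"<>\]]+`). A self-contained sketch:

```python
import re

page_content = 'See <a href="https://example.com/docs">docs</a> or http://example.org/a?b=1'
links = re.findall(r'https?://[^\s"<>\]]+', page_content)
print(links)  # ['https://example.com/docs', 'http://example.org/a?b=1']
```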
2 changes: 1 addition & 1 deletion scrapegraphai/utils/__init__.py
@@ -1,5 +1,5 @@
"""
-__init__.py file for utils folder
+__init__.py file for utils folder
"""

from .cleanup_code import extract_code