diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 0c362a8..8431e3e 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -6,15 +6,11 @@ name: Upload Python Package on: release: types: [published] - paths: - - 'scrapegraph-py/**' jobs: deploy: runs-on: ubuntu-latest - # Only run if scrapegraph-py has changes - if: contains(github.event.release.body, 'scrapegraph-py/') steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cb2e3a6..2f7a203 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -4,15 +4,11 @@ on: branches: - main - pre/* - paths: - - 'scrapegraph-py/**' jobs: build: name: Build runs-on: ubuntu-latest - # Only run if scrapegraph-py has changes - if: contains(github.event.head_commit.modified, 'scrapegraph-py/') || contains(github.event.head_commit.added, 'scrapegraph-py/') || contains(github.event.head_commit.removed, 'scrapegraph-py/') steps: - name: Install git run: | diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md index 9ed7150..fe68c7e 100644 --- a/scrapegraph-js/README.md +++ b/scrapegraph-js/README.md @@ -35,6 +35,7 @@ yarn add scrapegraph-js ```javascript import { smartScraper } from 'scrapegraph-js'; +import 'dotenv/config'; // Initialize variables const apiKey = process.env.SGAI_APIKEY; // Set your API key as an environment variable @@ -105,12 +106,43 @@ const schema = z.object({ })(); ``` +### Scraping local HTML + +Extract structured data from local HTML content + +```javascript +import { localScraper } from 'scrapegraph-js'; + +const apiKey = 'your_api_key'; +const prompt = 'What does the company do?'; + +const websiteHtml = ` + +
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <div class="contact">
+        <p>Email: contact@example.com</p>
+      </div>
+    </body>
+  </html>
+ + `; +(async () => { + try { + const response = await localScraper(apiKey, websiteHtml, prompt); + console.log(response); + } catch (error) { + console.error(error); + } +})(); +``` + ### Markdownify + Converts a webpage into clean, well-structured markdown format. + ```javascript import { smartScraper } from 'scrapegraph-js'; -const apiKey = "your_api_key"; +const apiKey = 'your_api_key'; const url = 'https://scrapegraphai.com/'; (async () => { @@ -123,7 +155,6 @@ const url = 'https://scrapegraphai.com/'; })(); ``` - ### Checking API Credits ```javascript diff --git a/scrapegraph-js/examples/localScraper_example.js b/scrapegraph-js/examples/localScraper_example.js new file mode 100644 index 0000000..95552e5 --- /dev/null +++ b/scrapegraph-js/examples/localScraper_example.js @@ -0,0 +1,33 @@ +import { localScraper, getLocalScraperRequest } from 'scrapegraph-js'; +import 'dotenv/config'; + +// localScraper function example +const apiKey = process.env.SGAI_APIKEY; +const prompt = 'What does the company do?'; + +const websiteHtml = ` + +
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <div class="contact">
+        <p>Email: contact@example.com</p>
+      </div>
+    </body>
+  </html>
+ + `; + +try { + const response = await localScraper(apiKey, websiteHtml, prompt); + console.log(response); +} catch (error) { + console.error(error); +} + +// getLocalScraperFunctionExample +const requestId = 'b8d97545-9ed3-441b-a01f-4b661b4f0b4c'; + +try { + const response = await getLocalScraperRequest(apiKey, requestId); + console.log(response); +} catch (error) { + console.log(error); +} diff --git a/scrapegraph-js/examples/schema_localScraper_example.js b/scrapegraph-js/examples/schema_localScraper_example.js new file mode 100644 index 0000000..1de6344 --- /dev/null +++ b/scrapegraph-js/examples/schema_localScraper_example.js @@ -0,0 +1,28 @@ +import { localScraper } from 'scrapegraph-js'; +import { z } from 'zod'; +import 'dotenv/config'; + +// localScraper function example +const apiKey = process.env.SGAI_APIKEY; +const prompt = 'extract contact'; + +const websiteHtml = ` + +
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <div class="contact">
+        <p>Email: contact@example.com</p>
+      </div>
+    </body>
+  </html>
+ + `; + +const schema = z.object({ + contact: z.string().describe('email contact'), +}); + +try { + const response = await localScraper(apiKey, websiteHtml, prompt, schema); + console.log(response); +} catch (error) { + console.error(error); +} diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js index ca4dbb7..1e4c1c5 100644 --- a/scrapegraph-js/index.js +++ b/scrapegraph-js/index.js @@ -1,4 +1,5 @@ export { smartScraper, getSmartScraperRequest } from './src/smartScraper.js'; export { markdownify, getMarkdownifyRequest } from './src/markdownify.js'; +export { localScraper, getLocalScraperRequest } from './src/localScraper.js'; export { getCredits } from './src/credits.js'; export { sendFeedback } from './src/feedback.js'; diff --git a/scrapegraph-js/src/localScraper.js b/scrapegraph-js/src/localScraper.js new file mode 100644 index 0000000..412aac4 --- /dev/null +++ b/scrapegraph-js/src/localScraper.js @@ -0,0 +1,66 @@ +import axios from 'axios'; +import handleError from './utils/handleError.js'; +import { ZodType } from 'zod'; +import { zodToJsonSchema } from 'zod-to-json-schema'; + +/** + * Extract structured data from local HTML content using ScrapeGraph AI. + * + * @param {string} apiKey - The API key for ScrapeGraph AI. + * @param {string} websiteHtml - HTML content as a string from the local web page to scrape. + * @param {string} prompt - A natural language description of the data to extract. + * @param {Object} [schema] - (Optional) Schema object defining the structure of the desired output. + * @returns {Promise} A JSON string containing the extracted data, formatted to match the schema. + * @throws {Error} If an HTTP error or validation issue occurs. 
+ */ +export async function localScraper(apiKey, websiteHtml, prompt, schema = null) { + const endpoint = 'https://api.scrapegraphai.com/v1/localscraper'; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + 'Content-Type': 'application/json', + }; + + const payload = { + website_html: websiteHtml, + user_prompt: prompt, + }; + + if (schema) { + if (schema instanceof ZodType) { + payload.output_schema = zodToJsonSchema(schema); + } else { + throw new Error('The schema must be an instance of a valid Zod schema'); + } + } + + try { + const response = await axios.post(endpoint, payload, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} + +/** + * Retrieve the status or result of a localScraper request, including results of previous requests. + * + * @param {string} apiKey - The API key for ScrapeGraph AI. + * @param {string} requestId - The unique ID associated with the localScraper request. + * @returns {Promise} A JSON string containing the status or result of the scraping request. + * @throws {Error} If an error occurs while retrieving the request details. + */ +export async function getLocalScraperRequest(apiKey, requestId) { + const endpoint = 'https://api.scrapegraphai.com/v1/localscraper/' + requestId; + const headers = { + 'accept': 'application/json', + 'SGAI-APIKEY': apiKey, + }; + + try { + const response = await axios.get(endpoint, { headers }); + return response.data; + } catch (error) { + handleError(error); + } +} diff --git a/scrapegraph-js/src/markdownify.js b/scrapegraph-js/src/markdownify.js index 5a1d4e5..14ae0e3 100644 --- a/scrapegraph-js/src/markdownify.js +++ b/scrapegraph-js/src/markdownify.js @@ -9,7 +9,7 @@ import handleError from './utils/handleError.js'; * @returns {Promise} A promise that resolves to the markdown representation of the webpage. * @throws {Error} Throws an error if the HTTP request fails. 
*/ -export async function markdownify(apiKey, url){ +export async function markdownify(apiKey, url) { const endpoint = 'https://api.scrapegraphai.com/v1/markdownify'; const headers = { 'accept': 'application/json', @@ -24,7 +24,7 @@ export async function markdownify(apiKey, url){ const response = await axios.post(endpoint, payload, { headers }); return response.data; } catch (error) { - handleError(error) + handleError(error); } } @@ -36,7 +36,7 @@ export async function markdownify(apiKey, url){ * @returns {Promise} A promise that resolves with details about the status or outcome of the specified request. * @throws {Error} Throws an error if the HTTP request fails. */ -export async function getMarkdownifyRequest(apiKey, requestId){ +export async function getMarkdownifyRequest(apiKey, requestId) { const endpoint = 'https://api.scrapegraphai.com/v1/markdownify/' + requestId; const headers = { 'accept': 'application/json', @@ -47,6 +47,6 @@ export async function getMarkdownifyRequest(apiKey, requestId){ const response = await axios.get(endpoint, { headers }); return response.data; } catch (error) { - handleError(error) + handleError(error); } -} \ No newline at end of file +} diff --git a/scrapegraph-py/CHANGELOG.md b/scrapegraph-py/CHANGELOG.md index 26cc6f1..6654cac 100644 --- a/scrapegraph-py/CHANGELOG.md +++ b/scrapegraph-py/CHANGELOG.md @@ -1,3 +1,48 @@ +## [1.9.0-beta.5](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.4...v1.9.0-beta.5) (2025-01-03) + + +### Bug Fixes + +* updated hatchling version ([740933a](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/740933aff79a5873e6d1c633afcedb674d1f4cf0)) + +## [1.9.0-beta.4](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.3...v1.9.0-beta.4) (2025-01-03) + + +### Bug Fixes + +* improve api desc ([62243f8](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/62243f84384ae238c0bd0c48abc76a6b99376c74)) + +## 
[1.9.0-beta.3](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.2...v1.9.0-beta.3) (2024-12-10) + + +### Bug Fixes + +* come back to py 3.10 ([26d3a75](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/26d3a75ed973590e21d55c985bf71f3905a3ac0e)) + +## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-12-10) + + +### Bug Fixes + +* add new python compatibility ([77b67f6](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/77b67f646d75abd3a558b40cb31c52c12cc7182e)) + +## [1.9.0-beta.1](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.8.0...v1.9.0-beta.1) (2024-12-10) + + +### Features + +* add localScraper functionality ([8701eb2](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/8701eb2ca7f108b922eb1617c850a58c0f88f8f9)) +* revert to old release ([d88a3ac](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/d88a3ac6969a0abdf1f6b8eccde9ad8284d41d20)) + + +### Bug Fixes + +* .toml file ([e719881](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/e7198817d8dac802361ab84bc4d5d961fb926767)) +* add revert ([09257e0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/09257e08246d8aee96b3944ac14cc14b88e5f818)) +* minor fix version ([0b972c6](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/0b972c69a9ea843d8ec89327f35c287b0d7a2bb4)) +* pyproject ([2440f7f](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/2440f7f2a5179c6e3a86faf4eefa1d5edf7524c8)) +* python version ([24366b0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/24366b08eefe0789da9a0ccafb8058e8744ee58b)) + ## [1.8.0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.7.0...v1.8.0) (2024-12-08) diff --git a/scrapegraph-py/pyproject.toml b/scrapegraph-py/pyproject.toml index da5ef04..f885b92 100644 --- a/scrapegraph-py/pyproject.toml +++ b/scrapegraph-py/pyproject.toml @@ -92,7 +92,7 @@ disallow_untyped_calls = true ignore_missing_imports = true [build-system] 
-requires = ["hatchling"] +requires = ["hatchling==1.26.3"] build-backend = "hatchling.build" [tool.poe.tasks] diff --git a/scrapegraph-py/scrapegraph_py/utils/helpers.py b/scrapegraph-py/scrapegraph_py/utils/helpers.py index b5e3c28..7e5d7d4 100644 --- a/scrapegraph-py/scrapegraph_py/utils/helpers.py +++ b/scrapegraph-py/scrapegraph_py/utils/helpers.py @@ -17,7 +17,7 @@ def validate_api_key(api_key: str) -> bool: UUID(uuid_part) except ValueError: raise ValueError( - "Invalid API key format. API key must be 'sgai-' followed by a valid UUID." + "Invalid API key format. API key must be 'sgai-' followed by a valid UUID. You can get one at https://dashboard.scrapegraphai.com/" ) return True diff --git a/scrapegraph-py/uv.lock b/scrapegraph-py/uv.lock index 1990785..bb5cf94 100644 --- a/scrapegraph-py/uv.lock +++ b/scrapegraph-py/uv.lock @@ -557,11 +557,11 @@ wheels = [ [[package]] name = "idna" -version = "3.10" +version = "3.9" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } +sdist = { url = "https://files.pythonhosted.org/packages/00/6f/93e724eafe34e860d15d37a4f72a1511dd37c43a76a8671b22a15029d545/idna-3.9.tar.gz", hash = "sha256:e5c5dafde284f26e9e0f28f6ea2d6400abd5ca099864a67f576f3981c6476124", size = 191636 } wheels = [ - { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, + { url = "https://files.pythonhosted.org/packages/6d/15/61933d1999bc5ad8cad612d67f02fa5b16a423076ea0816e39c2e797af12/idna-3.9-py3-none-any.whl", hash = "sha256:69297d5da0cc9281c77efffb4e730254dd45943f45bbfb461de5991713989b1e", size = 71671 }, ] [[package]]