diff --git a/termscrape/.gitignore b/termscrape/.gitignore new file mode 100644 index 0000000..2f99b40 --- /dev/null +++ b/termscrape/.gitignore @@ -0,0 +1,98 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Virtual environments +venv/ +env/ +ENV/ +env.bak/ +venv.bak/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Playwright +.playwright/ + +# Scraped data +scraped/ +*.scraped/ +output/ +downloads/ + +# Logs +*.log +logs/ + +# Environment variables +.env +.env.local + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Project specific +*.md.bak +*.json.bak +temp/ +tmp/ diff --git a/termscrape/LICENSE b/termscrape/LICENSE new file mode 100644 index 0000000..2a9f4c7 --- /dev/null +++ b/termscrape/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 TermScrape Team + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/termscrape/README.md b/termscrape/README.md new file mode 100644 index 0000000..6b13b15 --- /dev/null +++ b/termscrape/README.md @@ -0,0 +1,413 @@ +# TermScrape + +**TermScrape** is a comprehensive, terminal-based web scraping tool inspired by Firecrawl. It runs entirely locally, handles JavaScript-heavy sites, supports recursive crawling, and integrates with local LLMs for intelligent data extraction. + +## Features + +- **Single URL Scraping**: Fetch and parse any webpage +- **Recursive Crawling**: Crawl entire websites with depth control +- **Multiple Output Formats**: Markdown, JSON, or plain text +- **JavaScript Support**: Headless browser rendering via Playwright +- **LLM Integration**: Extract structured data using local Ollama models +- **Ethical Scraping**: Respects robots.txt and includes rate limiting +- **CLI Interface**: Easy-to-use command-line interface with Click +- **Modular Design**: Clean, extensible Python codebase + +## Installation + +### Prerequisites + +- Python 3.10 or higher +- pip package manager + +### Quick Start + +```bash +# Clone the repository +git clone https://github.com/yourusername/termscrape +cd termscrape + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright browsers (for JavaScript support) +playwright install +``` + +### Optional: LLM Support + +For LLM-based extraction, install Ollama: + +```bash +# Install Ollama from https://ollama.ai + +# Pull a model +ollama pull llama3 + +# Install Python package +pip install ollama 
+``` + +## Usage + +### Scrape a Single URL + +```bash +# Basic markdown scrape +python -m src.main scrape --url https://example.com + +# With JavaScript rendering +python -m src.main scrape --url https://example.com --js + +# Output to JSON +python -m src.main scrape --url https://example.com --format json --output page.json + +# Extract data with LLM +python -m src.main scrape --url https://news.example.com \ + --llm-prompt "Extract article title, author, and date" +``` + +### Crawl a Website + +```bash +# Crawl with depth 3 +python -m src.main crawl --url https://example.com --depth 3 + +# Save to directory +python -m src.main crawl --url https://example.com --output ./scraped/ + +# Exclude patterns +python -m src.main crawl --url https://example.com \ + --exclude .pdf --exclude /login + +# Crawl with LLM extraction +python -m src.main crawl --url https://products.example.com \ + --depth 2 \ + --llm-prompt "Extract product names and prices" \ + --output products.json +``` + +### Command Reference + +#### `scrape` Command + +| Option | Description | Default | +|--------|-------------|---------| +| `--url` | URL to scrape (required) | - | +| `--format` | Output format: `markdown`, `json`, `text` | `markdown` | +| `--js` | Enable JavaScript rendering | `False` | +| `--output`, `-o` | Output file path | stdout | +| `--user-agent` | Custom User-Agent string | `TermScrape/1.0` | +| `--llm-prompt` | LLM extraction prompt | - | +| `--llm-model` | LLM model name | `llama3` | +| `--verbose` | Enable debug logging | `False` | + +#### `crawl` Command + +| Option | Description | Default | +|--------|-------------|---------| +| `--url` | Starting URL (required) | - | +| `--depth` | Maximum crawl depth | `1` | +| `--format` | Output format per page | `markdown` | +| `--js` | Enable JavaScript rendering | `False` | +| `--output`, `-o` | Output directory or file | stdout | +| `--user-agent` | Custom User-Agent string | `TermScrape/1.0` | +| `--exclude` | URL patterns to exclude 
(repeatable) | - | +| `--llm-prompt` | LLM extraction prompt | - | +| `--llm-model` | LLM model name | `llama3` | +| `--verbose` | Enable debug logging | `False` | + +## Examples + +### Example 1: Documentation Scraping + +Scrape documentation and convert to markdown: + +```bash +python -m src.main scrape \ + --url https://docs.python.org/3/ \ + --format markdown \ + --output python_docs.md +``` + +### Example 2: Blog Archive + +Archive an entire blog: + +```bash +python -m src.main crawl \ + --url https://blog.example.com \ + --depth 2 \ + --format markdown \ + --exclude /tag/ --exclude /category/ \ + --output ./blog_archive/ +``` + +### Example 3: E-commerce Product Data + +Extract product information with LLM: + +```bash +python -m src.main crawl \ + --url https://shop.example.com/products \ + --depth 1 \ + --js \ + --llm-prompt "Extract: product name, price, description, and stock status" \ + --output products.json +``` + +### Example 4: Research Papers + +Scrape academic papers: + +```bash +python -m src.main scrape \ + --url https://arxiv.org/abs/2301.12345 \ + --format markdown \ + --output paper.md +``` + +## Programmatic Usage + +TermScrape can also be used as a Python library: + +```python +from src import scrape_url, crawl_site + +# Scrape single URL +content = scrape_url( + url="https://example.com", + output_format="markdown", + use_js=False +) +print(content) + +# Crawl website +results = crawl_site( + start_url="https://example.com", + max_depth=2, + output_format="json", + exclude_patterns=[".pdf", "/login"] +) + +for url, data in results.items(): + if "content" in data: + print(f"{url}: {len(data['content'])} chars") +``` + +### Advanced Usage + +```python +from src.scraper import Scraper +from src.parser import HTMLParser +from src.llm_extract import extract_with_llm + +# Custom scraper configuration +scraper = Scraper( + user_agent="MyBot/1.0", + timeout=60, + respect_robots=True +) + +# Scrape and parse +html = 
scraper._fetch_static("https://example.com") +parser = HTMLParser(html, url="https://example.com") + +# Get structured data +markdown = parser.to_markdown() +json_data = parser.to_json(structured=True) + +# LLM extraction +extracted = extract_with_llm( + content=markdown, + prompt="Extract all headings and their summaries", + model="llama3" +) +``` + +## Project Structure + +``` +termscrape/ +├── src/ +│ ├── __init__.py # Package initialization +│ ├── main.py # CLI entry point +│ ├── scraper.py # Single URL scraping +│ ├── crawler.py # Recursive crawling +│ ├── parser.py # HTML parsing and formatting +│ ├── browser.py # Playwright browser automation +│ ├── llm_extract.py # LLM-based extraction +│ └── utils.py # Utility functions +├── tests/ +│ ├── test_scraper.py # Scraper tests +│ └── test_crawler.py # Crawler tests +├── docs/ +│ ├── usage.md # Detailed usage guide +│ └── contributing.md # Contribution guidelines +├── examples/ +│ └── simple_scrape.py # Example scripts +├── requirements.txt # Python dependencies +├── pyproject.toml # Project configuration +└── README.md # This file +``` + +## Ethical Guidelines + +TermScrape is designed for ethical web scraping: + +- ✅ **Respects robots.txt**: Automatically checks and honors robots.txt +- ✅ **Rate limiting**: Random delays (1-5s) between requests +- ✅ **Proper identification**: Clear User-Agent string +- ✅ **No aggressive crawling**: Configurable depth limits + +**Please scrape responsibly:** + +- Don't overwhelm servers with requests +- Respect website terms of service +- Don't scrape personal data without permission +- Use scraped data ethically and legally + +## Technical Details + +### Dependencies + +- **click**: CLI interface +- **requests**: HTTP requests +- **beautifulsoup4**: HTML parsing +- **markdownify**: HTML to Markdown conversion +- **playwright**: JavaScript rendering +- **ollama** (optional): LLM integration + +### Python Version + +Requires Python 3.10 or higher. 
+ +### Performance + +- Static pages: < 1 second per page +- JavaScript pages: 2-5 seconds per page (browser overhead) +- Crawling: Depends on depth and site structure + +### Limitations + +- No built-in proxy rotation (can be added) +- Context window limits for LLM extraction +- Playwright requires browser installation (~200MB) + +## Testing + +Run tests with pytest: + +```bash +# Run all tests +pytest + +# With coverage +pytest --cov=src --cov-report=term-missing + +# Specific test file +pytest tests/test_scraper.py +``` + +## Development + +### Setting Up Dev Environment + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate + +# Install dev dependencies +pip install -r requirements.txt +pip install pytest pytest-cov black flake8 mypy + +# Format code +black src/ tests/ + +# Lint code +flake8 src/ tests/ + +# Type check +mypy src/ +``` + +See [docs/contributing.md](docs/contributing.md) for detailed contribution guidelines. + +## Troubleshooting + +### Playwright Issues + +If Playwright fails to install: + +```bash +playwright install chromium +``` + +### Ollama Connection + +Ensure Ollama is running: + +```bash +ollama serve +``` + +### Import Errors + +Make sure you're in the project directory: + +```bash +cd termscrape +python -m src.main --help +``` + +## Comparison with Alternatives + +| Feature | TermScrape | Crawl4AI | Scrapy | BeautifulSoup | +|---------|-----------|----------|---------|---------------| +| CLI Interface | ✅ | ✅ | ✅ | ❌ | +| JavaScript Support | ✅ | ✅ | ⚠️ (plugin) | ❌ | +| LLM Integration | ✅ | ✅ | ❌ | ❌ | +| Crawling | ✅ | ✅ | ✅ | ❌ | +| Markdown Output | ✅ | ✅ | ⚠️ (custom) | ❌ | +| Learning Curve | Easy | Medium | Steep | Easy | +| Local/Offline | ✅ | ✅ | ✅ | ✅ | + +## Roadmap + +- [ ] PDF extraction support +- [ ] Proxy rotation +- [ ] API mode (REST server) +- [ ] Distributed crawling +- [ ] Browser fingerprint randomization +- [ ] Screenshot capture +- [ ] CAPTCHA detection + +## License + +MIT 
License - see [LICENSE](LICENSE) file for details. + +## Contributing + +Contributions are welcome! Please read [docs/contributing.md](docs/contributing.md) before submitting PRs. + +## Acknowledgments + +Inspired by: +- [Firecrawl](https://github.com/mendableai/firecrawl) +- [Crawl4AI](https://github.com/unclecode/crawl4ai) +- [ScrapeGraphAI](https://github.com/ScrapeGraphAI/Scrapegraph-ai) + +## Support + +- **Issues**: [GitHub Issues](https://github.com/yourusername/termscrape/issues) +- **Discussions**: [GitHub Discussions](https://github.com/yourusername/termscrape/discussions) +- **Email**: contact@termscrape.dev + +## Authors + +Created by the TermScrape Team + +--- + +**Star this repo if you find it useful!** ⭐ diff --git a/termscrape/docs/contributing.md b/termscrape/docs/contributing.md new file mode 100644 index 0000000..3017f28 --- /dev/null +++ b/termscrape/docs/contributing.md @@ -0,0 +1,408 @@ +# Contributing to TermScrape + +Thank you for your interest in contributing to TermScrape! This document provides guidelines and instructions for contributing. + +## Code of Conduct + +- Be respectful and inclusive +- Focus on constructive feedback +- Help others learn and grow +- Follow ethical scraping practices + +## Getting Started + +### 1. Fork and Clone + +```bash +git clone https://github.com/yourusername/termscrape +cd termscrape +``` + +### 2. Set Up Development Environment + +```bash +# Create virtual environment +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright +playwright install + +# Install development dependencies +pip install pytest pytest-cov black flake8 mypy +``` + +### 3. 
Create a Branch + +```bash +git checkout -b feature/your-feature-name +``` + +## Development Guidelines + +### Code Style + +TermScrape follows PEP8 with these specifications: + +- **Line length**: 88 characters (Black default) +- **Formatter**: Black +- **Linter**: Flake8 +- **Type hints**: Use type hints for all functions + +```bash +# Format code +black src/ tests/ + +# Check linting +flake8 src/ tests/ + +# Type checking +mypy src/ +``` + +### Project Structure + +``` +termscrape/ +├── src/ # Source code +│ ├── main.py # CLI entry point +│ ├── scraper.py # Single URL scraping +│ ├── crawler.py # Multi-page crawling +│ ├── parser.py # HTML parsing +│ ├── browser.py # Playwright integration +│ ├── llm_extract.py # LLM extraction +│ └── utils.py # Helper functions +├── tests/ # Test files +├── docs/ # Documentation +└── examples/ # Example scripts +``` + +### Coding Standards + +#### 1. Modularity + +Each module should handle one concern: + +```python +# Good +from src.scraper import scrape_url +from src.parser import parse_html + +# Bad - mixing concerns +from src.everything import do_everything +``` + +#### 2. Type Hints + +Use type hints for clarity: + +```python +from typing import Optional, List, Dict + +def scrape_url( + url: str, + output_format: str = "markdown", + use_js: bool = False +) -> str: + """Scrape a URL and return content.""" + pass +``` + +#### 3. Error Handling + +Handle errors gracefully with specific exceptions: + +```python +try: + response = requests.get(url, timeout=30) + response.raise_for_status() +except Timeout: + raise ScraperError(f"Request timeout for {url}") +except RequestException as e: + raise ScraperError(f"Request failed: {e}") +``` + +#### 4. Logging + +Use the logging module, not print statements: + +```python +import logging + +logger = logging.getLogger("termscrape") + +logger.info("Starting scrape...") +logger.debug(f"Fetched {len(content)} bytes") +logger.error(f"Failed to scrape: {e}") +``` + +#### 5. 
Documentation + +Document all public functions and classes: + +```python +def scrape_url(url: str, output_format: str = "markdown") -> str: + """ + Scrape a single URL. + + Args: + url: URL to scrape + output_format: Output format ('markdown', 'json', 'text') + + Returns: + Scraped content in specified format + + Raises: + ScraperError: If scraping fails + """ + pass +``` + +## Testing + +### Writing Tests + +All new features must include tests. Use pytest: + +```python +# tests/test_myfeature.py +import pytest +from src.mymodule import my_function + +def test_my_function(): + """Test that my_function works correctly.""" + result = my_function("input") + assert result == "expected" + +def test_my_function_error(): + """Test error handling.""" + with pytest.raises(ValueError): + my_function(None) +``` + +### Running Tests + +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov=src --cov-report=term-missing + +# Run specific test file +pytest tests/test_scraper.py + +# Run specific test +pytest tests/test_scraper.py::test_scrape_url +``` + +### Test Coverage + +Aim for 80%+ code coverage. Check with: + +```bash +pytest --cov=src --cov-report=html +# Open htmlcov/index.html in browser +``` + +## Ethical Scraping Guidelines + +All contributions must respect ethical scraping practices: + +### 1. Robots.txt + +Always check robots.txt before scraping: + +```python +from src.utils import check_robots_txt + +if not check_robots_txt(url): + raise ScraperError("Disallowed by robots.txt") +``` + +### 2. Rate Limiting + +Add delays between requests: + +```python +from src.utils import random_delay + +random_delay(min_seconds=1, max_seconds=5) +``` + +### 3. User-Agent + +Always identify your scraper: + +```python +headers = {"User-Agent": "TermScrape/1.0 (+your.email)"} +``` + +### 4. 
Respect Terms of Service + +- Don't scrape login-protected content +- Don't bypass paywalls +- Don't scrape personal data without consent +- Don't use scraped data for illegal purposes + +## Pull Request Process + +### 1. Before Submitting + +- [ ] Code follows style guidelines (run Black) +- [ ] All tests pass (run pytest) +- [ ] New features have tests +- [ ] Documentation is updated +- [ ] Commit messages are clear + +### 2. Commit Messages + +Use clear, descriptive commit messages: + +```bash +# Good +git commit -m "Add support for custom timeout in scraper" +git commit -m "Fix robots.txt parsing for URLs with ports" + +# Bad +git commit -m "Fix bug" +git commit -m "Update code" +``` + +### 3. Submit Pull Request + +1. Push your branch to your fork +2. Open a pull request on GitHub +3. Describe your changes clearly +4. Link any related issues +5. Wait for review + +### 4. PR Template + +```markdown +## Description +Brief description of changes + +## Type of Change +- [ ] Bug fix +- [ ] New feature +- [ ] Breaking change +- [ ] Documentation update + +## Testing +How was this tested? + +## Checklist +- [ ] Tests pass +- [ ] Code formatted with Black +- [ ] Documentation updated +``` + +## Feature Requests + +Have an idea? Open an issue: + +1. Check if it already exists +2. Describe the feature clearly +3. Explain the use case +4. Provide examples if possible + +## Bug Reports + +Found a bug? Report it: + +1. Check if it's already reported +2. Provide clear steps to reproduce +3. Include error messages +4. Specify your environment (OS, Python version) + +### Bug Report Template + +```markdown +## Bug Description +Clear description of the bug + +## Steps to Reproduce +1. Run command... +2. See error... 
+ +## Expected Behavior +What should happen + +## Actual Behavior +What actually happens + +## Environment +- OS: +- Python version: +- TermScrape version: + +## Error Output +~~~ +Paste error messages here +~~~ +``` + +## Development Workflow + +### Adding a New Feature + +1. Create an issue to discuss the feature +2. Get approval from maintainers +3. Create a branch: `feature/feature-name` +4. Implement the feature with tests +5. Update documentation +6. Submit pull request + +### Fixing a Bug + +1. Create an issue (if not exists) +2. Create a branch: `fix/bug-description` +3. Write a failing test that reproduces the bug +4. Fix the bug +5. Ensure test passes +6. Submit pull request + +## Code Review + +### As a Reviewer + +- Be constructive and respectful +- Test the changes locally +- Check code quality and tests +- Suggest improvements clearly + +### As an Author + +- Respond to feedback promptly +- Don't take criticism personally +- Make requested changes +- Thank reviewers for their time + +## Documentation + +Update documentation when: + +- Adding new features +- Changing CLI commands +- Updating dependencies +- Fixing bugs that affect usage + +## Questions? + +- Open an issue for questions +- Check existing documentation +- Review closed issues for similar problems + +## Recognition + +Contributors will be: + +- Listed in CONTRIBUTORS.md +- Mentioned in release notes +- Appreciated in the community! + +Thank you for contributing to TermScrape! diff --git a/termscrape/docs/usage.md b/termscrape/docs/usage.md new file mode 100644 index 0000000..07632c6 --- /dev/null +++ b/termscrape/docs/usage.md @@ -0,0 +1,289 @@ +# TermScrape Usage Guide + +## Installation + +```bash +# Clone the repository +git clone https://github.com/yourusername/termscrape +cd termscrape + +# Install dependencies +pip install -r requirements.txt + +# Install Playwright browsers +playwright install +``` + +## CLI Commands + +TermScrape provides two main commands: `scrape` and `crawl`. 
+ +### Scrape Command + +Scrape a single URL and output its content. + +```bash +python -m src.main scrape --url URL [OPTIONS] +``` + +**Options:** + +- `--url` (required): URL to scrape +- `--format`: Output format - `markdown` (default), `json`, or `text` +- `--js`: Enable JavaScript rendering using headless browser +- `--output`, `-o`: Save to file instead of stdout +- `--user-agent`: Custom User-Agent string (default: "TermScrape/1.0") +- `--llm-prompt`: Extract data using LLM (requires Ollama) +- `--llm-model`: LLM model to use (default: "llama3") +- `--verbose`: Enable debug logging + +**Examples:** + +```bash +# Basic scrape to markdown +python -m src.main scrape --url https://example.com + +# Scrape with JavaScript rendering +python -m src.main scrape --url https://example.com --js + +# Scrape to JSON format +python -m src.main scrape --url https://example.com --format json + +# Save to file +python -m src.main scrape --url https://example.com --output page.md + +# Use LLM to extract specific data +python -m src.main scrape --url https://example.com \ + --llm-prompt "Extract all product titles and prices" +``` + +### Crawl Command + +Recursively crawl a website starting from a URL. 
+ +```bash +python -m src.main crawl --url URL [OPTIONS] +``` + +**Options:** + +- `--url` (required): Starting URL +- `--depth`: Maximum crawl depth (default: 1) +- `--format`: Output format for each page +- `--js`: Enable JavaScript rendering +- `--output`, `-o`: Output directory or JSON file +- `--user-agent`: Custom User-Agent string +- `--exclude`: URL patterns to exclude (can use multiple times) +- `--llm-prompt`: Extract data from each page using LLM +- `--llm-model`: LLM model to use +- `--verbose`: Enable debug logging + +**Examples:** + +```bash +# Crawl with depth 2 +python -m src.main crawl --url https://example.com --depth 2 + +# Crawl and save each page to directory +python -m src.main crawl --url https://example.com --output ./scraped/ + +# Crawl with exclusions +python -m src.main crawl --url https://example.com \ + --exclude .pdf --exclude /login --exclude /admin + +# Crawl with LLM extraction +python -m src.main crawl --url https://example.com \ + --llm-prompt "Extract main headings and key points" \ + --output results.json +``` + +## Global Options + +These options work with any command: + +- `--verbose`: Enable detailed debug logging +- `--help`: Show help message + +## Output Formats + +### Markdown + +Clean, readable markdown format. Best for documentation and human reading. + +```bash +python -m src.main scrape --url https://example.com --format markdown +``` + +### JSON + +Structured JSON with metadata (title, headings, links, images, etc.). + +```bash +python -m src.main scrape --url https://example.com --format json +``` + +### Text + +Plain text extraction, removing all HTML tags. + +```bash +python -m src.main scrape --url https://example.com --format text +``` + +## JavaScript Rendering + +For dynamic websites that require JavaScript: + +```bash +python -m src.main scrape --url https://example.com --js +``` + +This uses Playwright's headless browser to render the page before scraping. 
+ +## LLM Extraction + +TermScrape can use local LLMs via Ollama for intelligent data extraction. + +**Prerequisites:** + +1. Install Ollama: https://ollama.ai +2. Pull a model: `ollama pull llama3` +3. Install Python package: `pip install ollama` + +**Usage:** + +```bash +# Extract specific information +python -m src.main scrape --url https://news.example.com \ + --llm-prompt "Extract article title, author, and publication date" + +# Extract from crawled pages +python -m src.main crawl --url https://products.example.com \ + --depth 2 \ + --llm-prompt "Extract product name, price, and description" \ + --output products.json +``` + +## Ethical Scraping + +TermScrape includes built-in ethical scraping features: + +- **Robots.txt respect**: Automatically checks and respects robots.txt +- **Rate limiting**: Random delays (1-5 seconds) between requests +- **User-Agent**: Proper identification in requests + +To disable robots.txt checking (not recommended): + +```python +# In Python code +from src.scraper import Scraper +scraper = Scraper(respect_robots=False) +``` + +## Error Handling + +TermScrape handles common errors gracefully: + +- Network timeouts +- Invalid URLs +- Missing pages (404) +- Malformed HTML +- Robots.txt restrictions + +Errors are logged to stderr. Use `--verbose` for detailed error information. + +## Performance Tips + +1. **Use static requests when possible**: Only use `--js` for JavaScript-heavy sites +2. **Limit crawl depth**: Higher depths exponentially increase scraping time +3. **Use exclude patterns**: Skip unnecessary pages (PDFs, logins, etc.) +4. 
**Batch operations**: For multiple URLs, use the crawl command + +## Examples + +### Example 1: Blog Archive + +Scrape a blog and extract all articles: + +```bash +python -m src.main crawl \ + --url https://blog.example.com \ + --depth 2 \ + --exclude /tag/ --exclude /category/ \ + --format markdown \ + --output ./blog_archive/ +``` + +### Example 2: Product Catalog + +Extract product information with LLM: + +```bash +python -m src.main crawl \ + --url https://shop.example.com/products \ + --depth 1 \ + --js \ + --llm-prompt "Extract product name, price, description, and availability" \ + --output products.json +``` + +### Example 3: Research Paper + +Scrape a research paper with JavaScript: + +```bash +python -m src.main scrape \ + --url https://papers.example.com/paper/12345 \ + --js \ + --format markdown \ + --output paper.md +``` + +## Programmatic Usage + +You can also use TermScrape as a Python library: + +```python +from src import scrape_url, crawl_site + +# Scrape single URL +content = scrape_url( + url="https://example.com", + output_format="markdown", + use_js=False +) + +# Crawl website +results = crawl_site( + start_url="https://example.com", + max_depth=2, + output_format="json" +) +``` + +## Troubleshooting + +### Playwright Installation Issues + +If you get Playwright errors: + +```bash +playwright install chromium +``` + +### Ollama Connection Issues + +Make sure Ollama is running: + +```bash +ollama serve +``` + +### Permission Errors + +Some sites may block scrapers. Try using a custom user agent: + +```bash +python -m src.main scrape --url https://example.com \ + --user-agent "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" +``` diff --git a/termscrape/examples/simple_scrape.py b/termscrape/examples/simple_scrape.py new file mode 100755 index 0000000..fc11249 --- /dev/null +++ b/termscrape/examples/simple_scrape.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +""" +Simple scraping examples using TermScrape. 
+ +This file demonstrates how to use TermScrape programmatically. +""" + +import sys +import os + +# Add parent directory to path to import src modules +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) + +from src import scrape_url, crawl_site +from src.scraper import Scraper +from src.parser import HTMLParser +from src.utils import setup_logging + + +def example_1_simple_scrape(): + """Example 1: Simple URL scraping to markdown.""" + print("=" * 60) + print("Example 1: Simple Scrape") + print("=" * 60) + + url = "https://example.com" + content = scrape_url(url, output_format="markdown") + + print(f"\nScraped {url}:\n") + print(content[:500]) # Print first 500 chars + print("\n...") + + +def example_2_json_output(): + """Example 2: Scraping with JSON output.""" + print("\n" + "=" * 60) + print("Example 2: JSON Output") + print("=" * 60) + + url = "https://example.com" + content = scrape_url(url, output_format="json") + + print(f"\nScraped {url} as JSON:\n") + print(content[:500]) + print("\n...") + + +def example_3_custom_scraper(): + """Example 3: Using custom scraper configuration.""" + print("\n" + "=" * 60) + print("Example 3: Custom Scraper") + print("=" * 60) + + # Create custom scraper + scraper = Scraper( + user_agent="MyCustomBot/1.0", + timeout=60, + respect_robots=True, + use_delay=False, # Disable delay for example + ) + + url = "https://example.com" + content = scraper.scrape(url, output_format="text") + + print(f"\nScraped with custom settings:\n") + print(content[:300]) + print("\n...") + + +def example_4_crawl_site(): + """Example 4: Crawling a website.""" + print("\n" + "=" * 60) + print("Example 4: Site Crawling") + print("=" * 60) + + url = "https://example.com" + results = crawl_site( + start_url=url, + max_depth=1, # Only crawl 1 level deep + output_format="markdown", + ) + + print(f"\nCrawled {len(results)} pages from {url}:\n") + for page_url, data in results.items(): + if "content" in data: + print(f"- 
{page_url} ({len(data['content'])} chars)") + else: + print(f"- {page_url} (error: {data.get('error', 'unknown')})") + + +def example_5_html_parser(): + """Example 5: Using HTMLParser directly.""" + print("\n" + "=" * 60) + print("Example 5: Direct HTML Parsing") + print("=" * 60) + + # Sample HTML + html = """ + + Test Page + +

Welcome to Test Page

+

This is a paragraph with some bold text.

+ + Link + + + """ + + parser = HTMLParser(html, url="https://example.com/test") + + # Get markdown + markdown = parser.to_markdown() + print("\nMarkdown output:") + print(markdown) + + # Get structured JSON + json_output = parser.to_json(structured=True) + print("\nStructured JSON:") + print(json_output[:400]) + print("\n...") + + +def example_6_with_excludes(): + """Example 6: Crawling with URL exclusions.""" + print("\n" + "=" * 60) + print("Example 6: Crawling with Exclusions") + print("=" * 60) + + url = "https://example.com" + results = crawl_site( + start_url=url, + max_depth=1, + output_format="markdown", + exclude_patterns=[".pdf", ".zip", "/login", "/admin"], + ) + + print(f"\nCrawled with exclusions: {len(results)} pages") + for page_url in list(results.keys())[:5]: # Show first 5 + print(f"- {page_url}") + + +def example_7_save_to_file(): + """Example 7: Scraping and saving to file.""" + print("\n" + "=" * 60) + print("Example 7: Save to File") + print("=" * 60) + + url = "https://example.com" + content = scrape_url(url, output_format="markdown") + + # Save to file + output_file = "example_output.md" + with open(output_file, "w", encoding="utf-8") as f: + f.write(content) + + print(f"\nSaved content to {output_file}") + print(f"File size: {len(content)} bytes") + + +def main(): + """Run all examples.""" + # Configure logging (verbose output disabled) + setup_logging(verbose=False) + + print("\n" + "=" * 60) + print("TermScrape Examples") + print("=" * 60) + print("\nThese examples demonstrate basic TermScrape usage.") + print("Some examples use https://example.com for safety.\n") + + try: + # Run examples + example_1_simple_scrape() + example_2_json_output() + example_3_custom_scraper() + example_4_crawl_site() + example_5_html_parser() + example_6_with_excludes() + example_7_save_to_file() + + print("\n" + "=" * 60) + print("All examples completed!") + print("=" * 60) + + except Exception as e: + print(f"\nError running examples: {e}") + print("Make sure you have internet 
connection and required dependencies.") + + +if __name__ == "__main__": + main() diff --git a/termscrape/pyproject.toml b/termscrape/pyproject.toml new file mode 100644 index 0000000..10c3bb8 --- /dev/null +++ b/termscrape/pyproject.toml @@ -0,0 +1,74 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "termscrape" +version = "1.0.0" +description = "Terminal-based web scraper with LLM support" +readme = "README.md" +requires-python = ">=3.10" +license = {text = "MIT"} +authors = [ + {name = "TermScrape Team", email = "contact@termscrape.dev"} +] +keywords = ["scraper", "crawler", "web-scraping", "llm", "cli"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", +] + +dependencies = [ + "click>=8.1.0", + "requests>=2.31.0", + "beautifulsoup4>=4.12.0", + "markdownify>=0.11.6", + "playwright>=1.40.0", +] + +[project.optional-dependencies] +llm = ["ollama>=0.1.0"] +dev = [ + "pytest>=7.4.0", + "pytest-cov>=4.1.0", + "pytest-asyncio>=0.21.0", + "black>=23.0.0", + "flake8>=6.0.0", + "mypy>=1.5.0", +] + +[project.urls] +Homepage = "https://github.com/yourusername/termscrape" +Repository = "https://github.com/yourusername/termscrape" +Issues = "https://github.com/yourusername/termscrape/issues" + +[project.scripts] +termscrape = "src.main:cli" + +[tool.setuptools] +packages = ["src"] + +[tool.black] +line-length = 88 +target-version = ["py310", "py311", "py312"] + +[tool.mypy] +python_version = "3.10" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +ignore_missing_imports = true + +[tool.pytest.ini_options] +testpaths = 
["tests"] +python_files = "test_*.py" +python_classes = "Test*" +python_functions = "test_*" +addopts = "-v --cov=src --cov-report=term-missing" diff --git a/termscrape/requirements.txt b/termscrape/requirements.txt new file mode 100644 index 0000000..96870b5 --- /dev/null +++ b/termscrape/requirements.txt @@ -0,0 +1,19 @@ +# Core dependencies +click>=8.1.0 +requests>=2.31.0 +beautifulsoup4>=4.12.0 +markdownify>=0.11.6 +playwright>=1.40.0 + +# Optional dependencies +ollama>=0.1.0 # For LLM extraction + +# Testing dependencies +pytest>=7.4.0 +pytest-cov>=4.1.0 +pytest-asyncio>=0.21.0 + +# Development dependencies +black>=23.0.0 +flake8>=6.0.0 +mypy>=1.5.0 diff --git a/termscrape/src/__init__.py b/termscrape/src/__init__.py new file mode 100644 index 0000000..9421244 --- /dev/null +++ b/termscrape/src/__init__.py @@ -0,0 +1,20 @@ +"""TermScrape - Terminal-based web scraper with LLM support.""" + +__version__ = "1.0.0" +__author__ = "TermScrape Team" + +from .scraper import scrape_url, Scraper +from .crawler import crawl_site, Crawler +from .parser import parse_html, HTMLParser +from .utils import setup_logging, check_robots_txt + +__all__ = [ + "scrape_url", + "Scraper", + "crawl_site", + "Crawler", + "parse_html", + "HTMLParser", + "setup_logging", + "check_robots_txt", +] diff --git a/termscrape/src/browser.py b/termscrape/src/browser.py new file mode 100644 index 0000000..ef29b51 --- /dev/null +++ b/termscrape/src/browser.py @@ -0,0 +1,167 @@ +"""Headless browser automation using Playwright for JS-heavy sites.""" + +import asyncio +import logging +from typing import Optional + +from playwright.async_api import async_playwright, Browser, Page, TimeoutError + + +logger = logging.getLogger("termscrape") + + +class HeadlessBrowser: + """Async headless browser for rendering JavaScript content.""" + + def __init__(self, user_agent: Optional[str] = None): + """ + Initialize headless browser. 
async def close(self) -> None:
    """Close the browser and stop Playwright, releasing both handles.

    Safe to call repeatedly and before ``start()``. Both attributes are
    reset to ``None`` so that the ``if not self.browser`` guards in the
    fetch methods launch a fresh browser instead of reusing a closed one.
    """
    # Detach the handles first so the instance is never left pointing at a
    # half-closed browser, even if one of the awaits below raises.
    browser, self.browser = self.browser, None
    playwright, self.playwright = self.playwright, None

    try:
        if browser:
            await browser.close()
    finally:
        # Stop the driver started by async_playwright().start() even when
        # closing the browser fails; otherwise it is never stopped.
        if playwright:
            await playwright.stop()

    logger.debug("Headless browser closed")
async def fetch_with_wait(
    self, url: str, selector: Optional[str] = None, timeout: int = 30000
) -> str:
    """
    Load a page and optionally block until an element shows up.

    The browser is started on demand. Navigation waits for
    ``domcontentloaded``; when *selector* is given, the call additionally
    waits for that selector before the HTML is captured. The page is always
    closed afterwards.

    Args:
        url: URL to fetch
        selector: CSS selector to wait for (skipped when empty or None)
        timeout: Limit in milliseconds, applied to the navigation and to
            the selector wait separately

    Returns:
        Rendered HTML content
    """
    if not self.browser:
        await self.start()

    tab = await self.browser.new_page(user_agent=self.user_agent)
    try:
        await tab.goto(url, wait_until="domcontentloaded", timeout=timeout)

        if selector:
            await tab.wait_for_selector(selector, timeout=timeout)

        return await tab.content()
    finally:
        await tab.close()
+ + Args: + url: URL to fetch + user_agent: Custom user agent + timeout: Timeout in milliseconds + + Returns: + Rendered HTML content + """ + return asyncio.run(fetch_with_js(url, user_agent, timeout)) diff --git a/termscrape/src/crawler.py b/termscrape/src/crawler.py new file mode 100644 index 0000000..b9bf0d9 --- /dev/null +++ b/termscrape/src/crawler.py @@ -0,0 +1,221 @@ +"""Recursive web crawling functionality.""" + +import logging +from collections import deque +from typing import Dict, List, Optional, Set +from urllib.parse import urljoin, urlparse + +from bs4 import BeautifulSoup + +from .scraper import Scraper, ScraperError +from .utils import normalize_url + + +logger = logging.getLogger("termscrape") + + +class Crawler: + """Crawl a website recursively with depth control.""" + + def __init__( + self, + user_agent: str = "TermScrape/1.0", + max_depth: int = 1, + respect_robots: bool = True, + same_domain_only: bool = True, + exclude_patterns: Optional[List[str]] = None, + ): + """ + Initialize crawler. + + Args: + user_agent: User agent string + max_depth: Maximum crawl depth (0 = only start URL) + respect_robots: Check robots.txt + same_domain_only: Only crawl URLs from same domain + exclude_patterns: URL patterns to exclude (e.g., ['.pdf', '/login']) + """ + self.user_agent = user_agent + self.max_depth = max_depth + self.respect_robots = respect_robots + self.same_domain_only = same_domain_only + self.exclude_patterns = exclude_patterns or [] + + self.scraper = Scraper( + user_agent=user_agent, + respect_robots=respect_robots, + ) + + self.visited: Set[str] = set() + self.results: Dict[str, Dict] = {} + + def crawl( + self, + start_url: str, + output_format: str = "markdown", + use_js: bool = False, + ) -> Dict[str, Dict]: + """ + Crawl a website starting from a URL. 
+ + Args: + start_url: Starting URL + output_format: Output format for each page + use_js: Use JavaScript rendering + + Returns: + Dictionary mapping URLs to their scraped content and metadata + """ + logger.info( + f"Starting crawl from {start_url} (max depth: {self.max_depth})" + ) + + # Queue: (url, depth) + queue: deque = deque([(start_url, 0)]) + base_domain = urlparse(start_url).netloc + + while queue: + current_url, depth = queue.popleft() + + # Skip if already visited + if current_url in self.visited: + continue + + # Skip if depth exceeded + if depth > self.max_depth: + continue + + # Skip if matches exclude patterns + if self._should_exclude(current_url): + logger.debug(f"Excluding {current_url} (matches exclude pattern)") + continue + + # Skip if different domain and same_domain_only is True + if self.same_domain_only: + if urlparse(current_url).netloc != base_domain: + logger.debug( + f"Skipping {current_url} (different domain)" + ) + continue + + # Mark as visited + self.visited.add(current_url) + + # Scrape the page + try: + logger.info(f"Crawling {current_url} (depth: {depth})") + content = self.scraper.scrape( + current_url, output_format, use_js + ) + + self.results[current_url] = { + "content": content, + "depth": depth, + "format": output_format, + } + + # Extract links if not at max depth + if depth < self.max_depth: + links = self._extract_links(current_url, content, use_js) + for link in links: + normalized_link = normalize_url(link, current_url) + if normalized_link not in self.visited: + queue.append((normalized_link, depth + 1)) + + except ScraperError as e: + logger.warning(f"Failed to crawl {current_url}: {e}") + self.results[current_url] = { + "error": str(e), + "depth": depth, + } + + logger.info( + f"Crawl complete. 
Visited {len(self.visited)} pages, " + f"successfully scraped {len([r for r in self.results.values() if 'content' in r])} pages" + ) + + return self.results + + def _extract_links( + self, base_url: str, content: str, use_js: bool + ) -> List[str]: + """ + Extract links from scraped content. + + Args: + base_url: Base URL for resolving relative links + content: Scraped content (markdown or JSON format) + use_js: Whether JS was used (affects parsing) + + Returns: + List of absolute URLs + """ + links: List[str] = [] + + # Re-fetch HTML to extract links (content might be markdown/json) + try: + if use_js: + from .browser import fetch_with_js_sync + + html = fetch_with_js_sync(base_url, self.user_agent) + else: + html = self.scraper._fetch_static(base_url) + + soup = BeautifulSoup(html, "html.parser") + + for link_tag in soup.find_all("a", href=True): + href = link_tag.get("href") + if href: + # Convert relative to absolute + absolute_url = urljoin(base_url, href) + links.append(absolute_url) + + except Exception as e: + logger.warning(f"Failed to extract links from {base_url}: {e}") + + return links + + def _should_exclude(self, url: str) -> bool: + """ + Check if URL matches any exclude pattern. + + Args: + url: URL to check + + Returns: + True if URL should be excluded + """ + for pattern in self.exclude_patterns: + if pattern in url: + return True + return False + + +def crawl_site( + start_url: str, + max_depth: int = 1, + output_format: str = "markdown", + use_js: bool = False, + user_agent: str = "TermScrape/1.0", + exclude_patterns: Optional[List[str]] = None, +) -> Dict[str, Dict]: + """ + Convenience function to crawl a website. 
+ + Args: + start_url: Starting URL + max_depth: Maximum crawl depth + output_format: Output format + use_js: Use JavaScript rendering + user_agent: Custom user agent + exclude_patterns: URL patterns to exclude + + Returns: + Dictionary of crawled results + """ + crawler = Crawler( + user_agent=user_agent, + max_depth=max_depth, + exclude_patterns=exclude_patterns, + ) + return crawler.crawl(start_url, output_format, use_js) diff --git a/termscrape/src/llm_extract.py b/termscrape/src/llm_extract.py new file mode 100644 index 0000000..b37e1d8 --- /dev/null +++ b/termscrape/src/llm_extract.py @@ -0,0 +1,221 @@ +"""LLM-based data extraction using Ollama.""" + +import json +import logging +from typing import Any, Dict, Optional + +try: + import ollama + + OLLAMA_AVAILABLE = True +except ImportError: + OLLAMA_AVAILABLE = False + logger = logging.getLogger("termscrape") + logger.warning( + "Ollama not installed. LLM extraction disabled. " + "Install with: pip install ollama" + ) + + +logger = logging.getLogger("termscrape") + + +class LLMExtractor: + """Extract structured data from content using local LLM.""" + + def __init__(self, model: str = "llama3"): + """ + Initialize LLM extractor. + + Args: + model: Ollama model name (e.g., 'llama3', 'mistral') + + Raises: + RuntimeError: If Ollama is not available + """ + if not OLLAMA_AVAILABLE: + raise RuntimeError( + "Ollama is not installed. Install with: pip install ollama" + ) + + self.model = model + self._verify_model() + + def _verify_model(self) -> None: + """Verify that the specified model is available.""" + try: + # List available models + models = ollama.list() + model_names = [m["name"] for m in models.get("models", [])] + + if not any(self.model in name for name in model_names): + logger.warning( + f"Model '{self.model}' not found locally. 
" + f"Available models: {model_names}" + ) + logger.info(f"Attempting to pull model '{self.model}'...") + # Model will be pulled on first use + except Exception as e: + logger.warning(f"Could not verify model availability: {e}") + + def extract(self, content: str, prompt: str) -> str: + """ + Extract information from content using LLM. + + Args: + content: Text content to extract from + prompt: Extraction prompt (e.g., "Extract all product titles") + + Returns: + LLM response as string + """ + full_prompt = self._build_prompt(content, prompt) + + try: + logger.info(f"Sending extraction request to {self.model}...") + response = ollama.generate(model=self.model, prompt=full_prompt) + + result = response.get("response", "") + logger.debug(f"LLM extraction complete ({len(result)} characters)") + + return result + + except Exception as e: + logger.error(f"LLM extraction failed: {e}") + raise RuntimeError(f"LLM extraction failed: {e}") + + def extract_structured( + self, content: str, prompt: str, format: str = "json" + ) -> Any: + """ + Extract structured data and parse as JSON. + + Args: + content: Text content to extract from + prompt: Extraction prompt + format: Expected output format ('json' or 'text') + + Returns: + Parsed JSON object or text string + """ + if format == "json": + enhanced_prompt = ( + f"{prompt}\n\n" + "Respond ONLY with valid JSON. No explanations." + ) + else: + enhanced_prompt = prompt + + result = self.extract(content, enhanced_prompt) + + if format == "json": + try: + return json.loads(result) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON from LLM: {e}") + # Try to extract JSON from markdown code blocks + result = self._extract_json_from_markdown(result) + return json.loads(result) + + return result + + def _build_prompt(self, content: str, user_prompt: str) -> str: + """ + Build the full prompt for the LLM. 
+ + Args: + content: Content to extract from + user_prompt: User's extraction instruction + + Returns: + Complete prompt string + """ + # Truncate content if too long (most models have context limits) + max_content_length = 4000 + if len(content) > max_content_length: + logger.warning( + f"Content truncated from {len(content)} to " + f"{max_content_length} characters" + ) + content = content[:max_content_length] + "..." + + prompt = f"""You are a data extraction assistant. Extract information from the following content based on the user's request. + +User Request: {user_prompt} + +Content: +{content} + +Provide the extracted information:""" + + return prompt + + def _extract_json_from_markdown(self, text: str) -> str: + """ + Extract JSON from markdown code blocks. + + Args: + text: Text potentially containing markdown code blocks + + Returns: + Extracted JSON string + """ + # Try to find JSON in ```json ``` blocks + import re + + json_pattern = r"```(?:json)?\s*(\{.*?\}|\[.*?\])\s*```" + matches = re.findall(json_pattern, text, re.DOTALL) + + if matches: + return matches[0] + + # If no code blocks, return original + return text + + +def extract_with_llm( + content: str, + prompt: str, + model: str = "llama3", + structured: bool = False, +) -> Any: + """ + Convenience function for LLM extraction. + + Args: + content: Content to extract from + prompt: Extraction prompt + model: Ollama model name + structured: Return structured JSON if True + + Returns: + Extracted data (string or dict) + """ + if not OLLAMA_AVAILABLE: + raise RuntimeError( + "Ollama not available. Install with: pip install ollama" + ) + + extractor = LLMExtractor(model=model) + + if structured: + return extractor.extract_structured(content, prompt, format="json") + else: + return extractor.extract(content, prompt) + + +def is_ollama_available() -> bool: + """ + Check if Ollama is available. 
+ + Returns: + True if Ollama is installed and accessible + """ + if not OLLAMA_AVAILABLE: + return False + + try: + ollama.list() + return True + except Exception: + return False diff --git a/termscrape/src/main.py b/termscrape/src/main.py new file mode 100644 index 0000000..2f0926e --- /dev/null +++ b/termscrape/src/main.py @@ -0,0 +1,253 @@ +"""TermScrape CLI - Terminal-based web scraper.""" + +import json +import sys +from pathlib import Path +from typing import Optional + +import click + +from .crawler import crawl_site +from .llm_extract import extract_with_llm, is_ollama_available +from .scraper import scrape_url +from .utils import sanitize_filename, setup_logging + + +@click.group() +@click.option( + "--verbose", is_flag=True, help="Enable verbose logging (DEBUG level)" +) +@click.pass_context +def cli(ctx, verbose: bool): + """TermScrape: Terminal-based web scraper with LLM support.""" + ctx.ensure_object(dict) + ctx.obj["verbose"] = verbose + setup_logging(verbose) + + +@cli.command() +@click.option("--url", required=True, help="URL to scrape") +@click.option( + "--format", + type=click.Choice(["markdown", "json", "text"], case_sensitive=False), + default="markdown", + help="Output format", +) +@click.option("--js", is_flag=True, help="Enable JavaScript rendering") +@click.option( + "--output", "-o", type=click.Path(), help="Output file (default: stdout)" +) +@click.option( + "--user-agent", + default="TermScrape/1.0", + help="Custom User-Agent string", +) +@click.option( + "--llm-prompt", + help="Optional LLM extraction prompt (requires Ollama)", +) +@click.option( + "--llm-model", + default="llama3", + help="LLM model to use for extraction (default: llama3)", +) +@click.pass_context +def scrape( + ctx, + url: str, + format: str, + js: bool, + output: Optional[str], + user_agent: str, + llm_prompt: Optional[str], + llm_model: str, +): + """Scrape a single URL and output content.""" + try: + # Scrape the URL + click.echo(f"Scraping {url}...", err=True) + 
# CLI command: crawl a site breadth-first via crawl_site() and emit the
# results. Output modes:
#   * no --output          -> full results dict printed to stdout as JSON
#   * --output with suffix -> full results dict written to that one JSON file
#   * --output directory   -> one file per successfully scraped page, plus a
#                             "<name>.llm.txt" sidecar when --llm-prompt
#                             produced an extraction for that page
# Pages whose scrape failed carry an "error" key instead of "content"; they
# are reported separately and never counted as saved or successful.
# The docstring below is the text click shows for --help, so it stays short.
@cli.command()
@click.option("--url", required=True, help="Starting URL to crawl")
@click.option(
    "--depth",
    type=int,
    default=1,
    help="Maximum crawl depth (default: 1)",
)
@click.option(
    "--format",
    type=click.Choice(["markdown", "json", "text"], case_sensitive=False),
    default="markdown",
    help="Output format for each page",
)
@click.option("--js", is_flag=True, help="Enable JavaScript rendering")
@click.option(
    "--output", "-o", type=click.Path(), help="Output directory or file"
)
@click.option(
    "--user-agent",
    default="TermScrape/1.0",
    help="Custom User-Agent string",
)
@click.option(
    "--exclude",
    multiple=True,
    help="URL patterns to exclude (can be used multiple times)",
)
@click.option(
    "--llm-prompt",
    help="Optional LLM extraction prompt (requires Ollama)",
)
@click.option(
    "--llm-model",
    default="llama3",
    help="LLM model to use for extraction (default: llama3)",
)
@click.pass_context
def crawl(
    ctx,
    url: str,
    depth: int,
    format: str,
    js: bool,
    output: Optional[str],
    user_agent: str,
    exclude: tuple,
    llm_prompt: Optional[str],
    llm_model: str,
):
    """Crawl a website recursively."""
    try:
        # Crawl the site
        click.echo(f"Crawling {url} (depth: {depth})...", err=True)
        results = crawl_site(
            start_url=url,
            max_depth=depth,
            output_format=format,
            use_js=js,
            user_agent=user_agent,
            exclude_patterns=list(exclude) if exclude else None,
        )

        # Only entries with "content" were actually scraped. The values are
        # the same dict objects as in `results`, so additions made below
        # are visible in the JSON output modes as well.
        scraped = {
            page_url: data
            for page_url, data in results.items()
            if "content" in data
        }
        failed_count = len(results) - len(scraped)

        # Apply LLM extraction if requested
        if llm_prompt:
            if not is_ollama_available():
                click.echo(
                    "Error: Ollama is not available. "
                    "Install with: pip install ollama",
                    err=True,
                )
                sys.exit(1)

            click.echo(f"Extracting with LLM ({llm_model})...", err=True)
            for data in scraped.values():
                data["llm_extraction"] = extract_with_llm(
                    content=data["content"],
                    prompt=llm_prompt,
                    model=llm_model,
                    structured=False,
                )

        # Output results
        if output:
            output_path = Path(output)

            # A path without a suffix (or an existing directory) means
            # "one file per page"; anything else is a single JSON file.
            if output_path.suffix == "" or output_path.is_dir():
                output_path.mkdir(parents=True, exist_ok=True)
                extension = {"markdown": ".md", "json": ".json"}.get(
                    format, ".txt"
                )

                for page_url, data in scraped.items():
                    stem = sanitize_filename(page_url)
                    (output_path / (stem + extension)).write_text(
                        data["content"], encoding="utf-8"
                    )
                    # Keep the LLM result next to the page instead of
                    # discarding it in directory mode.
                    if "llm_extraction" in data:
                        (output_path / (stem + ".llm.txt")).write_text(
                            str(data["llm_extraction"]), encoding="utf-8"
                        )

                click.echo(
                    f"Saved {len(scraped)} pages to {output_path}",
                    err=True,
                )

            else:
                # Save as single JSON file
                output_path.write_text(
                    json.dumps(results, indent=2, ensure_ascii=False),
                    encoding="utf-8",
                )
                click.echo(f"Saved to {output}", err=True)

        else:
            # Print to stdout as JSON
            click.echo(json.dumps(results, indent=2, ensure_ascii=False))

        summary = f"Crawled {len(scraped)} pages successfully"
        if failed_count:
            summary += f" ({failed_count} failed)"
        click.echo(f"{summary}.", err=True)

    except Exception as e:
        # Top-level CLI boundary: report and exit non-zero. sys.exit() above
        # raises SystemExit, which is not an Exception and passes through.
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
def to_markdown(self, strip_tags: Optional[List[str]] = None) -> str:
    """
    Convert HTML to markdown format.

    Works on a re-parsed copy, so ``self.soup`` is left untouched.

    Args:
        strip_tags: Tag names to remove (with their content) before
            conversion. ``None`` selects the defaults (script, style, nav,
            footer, header); an empty list removes nothing.

    Returns:
        Markdown formatted string with surrounding whitespace stripped.
        Images are always dropped from the output.
    """
    soup_copy = BeautifulSoup(str(self.soup), "html.parser")

    # Remove unwanted tags (scripts, styles, etc.). Compare against None
    # rather than using `or`, so an explicit [] is honoured as "strip
    # nothing" instead of silently falling back to the defaults.
    default_strip = ["script", "style", "nav", "footer", "header"]
    tags_to_strip = default_strip if strip_tags is None else strip_tags

    for tag in tags_to_strip:
        for element in soup_copy.find_all(tag):
            element.decompose()

    # Convert to markdown
    markdown_content = md(
        str(soup_copy),
        heading_style="ATX",
        bullets="-",
        strip=["img"],
    )

    return markdown_content.strip()
+ + Args: + structured: If True, extract structured data; else raw HTML dict + + Returns: + JSON formatted string + """ + if structured: + data = self._extract_structured_data() + else: + data = {"url": self.url, "html": self.html_content} + + return json.dumps(data, indent=2, ensure_ascii=False) + + def _extract_structured_data(self) -> Dict[str, Any]: + """ + Extract structured data from HTML. + + Returns: + Dictionary containing title, headings, paragraphs, links, etc. + """ + data: Dict[str, Any] = { + "url": self.url, + "title": self._get_title(), + "meta": self._get_meta_tags(), + "headings": self._get_headings(), + "paragraphs": self._get_paragraphs(), + "links": self._get_links(), + "images": self._get_images(), + } + + return data + + def _get_title(self) -> Optional[str]: + """Extract page title.""" + title_tag = self.soup.find("title") + return title_tag.get_text(strip=True) if title_tag else None + + def _get_meta_tags(self) -> Dict[str, str]: + """Extract meta tags.""" + meta_tags = {} + for meta in self.soup.find_all("meta"): + name = meta.get("name") or meta.get("property") + content = meta.get("content") + if name and content: + meta_tags[name] = content + return meta_tags + + def _get_headings(self) -> List[Dict[str, Any]]: + """Extract all headings (h1-h6).""" + headings = [] + for level in range(1, 7): + for heading in self.soup.find_all(f"h{level}"): + headings.append( + {"level": level, "text": heading.get_text(strip=True)} + ) + return headings + + def _get_paragraphs(self) -> List[str]: + """Extract all paragraph text.""" + paragraphs = [] + for p in self.soup.find_all("p"): + text = p.get_text(strip=True) + if text: + paragraphs.append(text) + return paragraphs + + def _get_links(self) -> List[Dict[str, Optional[str]]]: + """Extract all links.""" + links = [] + for link in self.soup.find_all("a", href=True): + links.append( + { + "text": link.get_text(strip=True), + "href": link.get("href"), + "title": link.get("title"), + } + ) + return 
def get_text(self) -> str:
    """
    Extract plain text from HTML.

    Script and style elements are dropped before the text is collected.
    The removal happens on a re-parsed copy, mirroring ``to_markdown``, so
    calling this method does not alter ``self.soup`` and later conversions
    on the same parser see the full document.

    Returns:
        Plain text content, one text fragment per line, each stripped of
        surrounding whitespace
    """
    soup_copy = BeautifulSoup(str(self.soup), "html.parser")

    # Remove scripts and styles (calling the soup is shorthand for find_all)
    for element in soup_copy(["script", "style"]):
        element.decompose()

    return soup_copy.get_text(separator="\n", strip=True)
def scrape(
    self,
    url: str,
    output_format: str = "markdown",
    use_js: bool = False,
) -> str:
    """
    Scrape a single URL.

    Steps, in order: validate the URL, consult robots.txt (when
    ``respect_robots`` is set), sleep for a random delay (when
    ``use_delay`` is set), fetch the HTML, then convert it with
    ``parse_html``.

    Args:
        url: URL to scrape
        output_format: Output format ('markdown', 'json', 'text')
        use_js: Use headless browser for JavaScript rendering

    Returns:
        Scraped content in specified format

    Raises:
        ScraperError: If the URL is invalid, robots.txt disallows it, or
            fetching/parsing fails. In the last case the underlying
            exception is attached as ``__cause__``.
    """
    # Validate URL
    if not is_valid_url(url):
        raise ScraperError(f"Invalid URL: {url}")

    # Check robots.txt
    if self.respect_robots and not check_robots_txt(url, self.user_agent):
        raise ScraperError(f"Scraping {url} is disallowed by robots.txt")

    # Add delay if enabled
    if self.use_delay:
        random_delay()

    # Fetch content
    try:
        if use_js:
            logger.info(f"Fetching {url} with JavaScript rendering...")
            # self.timeout is in seconds; the browser helper takes ms.
            html_content = fetch_with_js_sync(
                url, self.user_agent, self.timeout * 1000
            )
        else:
            logger.info(f"Fetching {url} with static request...")
            html_content = self._fetch_static(url)

        # Parse and format
        result = parse_html(html_content, url, output_format)
        logger.info(f"Successfully scraped {url}")

        return result

    except Exception as e:
        # Single public error type for callers; chain explicitly so the
        # original failure stays reachable for debugging.
        logger.error(f"Failed to scrape {url}: {e}")
        raise ScraperError(f"Failed to scrape {url}: {e}") from e
def scrape_url(
    url: str,
    output_format: str = "markdown",
    use_js: bool = False,
    user_agent: str = "TermScrape/1.0",
    respect_robots: bool = True,
) -> str:
    """
    One-shot helper: build a ``Scraper`` and scrape a single URL with it.

    Args:
        url: URL to scrape
        output_format: Output format ('markdown', 'json', 'text')
        use_js: Use JavaScript rendering
        user_agent: Custom user agent
        respect_robots: Check robots.txt

    Returns:
        Scraped content in specified format
    """
    return Scraper(
        user_agent=user_agent,
        respect_robots=respect_robots,
    ).scrape(url, output_format, use_js)
def normalize_url(url: str, base_url: Optional[str] = None) -> str:
    """
    Normalize a URL by resolving it against a base and dropping its fragment.

    Args:
        url: URL to normalize (absolute, relative, or fragment-only)
        base_url: Base URL for resolving relative paths

    Returns:
        Normalized URL with no fragment; absolute when ``base_url`` is
        given or ``url`` was already absolute
    """
    from urllib.parse import urljoin, urldefrag

    # Resolve first, strip the fragment last. urljoin() returns the base
    # unchanged when the relative part is empty, so defragmenting *before*
    # joining let a fragment on base_url leak into the result for
    # fragment-only links such as "#top".
    if base_url:
        url = urljoin(base_url, url)

    url, _ = urldefrag(url)
    return url
def test_crawl_site_convenience_function():
    """crawl_site should invoke Crawler.crawl once and return the start URL's entry."""
    canned = {"https://example.com": {"content": "Test"}}

    # Crawler.crawl is replaced so no network access happens.
    with patch("src.crawler.Crawler.crawl", return_value=canned) as mock_crawl:
        result = crawl_site("https://example.com")

    assert "https://example.com" in result
    mock_crawl.assert_called_once()

Hello World

+ + """ + mock_response.headers = {"Content-Type": "text/html"} + mock_response.raise_for_status = Mock() + mock_get.return_value = mock_response + + scraper = Scraper(use_delay=False) + result = scraper.scrape("https://example.com", output_format="text") + + assert "Hello World" in result + mock_robots.assert_called_once() + + @patch("src.scraper.check_robots_txt") + def test_robots_txt_disallowed(self, mock_robots): + """Test that robots.txt disallowed URLs are blocked.""" + mock_robots.return_value = False + + scraper = Scraper() + with pytest.raises(ScraperError, match="disallowed by robots.txt"): + scraper.scrape("https://example.com") + + @patch("src.scraper.check_robots_txt") + @patch("src.scraper.requests.Session.get") + def test_scrape_markdown_format(self, mock_get, mock_robots): + """Test markdown output format.""" + mock_robots.return_value = True + + mock_response = Mock() + mock_response.text = """ + + +

Heading

+

Paragraph text

@patch("src.scraper.check_robots_txt")
@patch("src.scraper.requests.Session.get")
def test_request_timeout(self, mock_get, mock_robots):
    """Test that a request timeout surfaces as ScraperError with a timeout message."""
    # Imported locally so the module's import block stays untouched.
    from requests.exceptions import Timeout

    mock_robots.return_value = True
    # Raise the real Timeout type: a bare Exception would bypass the
    # dedicated `except Timeout` branch in Scraper._fetch_static and only
    # exercise the generic catch-all in Scraper.scrape.
    mock_get.side_effect = Timeout("Timeout")

    scraper = Scraper(use_delay=False)
    with pytest.raises(ScraperError, match="Request timeout"):
        scraper.scrape("https://example.com")