# Source: web-scraper/main.py (spectrayan/web-scraper, branch "main").
"""Command-line entry point for the web scraper."""

import os
import sys
import argparse
from pathlib import Path
from typing import Optional

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from loguru import logger

# Add the src directory to the Python path
sys.path.insert(0, str(Path(__file__).parent))

from src.scraper.config import get_config, ScraperConfig
from src.scraper.csv_reader import CSVReader
from src.scraper.logger import setup_logger, get_default_log_file
from src.scraper.spiders.web_spider import WebSpider


def parse_arguments(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse falls back to ``sys.argv[1:]``, which is what the
            script entry point uses. Passing an explicit list lets tests
            and other Python code drive the parser.

    Returns:
        The populated ``argparse.Namespace``. Every option the user did not
        supply is ``None``, except ``output_dir`` (``"output"``) and
        ``log_level`` (``"INFO"``).
    """
    parser = argparse.ArgumentParser(description="Web Scraper")

    # General options
    parser.add_argument(
        "--csv-file", "-c",
        type=str,
        # Intentionally optional: main() forwards None to get_config()
        # when the flag is omitted.
        help="Path to the CSV file containing URLs to scrape",
    )
    parser.add_argument(
        "--output-dir", "-o",
        type=str,
        default="output",
        help="Directory to save scraped data",
    )
    parser.add_argument(
        "--log-level", "-l",
        type=str,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default="INFO",
        help="Log level",
    )
    parser.add_argument(
        "--log-file",
        type=str,
        default=None,
        help="Path to the log file. If not provided, a default log file will be created.",
    )

    # Crawling options
    # Both flags write to the same destination. The default is spelled out
    # on each of them so that "neither flag given" yields None regardless of
    # the order in which the two arguments are registered (store_false would
    # otherwise default to True).
    parser.add_argument(
        "--follow-internal-links",
        action="store_true",
        default=None,
        help="Follow internal links within the same domain",
    )
    parser.add_argument(
        "--no-follow-internal-links",
        action="store_false",
        dest="follow_internal_links",
        default=None,
        help="Do not follow internal links",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=None,
        help="Maximum depth for crawling internal links",
    )
    parser.add_argument(
        "--max-pages-per-domain",
        type=int,
        default=None,
        help="Maximum number of pages to crawl per domain",
    )

    # Selenium options
    parser.add_argument(
        "--selenium-wait-time",
        type=int,
        default=None,
        help="Maximum time to wait for a page to load (in seconds)",
    )

    return parser.parse_args(argv)


def _apply_cli_overrides(config: "ScraperConfig", args: argparse.Namespace) -> None:
    """Copy command-line values onto *config*, skipping options left unset."""
    # --output-dir carries an argparse default, so it is always applied.
    config.output_dir = args.output_dir

    # These options default to None on the command line; None means
    # "keep whatever the configuration already holds".
    optional_overrides = (
        "follow_internal_links",
        "max_depth",
        "max_pages_per_domain",
        "selenium_wait_time",
    )
    for option in optional_overrides:
        value = getattr(args, option)
        if value is not None:
            setattr(config, option, value)


def _build_settings(config: "ScraperConfig"):
    """Return the Scrapy project settings updated with values from *config*."""
    settings = get_project_settings()
    settings.update({
        'BOT_NAME': config.bot_name,
        'SPIDER_MODULES': config.spider_modules,
        'NEWSPIDER_MODULE': config.newspider_module,
        'ROBOTSTXT_OBEY': config.robotstxt_obey,
        'USER_AGENT': config.user_agent,
        'CONCURRENT_REQUESTS': config.concurrent_requests,
        'DOWNLOAD_DELAY': config.download_delay,
        'CONCURRENT_REQUESTS_PER_DOMAIN': config.concurrent_requests_per_domain,
        'ITEM_PIPELINES': config.item_pipelines,
        'OUTPUT_DIR': config.output_dir,
        'SELENIUM_WAIT_TIME': config.selenium_wait_time,
    })
    return settings


def main() -> None:
    """Run the web scraper.

    Parses the command line, sets up logging, loads the configuration,
    applies command-line overrides, reads the URLs from the configured CSV
    file and crawls them with ``WebSpider`` in a ``CrawlerProcess``.

    Exits the process with status 1 when the CSV file yields no URLs or
    when any exception is raised while configuring or running the crawl.
    """
    args = parse_arguments()

    # Logging is configured before the try block so that failures below are
    # reported through the configured logger.
    log_file = args.log_file or get_default_log_file()
    setup_logger(log_level=args.log_level, log_file=log_file)

    try:
        config = get_config(csv_file_path=args.csv_file)
        _apply_cli_overrides(config, args)

        logger.info(f"Starting web scraper with configuration: {config}")

        urls = CSVReader(config.csv_file_path).read_urls()
        if not urls:
            logger.error("No URLs found in the CSV file")
            # Report the failure to the caller. SystemExit is not an
            # Exception subclass, so the handler below does not intercept it.
            sys.exit(1)

        logger.info(f"Found {len(urls)} URLs to scrape")

        process = CrawlerProcess(_build_settings(config))

        # Schedule the spider with the crawling configuration.
        process.crawl(
            WebSpider,
            urls=urls,
            follow_internal_links=config.follow_internal_links,
            max_depth=config.max_depth,
            max_pages_per_domain=config.max_pages_per_domain,
        )

        logger.info("Starting the crawling process")
        process.start()

        logger.info("Crawling process completed")

    except Exception as e:
        logger.exception(f"Error running web scraper: {e}")
        sys.exit(1)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()