-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
170 lines (140 loc) · 4.99 KB
/
main.py
File metadata and controls
170 lines (140 loc) · 4.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import os
import sys
import argparse
from pathlib import Path
from typing import Optional
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from loguru import logger
# Add this file's directory to the Python path so the `src` package can be imported
sys.path.insert(0, str(Path(__file__).parent))
from src.scraper.config import get_config, ScraperConfig
from src.scraper.csv_reader import CSVReader
from src.scraper.logger import setup_logger, get_default_log_file
from src.scraper.spiders.web_spider import WebSpider
def parse_arguments(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the web scraper's command line arguments.

    Args:
        argv: List of argument strings to parse. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, which is the behavior of
            a plain ``parse_arguments()`` call.

    Returns:
        The parsed namespace with the attributes ``csv_file``, ``output_dir``,
        ``log_level``, ``log_file``, ``follow_internal_links``, ``max_depth``,
        ``max_pages_per_domain`` and ``selenium_wait_time``. Every option
        except ``output_dir`` and ``log_level`` is ``None`` when it was not
        given on the command line, so callers can tell "not specified" apart
        from an explicit value.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Web Scraper")

    parser.add_argument(
        "--csv-file", "-c",
        type=str,
        # Deliberately not marked required: csv_file is None when omitted.
        help="Path to the CSV file containing URLs to scrape",
    )
    parser.add_argument(
        "--output-dir", "-o",
        type=str,
        default="output",
        help="Directory to save scraped data",
    )
    parser.add_argument(
        "--log-level", "-l",
        type=str,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        default="INFO",
        help="Log level",
    )
    parser.add_argument(
        "--log-file",
        type=str,
        default=None,
        help="Path to the log file. If not provided, a default log file will be created.",
    )

    # Crawling options
    parser.add_argument(
        "--follow-internal-links",
        action="store_true",
        default=None,
        help="Follow internal links within the same domain",
    )
    parser.add_argument(
        "--no-follow-internal-links",
        action="store_false",
        dest="follow_internal_links",
        # Shares its dest with --follow-internal-links. Spell out the None
        # default so that omitting both flags means "not specified" without
        # depending on which of the two actions was registered first.
        default=None,
        help="Do not follow internal links",
    )
    parser.add_argument(
        "--max-depth",
        type=int,
        default=None,
        help="Maximum depth for crawling internal links",
    )
    parser.add_argument(
        "--max-pages-per-domain",
        type=int,
        default=None,
        help="Maximum number of pages to crawl per domain",
    )

    # Selenium options
    parser.add_argument(
        "--selenium-wait-time",
        type=int,
        default=None,
        help="Maximum time to wait for a page to load (in seconds)",
    )

    return parser.parse_args(argv)
def _apply_cli_overrides(config, args):
    """Copy command line options from ``args`` onto ``config`` in place."""
    # output_dir is assigned unconditionally; the remaining options are only
    # applied when they are not None, i.e. when they were actually supplied.
    config.output_dir = args.output_dir
    for option in (
        "follow_internal_links",
        "max_depth",
        "max_pages_per_domain",
        "selenium_wait_time",
    ):
        value = getattr(args, option)
        if value is not None:
            setattr(config, option, value)


def _build_scrapy_settings(config):
    """Return the Scrapy project settings updated with values from ``config``."""
    settings = get_project_settings()
    settings.update({
        'BOT_NAME': config.bot_name,
        'SPIDER_MODULES': config.spider_modules,
        'NEWSPIDER_MODULE': config.newspider_module,
        'ROBOTSTXT_OBEY': config.robotstxt_obey,
        'USER_AGENT': config.user_agent,
        'CONCURRENT_REQUESTS': config.concurrent_requests,
        'DOWNLOAD_DELAY': config.download_delay,
        'CONCURRENT_REQUESTS_PER_DOMAIN': config.concurrent_requests_per_domain,
        'ITEM_PIPELINES': config.item_pipelines,
        'OUTPUT_DIR': config.output_dir,
        'SELENIUM_WAIT_TIME': config.selenium_wait_time,
    })
    return settings


def main():
    """Run the web scraper.

    Parses the command line, sets up logging, loads the configuration and
    applies command line overrides, reads the URLs from the configured CSV
    file and runs ``WebSpider`` over them in a Scrapy ``CrawlerProcess``.

    Raises:
        SystemExit: With status 1 when the CSV file yields no URLs or when
            any exception is raised while setting up or running the crawl.
    """
    args = parse_arguments()

    # Logging is configured before the try block so the handler below can
    # report failures through the configured logger.
    log_file = args.log_file or get_default_log_file()
    setup_logger(log_level=args.log_level, log_file=log_file)

    try:
        config = get_config(csv_file_path=args.csv_file)
        _apply_cli_overrides(config, args)
        logger.info(f"Starting web scraper with configuration: {config}")

        urls = CSVReader(config.csv_file_path).read_urls()
        if not urls:
            logger.error("No URLs found in the CSV file")
            # Exit non-zero so this failure is visible to the calling shell,
            # consistent with the exception path below. SystemExit does not
            # derive from Exception, so the handler below does not catch it.
            sys.exit(1)
        logger.info(f"Found {len(urls)} URLs to scrape")

        process = CrawlerProcess(_build_scrapy_settings(config))
        process.crawl(
            WebSpider,
            urls=urls,
            follow_internal_links=config.follow_internal_links,
            max_depth=config.max_depth,
            max_pages_per_domain=config.max_pages_per_domain,
        )

        logger.info("Starting the crawling process")
        process.start()
        logger.info("Crawling process completed")
    except Exception as e:
        # Top-level boundary: log the traceback and report failure via the
        # process exit status.
        logger.exception(f"Error running web scraper: {e}")
        sys.exit(1)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()