test.py

from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Regex pattern for detecting email addresses
EMAIL_REGEX = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
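
# A minimal sketch of what the pattern catches (the addresses below are
# made up for illustration):
#
#     re.findall(EMAIL_REGEX, 'Contact info@example.com or sales@shop.co.uk')
#     # -> ['info@example.com', 'sales@shop.co.uk']
#
# Note this is a pragmatic pattern, not a full RFC 5322 validator.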

def scrape_buttons_in_website(url):
    """Collect the absolute URLs of all anchor links on a page.

    Despite the name, this gathers every <a href=...> link, not just
    buttons.
    """
    # Set up Chrome options for headless mode
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')

    # Initialize the Chrome WebDriver
    service = webdriver.ChromeService()
    driver = webdriver.Chrome(service=service, options=options)
    try:
        # Fetch the website
        driver.get(url)
        # Extract the page source and pass it to BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Close the browser
        driver.quit()

    # Debugging: uncomment to check that you are getting proper content
    # print(soup.prettify())

    # Find all <a> tags that carry an href attribute
    links = soup.find_all('a', href=True)
    matches = []

    # Extract the href values and construct absolute URLs
    for link in links:
        href = link['href'].strip()
        if href.startswith('/'):
            # Resolve root-relative paths against the site root,
            # not against the full page path
            matches.append(urljoin(url, href))
        elif href.startswith('http'):
            # Absolute URLs can be used as-is
            matches.append(href)
        # Skip other schemes (mailto:, javascript:, fragments, etc.)

    return matches
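
# For reference, urljoin resolves root-relative hrefs against the site
# root rather than against the current page path (the path below is made
# up for illustration):
#
#     urljoin('https://articture.com/en-gb', '/pages/contact')
#     # -> 'https://articture.com/pages/contact'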

def scrape_email_from_website(url):
    """Visit every link found on the page and collect email addresses."""
    matches = scrape_buttons_in_website(url)
    emails = set()
    email_pattern = re.compile(EMAIL_REGEX)

    # Set up Chrome options for headless mode
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')

    # Initialize a single Chrome WebDriver and reuse it for every link;
    # quitting a driver also stops its service, so re-creating drivers
    # from one shared service object per link is fragile and slow
    service = webdriver.ChromeService()
    driver = webdriver.Chrome(service=service, options=options)
    try:
        # Iterate through the links and scrape emails
        for link in matches:
            try:
                # Fetch the page
                driver.get(link)
                # Extract the page source and pass it to BeautifulSoup
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Debugging: uncomment to check the fetched content
                # print(soup.prettify())
                emails.update(email_pattern.findall(soup.get_text()))
            except Exception as e:
                print(f'Error scraping {link}: {e}')
    finally:
        # Close the browser once all links have been visited
        driver.quit()

    return list(emails)

if __name__ == '__main__':
    url = 'https://articture.com/en-gb'
    result = scrape_email_from_website(url)
    print(result)