-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinfo_scrape.py
More file actions
executable file
·77 lines (54 loc) · 2.21 KB
/
info_scrape.py
File metadata and controls
executable file
·77 lines (54 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 5 11:12:49 2022
@author: eliorland
"""
'''
Orignal script used to make the product info file. It remains unchanged
since its initial run and is here to show how the file was first created
'''
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
archive_url = 'https://www.taylorstitch.com/collections/mens-archive'
shirts_url = 'https://www.taylorstitch.com/collections/mens-shirts?sorted=best-selling-sales-count'
base_url = 'https://www.taylorstitch.com'
url_list = [archive_url, shirts_url]

# Collect the absolute URL of every product linked from the two collection
# pages; these pages are fetched once each, product pages are visited later.
product_urls = []
for url in url_list:
    site = requests.get(url)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    site.raise_for_status()
    soup = BeautifulSoup(site.text, 'html.parser')
    # The product grid is the first <ul class="product matrix">; every <a>
    # tag inside it points at an individual product page (relative href).
    matrix = soup.find('ul', {'class': 'product matrix'})
    for anchor in matrix.find_all('a', href=True):
        product_urls.append(base_url + anchor['href'])
# Visit each product page and pull the relevant info (title, description,
# material). Results accumulate in three parallel lists, which are converted
# to a pandas DataFrame afterwards.
product_title = []
product_description = []
product_material = []
for product in product_urls:
    product_page = requests.get(product)
    product_soup = BeautifulSoup(product_page.text, 'html.parser')
    description_info = product_soup.find_all('div',
                                             {'id': 'collapsible-description'})
    material_info = product_soup.find_all('div',
                                          {'id': 'collapsible-material'})
    try:
        # Any of these lookups can fail on a page that lacks the expected
        # markup; skip such products rather than aborting the whole run.
        # (Title extraction was previously outside the try: a page with no
        # <h1> crashed the script with TypeError.)
        title = product_soup.find('h1')['data-title']     # TypeError/KeyError
        description = description_info[0].find('p').text  # IndexError/AttributeError
        material = material_info[0].find('p').text
    except (TypeError, KeyError, IndexError, AttributeError):
        continue
    # Only append once all three fields resolved, keeping the lists aligned.
    product_title.append(title)
    product_description.append(description)
    product_material.append(material)
# Assemble the three scraped columns into a single table and persist it.
# Column order (Name, Description, Material) matches dict insertion order.
all_info = pd.DataFrame({'Name': product_title,
                         'Description': product_description,
                         'Material': product_material})
all_info.to_csv('taylor_stitch_info.csv', index=True, header=True)