Skip to content

Commit 5b8d0a8

Browse files
authored
Merge pull request #2 from MusinsaWagon/develop
Refactor: 가격 크롤링 정규표현식 적용
2 parents 9af0b35 + eb7f11a commit 5b8d0a8

9 files changed

Lines changed: 331 additions & 42 deletions

File tree

etc/zigzag_products.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
129826097
-291 Bytes
Binary file not shown.
541 Bytes
Binary file not shown.

models/product.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,11 @@
33
import logging
44
from config.mysql import Base
55
from config.mysql import Session
6+
from models.shop_type import ShopType
67
from models.category import get_or_create_category
78
from models.product_detail import create_product_detail, find_product_detail_by_id, update_product_detail_info
89
from models.product_history import create_product_history, count_product_history_by_product_id
910
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
10-
from enum import Enum
11-
12-
class ShopType(Enum):
13-
MUSINSA = "MUSINSA"
14-
ZIGZAG = "ZIGZAG"
15-
ABLY = "ABLY"
16-
BRANDY = "BRANDY"
1711

1812
class Product(Base):
1913
__tablename__ = 'product'
@@ -35,7 +29,7 @@ class Product(Base):
3529
updated_at = Column(Date, default=datetime.date.today, onupdate=datetime.date.today)
3630

3731
# 상품 생성
38-
def create_product(product):
32+
def create_product(product, shop_type):
3933
try:
4034
category_id = get_or_create_category(product['category'], product['parent_category'])
4135
product_num = int(product['product_num'])
@@ -47,7 +41,7 @@ def create_product(product):
4741
star_score = float(product.get('star_score', 0.0))
4842
review_count = int(product.get('review_count', 0))
4943
like_count = int(product.get('like_count', 0))
50-
shop_type = ShopType.MUSINSA
44+
shop_type = shop_type
5145

5246
# Product 객체 생성
5347
new_product = Product(
@@ -75,13 +69,13 @@ def create_product(product):
7569

7670

7771
# 상품 저장
78-
def save_product_info(products_info):
72+
def save_product_info(products_info, shop_type):
7973
session = Session()
8074
try:
8175
for product in products_info:
8276
try:
8377
with session.begin():
84-
new_product = create_product(product)
78+
new_product = create_product(product, shop_type)
8579
if new_product is None:
8680
continue # 생성 실패한 경우 다음으로 넘어감
8781

models/shop_type.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from enum import Enum
2+
class ShopType(Enum):
    """Identifies the shop a crawled product is saved under.

    Every member's value is the same string as its name.
    """

    MUSINSA = "MUSINSA"
    ZIGZAG = "ZIGZAG"
    ABLY = "ABLY"
    BRANDY = "BRANDY"

musinsa/product_day_price.py

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from config.file import read_product_numbers
1515
from models.product import update_product_and_history_and_detail_info, get_all_product_numbers
1616
import random
17+
import re
1718

1819
load_dotenv() # 환경변수 로딩
1920

@@ -39,19 +40,14 @@ def extract_musinsa_current_price(product_num, headers):
3940
if script_tag:
4041
# script 내용 중 필요한 부분만 추출
4142
script_content = script_tag.string.strip()
42-
43-
# JSON 객체만 추출
44-
json_start = script_content.find('{"goodsNo":')
45-
json_end = script_content.rfind('}') + 1
46-
47-
# JSON 데이터가 올바르게 추출되었는지 확인
48-
if json_start != -1 and json_end != -1:
49-
json_data = json.loads(script_content[json_start:json_end])
50-
51-
# 판매가 추출
43+
# 정규식으로 JSON 추출
44+
match = re.search(r'window\.__MSS__\.product\.state\s*=\s*({.*?});', script_content)
45+
if match:
46+
json_str = match.group(1) # 첫 번째 그룹이 JSON 문자열
47+
json_data = json.loads(json_str)
5248
current_price = json_data.get('goodsPrice', {}).get('salePrice', 'N/A')
5349
return current_price
54-
50+
5551
else:
5652
logging.warning(f'JSON 데이터를 추출할 수 없습니다. 상품 번호: {product_num}')
5753
return None
@@ -73,7 +69,6 @@ def process_products(products_num):
7369
time.sleep(random.uniform(1, 3)) # 1초에서 3초 사이의 랜덤 딜레이
7470

7571
price = extract_musinsa_current_price(product_id, headers)
76-
7772
if price:
7873
successful_products.append(f'상품 번호: {product_id}, 가격: {price}원')
7974
update_product_and_history_and_detail_info(price, product_id, "MUSINSA")

musinsa/product_info_crawling.py

Lines changed: 17 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
from config.mysql import *
1616
from models.product import save_product_info
1717
from config.file import read_product_numbers
18+
from models.shop_type import ShopType
19+
1820

1921
# 무신사 상품 기본 URL
2022
USER_AGENT = os.getenv("USER_AGENT")
2123
MUSINSA_PRODUCT_URL = os.getenv("MUSINSA_PRODUCT_URL")
2224
PRODUCTS_FILE_PATH = os.getenv("PRODUCTS_FILE_PATH")
23-
ADD_PROUDCTS_LIST_FILE_PATH = os.getenv("ADD_PROUDCTS_LIST_FILE_PATH")
2425

2526
load_dotenv() # 환경변수 로딩
2627

@@ -31,21 +32,6 @@ def extract_json_from_script(script_content):
3132
return json.loads(script_content[json_start:json_end])
3233
return None
3334

34-
def extract_product_info(json_data, product_num, product_url):
35-
return {
36-
'name': json_data.get('goodsNm', 'N/A'),
37-
'brand': json_data.get('brandInfo', {}).get('brandName', 'N/A'),
38-
'parent_category': json_data.get('category', {}).get('categoryDepth1Title', 'N/A'),
39-
'category': json_data.get('category', {}).get('categoryDepth2Title', 'N/A'),
40-
'product_num': product_num,
41-
'current_price': json_data.get('goodsPrice', {}).get('salePrice', 'N/A'),
42-
'image_url': json_data.get('thumbnailImageUrl', 'N/A'),
43-
'star_score': json_data.get('goodsReview', {}).get('satisfactionScore', 'N/A'),
44-
'review_count': json_data.get('goodsReview', {}).get('totalCount', 'N/A'),
45-
'product_url': product_url,
46-
'brand_logo_url': json_data.get('brandInfo', {}).get('brandLogoImage', 'N/A'),
47-
'like_count': 0, # 현재 like_count는 가상 데이터
48-
}
4935

5036
def extract_musinsa_product_main_info(product_num, session, headers):
5137
product_url = f'{MUSINSA_PRODUCT_URL}/{product_num}'
@@ -65,9 +51,21 @@ def extract_musinsa_product_main_info(product_num, session, headers):
6551
if not json_data:
6652
logging.warning(f'JSON 데이터를 추출할 수 없습니다. 상품 번호: {product_num}')
6753
return None
68-
print(json_data)
6954

70-
return extract_product_info(json_data, product_num, product_url)
55+
return {
56+
'name': json_data.get('goodsNm', 'N/A'),
57+
'brand': json_data.get('brandInfo', {}).get('brandName', 'N/A'),
58+
'parent_category': json_data.get('category', {}).get('categoryDepth1Title', 'N/A'),
59+
'category': json_data.get('category', {}).get('categoryDepth2Title', 'N/A'),
60+
'product_num': product_num,
61+
'current_price': json_data.get('goodsPrice', {}).get('salePrice', 'N/A'),
62+
'image_url': json_data.get('thumbnailImageUrl', 'N/A'),
63+
'star_score': json_data.get('goodsReview', {}).get('satisfactionScore', 'N/A'),
64+
'review_count': json_data.get('goodsReview', {}).get('totalCount', 'N/A'),
65+
'product_url': product_url,
66+
'brand_logo_url': json_data.get('brandInfo', {}).get('brandLogoImage', 'N/A'),
67+
'like_count': 0, # 현재 like_count는 가상 데이터
68+
}
7169

7270
except requests.RequestException as e:
7371
logging.error(f'페이지를 불러오지 못했습니다. 상품 번호: {product_num}, 오류: {e}')
@@ -116,7 +114,7 @@ def get_musinsa_product_info():
116114
print_product_main_data(products_info)
117115

118116
# DB에 저장
119-
save_product_info(products_info)
117+
save_product_info(products_info, ShopType.MUSINSA)
120118

121119
if __name__ == "__main__":
122120
get_musinsa_product_info()

zigzag/product_info_crawling.py

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
import sys
2+
import os
3+
import requests
4+
import json
5+
import time
6+
from bs4 import BeautifulSoup
7+
import os
8+
from concurrent.futures import ThreadPoolExecutor
9+
from multiprocessing import Pool, cpu_count
10+
from dotenv import load_dotenv
11+
import logging
12+
13+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14+
from config.log import *
15+
from config.mysql import *
16+
from models.product import save_product_info
17+
from config.file import read_product_numbers
18+
from models.shop_type import ShopType
19+
20+
# Load the .env file first: os.getenv() only returns values that are already
# in the process environment at the moment it is called, so reading the
# settings before load_dotenv() yields None for anything defined only in .env.
# NOTE(review): config.log / config.mysql are star-imported above and may
# already load the .env file — confirm; calling it here again is harmless.
load_dotenv()

# Zigzag crawler settings (product base URL, user agent, product list file)
USER_AGENT = os.getenv("USER_AGENT")
ZIGZAG_PRODUCT_URL = os.getenv("ZIGZAG_PRODUCT_URL")
ZIGZAG_PRODUCTS_FILE_PATH = os.getenv("ZIGZAG_PRODUCTS_FILE_PATH")
26+
27+
# 리뷰 수 & 평점
28+
def get_review_summary(catalog_product_id, headers, limit_count=5, order="BEST_SCORE_DESC"):
    """Fetch the review count and average rating of a Zigzag product.

    Posts the ``GetPdpIntegratedData`` GraphQL operation and reads only the
    ``related_product_review_summary`` section of the response.

    Args:
        catalog_product_id: Product ID sent as the ``catalog_product_id`` variable.
        headers: HTTP headers sent with the request.
        limit_count: Value of the ``limit_count`` query variable (default 5).
        order: Value of the ``order`` query variable (default "BEST_SCORE_DESC").

    Returns:
        A dict with ``review_count`` and ``star_score`` on success, or ``None``
        when the request or the response decoding fails (the error is logged).
    """
    url = "https://api.zigzag.kr/api/2/graphql/GetPdpIntegratedData"

    payload = {
        "operationName": "GetPdpIntegratedData",
        "variables": {
            "catalog_product_id": catalog_product_id,
            "limit_count": limit_count,
            "order": order
        },
        "query": """
        query GetPdpIntegratedData(
            $catalog_product_id: ID!,
            $limit_count: Int,
            $order: ProductReviewListOrderType
        ) {
            related_product_review_summary(product_id: $catalog_product_id) {
                all_count
                ratings_average
            }
            product_review_list(
                product_id: $catalog_product_id,
                limit_count: $limit_count,
                order: $order
            ) {
                item_list {
                    id
                    contents
                    rating
                    reviewer {
                        profile {
                            masked_email
                        }
                    }
                }
            }
        }
        """
    }

    try:
        # Without a timeout a stalled connection would block the calling
        # worker thread forever; 5s matches the product page request.
        response = requests.post(url, json=payload, headers=headers, timeout=5)
        response.raise_for_status()

        data = response.json()

        # JSON null decodes to None, and dict.get(key, default) returns that
        # None instead of the default, so fall back explicitly at every level.
        review_summary = (data.get("data") or {}).get("related_product_review_summary") or {}
        review_count = review_summary.get("all_count") or 0
        star_score = review_summary.get("ratings_average") or 0.0

        return {
            "review_count": review_count,
            "star_score": star_score
        }
    except requests.exceptions.RequestException as e:
        logging.error(f"HTTP 리뷰 수 요청 에러: {e}")
    except Exception as e:
        # Best-effort lookup: any other failure (e.g. a non-JSON body) is
        # logged and reported to the caller as None.
        logging.error(f"리뷰 수 요청 오류 발생: {e}")
    return None
92+
93+
94+
95+
def extract_product_info(json_data, product_num, product_url):
    """Build the flat product dict from a decoded product JSON payload.

    The keys read here (``goodsNm``, ``brandInfo``, ``goodsPrice``, ...) are
    the same ones the Musinsa crawler reads.
    NOTE(review): nothing in the visible part of this file calls this
    function — confirm whether it is still needed for Zigzag.

    Args:
        json_data: Decoded JSON object (dict) describing one product.
        product_num: Product number, copied into the result unchanged.
        product_url: Product page URL, copied into the result unchanged.

    Returns:
        A dict with the product fields; every value missing from
        ``json_data`` is reported as ``'N/A'`` and ``like_count`` is always 0.
    """
    def _section(key):
        # A JSON null decodes to None, which .get(key, {}) would hand back
        # as-is and then fail on the chained .get(); treat it as empty.
        value = json_data.get(key)
        return value if isinstance(value, dict) else {}

    brand_info = _section('brandInfo')
    category = _section('category')
    price = _section('goodsPrice')
    review = _section('goodsReview')

    return {
        'name': json_data.get('goodsNm', 'N/A'),
        'brand': brand_info.get('brandName', 'N/A'),
        'parent_category': category.get('categoryDepth1Title', 'N/A'),
        'category': category.get('categoryDepth2Title', 'N/A'),
        'product_num': product_num,
        'current_price': price.get('salePrice', 'N/A'),
        'image_url': json_data.get('thumbnailImageUrl', 'N/A'),
        'star_score': review.get('satisfactionScore', 'N/A'),
        'review_count': review.get('totalCount', 'N/A'),
        'product_url': product_url,
        'brand_logo_url': brand_info.get('brandLogoImage', 'N/A'),
        'like_count': 0,  # like_count is placeholder data for now
    }
110+
111+
def parsing_product_to_json_data(product_num, session, headers, product_url):
    """Download a product page and decode its ``__NEXT_DATA__`` JSON.

    Args:
        product_num: Product number, used only in the warning message.
        session: Object whose ``get(url, headers=..., timeout=...)`` performs
            the HTTP request (the callers pass a ``requests.Session``).
        headers: HTTP headers sent with the request.
        product_url: URL of the product page.

    Returns:
        The decoded JSON, or ``None`` when the page has no usable
        ``<script id="__NEXT_DATA__">`` element.

    Raises:
        json.JSONDecodeError: If the script content is not valid JSON.
        Whatever ``session.get`` / ``raise_for_status`` raise on HTTP failure.
    """
    response = session.get(product_url, headers=headers, timeout=5)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    script_data = soup.find('script', {'id': '__NEXT_DATA__'})
    # .string is None for an empty tag; json.loads(None) would raise a
    # TypeError that no caller handles, so treat it like a missing tag.
    if not script_data or not script_data.string:
        logging.warning(f'상품 정보를 찾을 수 없습니다. 상품 번호: {product_num}')
        return None

    return json.loads(script_data.string)
124+
125+
def extract_zigzag_product_main_info(product_num, session, headers):
    """Collect the main information of one Zigzag product.

    Reads the product page JSON, then the review summary, and flattens both
    into the dict shape that ``save_product_info`` receives.

    Args:
        product_num: Product number appended to ``ZIGZAG_PRODUCT_URL``.
        session: HTTP session handed to ``parsing_product_to_json_data``.
        headers: HTTP headers used for both requests.

    Returns:
        The product dict, or ``None`` when the page cannot be fetched, parsed,
        or contains no product data (the reason is logged).
    """
    product_url = f'{ZIGZAG_PRODUCT_URL}/{product_num}'

    try:
        json_data = parsing_product_to_json_data(product_num, session, headers, product_url)
        if not json_data:
            # The parser returns None (and logs) when the page has no data.
            return None

        page_props = (json_data.get('props') or {}).get('pageProps') or {}
        product_data = page_props.get('product') or {}
        brand_data = page_props.get('shop') or {}

        # Validate before doing anything else so a missing product neither
        # triggers the review request nor the category indexing below.
        if not product_data:
            logging.warning(f'상품 데이터를 찾을 수 없습니다. 상품 번호: {product_num}')
            return None

        # Extract category information
        managed_categories = product_data.get('managed_category_list') or []
        category_datas = [cat.get('value', 'N/A') for cat in managed_categories]

        def category_at(index):
            # Products with a shallower category path fall back to 'N/A',
            # the same placeholder used for every other missing field.
            return category_datas[index] if len(category_datas) > index else 'N/A'

        # get_review_summary returns None on failure; default to the zero
        # values so one failed review lookup does not drop the product.
        review_summary = get_review_summary(product_num, headers) or {}

        price_info = (product_data.get('product_price') or {}).get('final_discount_info') or {}
        # `or [{}]` also covers an empty list, where [0] would raise.
        image_list = product_data.get('product_image_list') or [{}]

        return {
            'name': product_data.get('name', 'N/A'),
            'brand': brand_data.get('name', 'N/A'),
            'parent_category': category_at(2),
            'category': category_at(3),
            'product_num': product_num,
            'product_url': product_url,
            'current_price': price_info.get('discount_price', 'N/A'),
            'image_url': image_list[0].get('url', 'N/A'),
            'star_score': review_summary.get('star_score') or 0.0,
            'review_count': review_summary.get('review_count') or 0,
            'brand_logo_url': brand_data.get('typical_image_url', 'N/A'),
            'like_count': 0,
        }

    except json.JSONDecodeError as e:
        logging.error(f'JSON 파싱 오류: {product_num}, 오류: {e}')
        return None
    except requests.RequestException as e:
        # Same handling as the Musinsa crawler: a failed page request skips
        # this product instead of propagating out of the worker thread.
        logging.error(f'페이지를 불러오지 못했습니다. 상품 번호: {product_num}, 오류: {e}')
        return None
161+
162+
def fetch_product_info_multithread(products_num, headers):
    """Fetch the main information of many products concurrently.

    Args:
        products_num: Iterable of product numbers to crawl.
        headers: HTTP headers passed to every request.

    Returns:
        List of product dicts in the order of ``products_num``; products whose
        extraction returned nothing or raised are left out.
    """
    products_info = []
    # NOTE(review): one requests.Session is shared by all worker threads;
    # confirm that this is safe for the way the workers use it.
    with requests.Session() as session:
        with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
            futures = [
                (product_num, executor.submit(extract_zigzag_product_main_info, product_num, session, headers))
                for product_num in products_num
            ]
            for product_num, future in futures:
                try:
                    product_info = future.result()
                except Exception as e:
                    # result() re-raises whatever the worker raised; isolate
                    # the failure so one bad product does not abort the crawl.
                    logging.error(f'상품 정보 수집 실패. 상품 번호: {product_num}, 오류: {e}')
                    continue
                if product_info:
                    products_info.append(product_info)
    return products_info
172+
173+
def print_product_main_data(products_info):
    """Print every product's fields to stdout, one labelled line per field.

    Args:
        products_info: Iterable of product dicts containing all keys listed
            in ``labelled_keys`` below; a separator line follows each product.
    """
    # (label, dict key) pairs in the order they are printed.
    labelled_keys = (
        ('상품 번호', 'product_num'),
        ('상품 이름', 'name'),
        ('브랜드', 'brand'),
        ('상위 카테고리', 'parent_category'),
        ('카테고리', 'category'),
        ('상품 판매가', 'current_price'),
        ('상품 URL', 'product_url'),
        ('상품 이미지 URL', 'image_url'),
        ('좋아요 수', 'like_count'),
        ('별점', 'star_score'),
        ('리뷰 수', 'review_count'),
        ('로고 URL', 'brand_logo_url'),
    )
    for item in products_info:
        for label, key in labelled_keys:
            print(f'{label}: {item[key]}')
        print("---------------------------------------")
188+
189+
def get_zigzag_product_info():
    """Crawl the Zigzag products listed in the products file and save them.

    Reads the product numbers from ``ZIGZAG_PRODUCTS_FILE_PATH``, fetches
    their information concurrently, logs the elapsed time, prints the result
    and stores it with ``ShopType.ZIGZAG``.
    """
    product_numbers = read_product_numbers(ZIGZAG_PRODUCTS_FILE_PATH)
    request_headers = {'User-Agent': USER_AGENT, "Connection": "close"}

    started_at = time.time()
    crawled = fetch_product_info_multithread(product_numbers, request_headers)
    elapsed = time.time() - started_at

    logging.info(f'총 실행 시간: {elapsed:.2f}초')
    print_product_main_data(crawled)

    # Save to the DB
    save_product_info(crawled, ShopType.ZIGZAG)
206+
207+
208+
if __name__ == "__main__":
209+
get_zigzag_product_info()
210+

0 commit comments

Comments
 (0)