Skip to content

Commit a23b801

Browse files
author
TTOAI
committed
기사 개수 크롭 코드 추가
1 parent 74b9d66 commit a23b801

2 files changed

Lines changed: 53 additions & 6 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,3 +14,5 @@ __pycache__/
1414

1515
# Storage files
1616
*.zip
17+
18+
*/test.py

playwright_news_crawler/daum_news_crawler.py

Lines changed: 51 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -263,6 +263,55 @@ def run(self):
263263
print("[경고] 수집 중 오류 발생 - 종료")
264264
return None
265265

266+
print("\n===================================\n")
267+
print("[크롤링 완료]")
268+
print(f"총 수집된 기사 수: {len(data)}")
269+
print("수집 실패 카테고리:", self.cat_list)
270+
end = time.time()
271+
print(f"실행 시간: {end - start:.4f}초")
272+
print("\n===================================\n")
273+
274+
275+
counts = {}
276+
277+
for d in data:
278+
if d["subcategory"].strip():
279+
counts[d["subcategory"]] = counts.get(d["subcategory"], 0) + 1
280+
elif d["category"].strip():
281+
counts[d["category"]] = counts.get(d["category"], 0) + 1
282+
283+
for k in counts:
284+
counts[k] = counts[k] // 10
285+
286+
new_data = []
287+
cat_count = {}
288+
289+
for d in data:
290+
cat = d["category"].strip()
291+
sub = d["subcategory"].strip()
292+
key = sub if sub else cat
293+
if not key:
294+
continue
295+
if counts.get(key, 0) > 0 or cat_count.get(cat, 0) < 20:
296+
new_data.append(d)
297+
counts[key] = counts.get(key, 0) - 1
298+
cat_count[cat] = cat_count.get(cat, 0) + 1
299+
300+
filtered_category = {}
301+
filtered_subcategory = {}
302+
303+
for d in new_data:
304+
cat = d["category"].strip()
305+
filtered_category[cat] = filtered_category.get(cat, 0) + 1
306+
307+
print("\n===================================\n")
308+
print("[필터링 완료]")
309+
print(f"총 기사 수 (필터링 후): {len(new_data)}")
310+
print("필터링 후 카테고리별 기사 수:")
311+
for cat, cnt in sorted(filtered_category.items(), key=lambda x: (-x[1], x[0])):
312+
print(f"{cat:20} {cnt}")
313+
print("\n===================================\n")
314+
266315
try:
267316
with tempfile.NamedTemporaryFile("w", delete=False, encoding="utf-8") as tmp:
268317
for item in data:
@@ -275,12 +324,8 @@ def run(self):
275324
os.remove(tmp_path)
276325

277326
print("\n===================================\n")
278-
print("[완료] 크롤링 완료")
279-
print(f"총 수집된 기사 수: {len(data)}")
280-
print(f"결과가 {self.NEWS_URLS_JSONL} 파일에 저장되었습니다.")
281-
print("수집 실패 카테고리:", self.cat_list)
282-
end = time.time()
283-
print(f"실행 시간: {end - start:.4f}초")
327+
print("[저장 완료]")
328+
print(f"결과가 {self.NEWS_URLS_JSONL} 파일에 저장됨.")
284329
print("\n===================================\n")
285330

286331

0 commit comments

Comments
 (0)