@@ -263,6 +263,55 @@ def run(self):
263263 print ("[경고] 수집 중 오류 발생 - 종료" )
264264 return None
265265
266+ print ("\n ===================================\n " )
267+ print ("[크롤링 완료]" )
268+ print (f"총 수집된 기사 수: { len (data )} " )
269+ print ("수집 실패 카테고리:" , self .cat_list )
270+ end = time .time ()
271+ print (f"실행 시간: { end - start :.4f} 초" )
272+ print ("\n ===================================\n " )
273+
274+
275+ counts = {}
276+
277+ for d in data :
278+ if d ["subcategory" ].strip ():
279+ counts [d ["subcategory" ]] = counts .get (d ["subcategory" ], 0 ) + 1
280+ elif d ["category" ].strip ():
281+ counts [d ["category" ]] = counts .get (d ["category" ], 0 ) + 1
282+
283+ for k in counts :
284+ counts [k ] = counts [k ] // 10
285+
286+ new_data = []
287+ cat_count = {}
288+
289+ for d in data :
290+ cat = d ["category" ].strip ()
291+ sub = d ["subcategory" ].strip ()
292+ key = sub if sub else cat
293+ if not key :
294+ continue
295+ if counts .get (key , 0 ) > 0 or cat_count .get (cat , 0 ) < 20 :
296+ new_data .append (d )
297+ counts [key ] = counts .get (key , 0 ) - 1
298+ cat_count [cat ] = cat_count .get (cat , 0 ) + 1
299+
300+ filtered_category = {}
301+ filtered_subcategory = {}
302+
303+ for d in new_data :
304+ cat = d ["category" ].strip ()
305+ filtered_category [cat ] = filtered_category .get (cat , 0 ) + 1
306+
307+ print ("\n ===================================\n " )
308+ print ("[필터링 완료]" )
309+ print (f"총 기사 수 (필터링 후): { len (new_data )} " )
310+ print ("필터링 후 카테고리별 기사 수:" )
311+ for cat , cnt in sorted (filtered_category .items (), key = lambda x : (- x [1 ], x [0 ])):
312+ print (f"{ cat :20} { cnt } " )
313+ print ("\n ===================================\n " )
314+
266315 try :
267316 with tempfile .NamedTemporaryFile ("w" , delete = False , encoding = "utf-8" ) as tmp :
268317 for item in data :
@@ -275,12 +324,8 @@ def run(self):
275324 os .remove (tmp_path )
276325
277326 print ("\n ===================================\n " )
278- print ("[완료] 크롤링 완료" )
279- print (f"총 수집된 기사 수: { len (data )} " )
280- print (f"결과가 { self .NEWS_URLS_JSONL } 파일에 저장되었습니다." )
281- print ("수집 실패 카테고리:" , self .cat_list )
282- end = time .time ()
283- print (f"실행 시간: { end - start :.4f} 초" )
327+ print ("[저장 완료]" )
328+ print (f"결과가 { self .NEWS_URLS_JSONL } 파일에 저장됨." )
284329 print ("\n ===================================\n " )
285330
286331
0 commit comments