From e5cb820ab5ca27599c2020f34c8e9c01e532b2ef Mon Sep 17 00:00:00 2001 From: wonjunYou Date: Wed, 13 Nov 2024 14:27:43 +0900 Subject: [PATCH] =?UTF-8?q?[IDLE-000]=20=ED=81=AC=EB=A1=A4=EB=A7=81=20?= =?UTF-8?q?=EC=A4=91=EC=97=90=20=EC=97=90=EB=9F=AC=EA=B0=80=20=EB=B0=9C?= =?UTF-8?q?=EC=83=9D=ED=95=98=EB=8D=94=EB=9D=BC=EB=8F=84,=20=ED=81=AC?= =?UTF-8?q?=EB=A1=A4=EB=A7=81=EB=90=9C=20=EA=B0=92=EC=9D=80=20=EB=AC=B4?= =?UTF-8?q?=EC=A1=B0=EA=B1=B4=20=EB=B0=98=ED=99=98=ED=95=98=EB=8F=84?= =?UTF-8?q?=EB=A1=9D=20=EB=B3=80=EA=B2=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../common/scheduler/CrawlingJobScheduler.kt | 2 +- .../com/swm/idle/batch/util/WorknetCrawler.kt | 105 +++++++++--------- 2 files changed, 55 insertions(+), 52 deletions(-) diff --git a/idle-batch/src/main/kotlin/com/swm/idle/batch/common/scheduler/CrawlingJobScheduler.kt b/idle-batch/src/main/kotlin/com/swm/idle/batch/common/scheduler/CrawlingJobScheduler.kt index fcd15bb4..3f3a66a0 100644 --- a/idle-batch/src/main/kotlin/com/swm/idle/batch/common/scheduler/CrawlingJobScheduler.kt +++ b/idle-batch/src/main/kotlin/com/swm/idle/batch/common/scheduler/CrawlingJobScheduler.kt @@ -13,7 +13,7 @@ class CrawlingJobScheduler( private val crawlingJobConfig: CrawlingJobConfig, ) { - @Scheduled(cron = "0 25 10 * * *") + @Scheduled(cron = "0 40 14 * * *") fun scheduleJob() { val jobParameters: JobParameters = JobParametersBuilder() .addLong("timestamp", System.currentTimeMillis()) diff --git a/idle-batch/src/main/kotlin/com/swm/idle/batch/util/WorknetCrawler.kt b/idle-batch/src/main/kotlin/com/swm/idle/batch/util/WorknetCrawler.kt index 9b967338..ccc1a961 100644 --- a/idle-batch/src/main/kotlin/com/swm/idle/batch/util/WorknetCrawler.kt +++ b/idle-batch/src/main/kotlin/com/swm/idle/batch/util/WorknetCrawler.kt @@ -21,7 +21,7 @@ object WorknetCrawler { private val logger = KotlinLogging.logger { } private const val CRAWLING_TARGET_URL_FORMAT = - 
"https://www.work24.go.kr/wk/a/b/1200/retriveDtlEmpSrchList.do?basicSetupYn=&careerTo=&keywordJobCd=&occupation=&seqNo=&cloDateEndtParam=&payGbn=&templateInfo=&rot2WorkYn=&shsyWorkSecd=&srcKeywordParam=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&resultCnt=50&keywordJobCont=&cert=&moreButtonYn=Y&minPay=&codeDepth2Info=11000&currentPageNo=1&eventNo=&mode=&major=&resrDutyExcYn=&eodwYn=&sortField=DATE&staArea=&sortOrderBy=DESC&keyword=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&termSearchGbn=&carrEssYns=&benefitSrchAndOr=O&disableEmpHopeGbn=&actServExcYn=&keywordStaAreaNm=&maxPay=&emailApplyYn=&codeDepth1Info=11000&keywordEtcYn=&regDateStdtParam=20241104&publDutyExcYn=&keywordJobCdSeqNo=&viewType=&exJobsCd=&templateDepthNmInfo=&region=&employGbn=&empTpGbcd=1&computerPreferential=&infaYn=&cloDateStdtParam=&siteClcd=WORK&searchMode=Y&birthFromYY=&indArea=&careerTypes=&subEmpHopeYn=&tlmgYn=&academicGbn=&templateDepthNoInfo=&foriegn=&entryRoute=&mealOfferClcd=&basicSetupYnChk=&station=&holidayGbn=&srcKeyword=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&academicGbnoEdu=noEdu&enterPriseGbn=all&cloTermSearchGbn=&birthToYY=&keywordWantedTitle=&stationNm=&benefitGbn=&notSrcKeywordParam=&keywordFlag=&notSrcKeyword=&essCertChk=&depth2SelCode=&keywordBusiNm=&preferentialGbn=&rot3WorkYn=&regDateEndtParam=20241112&pfMatterPreferential=&pageIndex=1&termContractMmcnt=&careerFrom=&laborHrShortYn=#scrollLoc" + 
"https://www.work24.go.kr/wk/a/b/1200/retriveDtlEmpSrchList.do?basicSetupYn=&careerTo=&keywordJobCd=&occupation=&seqNo=&cloDateEndtParam=&payGbn=&templateInfo=&rot2WorkYn=&shsyWorkSecd=&srcKeywordParam=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&resultCnt=50&keywordJobCont=&cert=&moreButtonYn=Y&minPay=&codeDepth2Info=11000&currentPageNo=1&eventNo=&mode=&major=&resrDutyExcYn=&eodwYn=&sortField=DATE&staArea=&sortOrderBy=DESC&keyword=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&termSearchGbn=&carrEssYns=&benefitSrchAndOr=O&disableEmpHopeGbn=&actServExcYn=&keywordStaAreaNm=&maxPay=&emailApplyYn=&codeDepth1Info=11000&keywordEtcYn=&regDateStdtParam=20241111&publDutyExcYn=&keywordJobCdSeqNo=&viewType=&exJobsCd=&templateDepthNmInfo=&region=&employGbn=&empTpGbcd=1&computerPreferential=&infaYn=&cloDateStdtParam=&siteClcd=WORK&searchMode=Y&birthFromYY=&indArea=&careerTypes=&subEmpHopeYn=&tlmgYn=&academicGbn=&templateDepthNoInfo=&foriegn=&entryRoute=&mealOfferClcd=&basicSetupYnChk=&station=&holidayGbn=&srcKeyword=%EC%9A%94%EC%96%91+%EB%B3%B4%ED%98%B8%EC%82%AC&academicGbnoEdu=noEdu&enterPriseGbn=all&cloTermSearchGbn=&birthToYY=&keywordWantedTitle=&stationNm=&benefitGbn=&notSrcKeywordParam=&keywordFlag=&notSrcKeyword=&essCertChk=&depth2SelCode=&keywordBusiNm=&preferentialGbn=&rot3WorkYn=&regDateEndtParam=20241112&pfMatterPreferential=&pageIndex=1&termContractMmcnt=&careerFrom=&laborHrShortYn=#scrollLoc" // 
"https://www.work24.go.kr/wk/a/b/1200/retriveDtlEmpSrchList.do?basicSetupYn=&careerTo=&keywordJobCd=&occupation=&seqNo=&cloDateEndtParam=&payGbn=&templateInfo=&rot2WorkYn=&shsyWorkSecd=&srcKeywordParam=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&resultCnt=10&keywordJobCont=&cert=&moreButtonYn=Y&minPay=&codeDepth2Info=11000&currentPageNo=1&eventNo=&mode=&major=&resrDutyExcYn=&eodwYn=&sortField=DATE&staArea=&sortOrderBy=DESC&keyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&termSearchGbn=all&carrEssYns=&benefitSrchAndOr=O&disableEmpHopeGbn=&actServExcYn=&keywordStaAreaNm=&maxPay=&emailApplyYn=&codeDepth1Info=11000&keywordEtcYn=&regDateStdtParam=20241104&publDutyExcYn=&keywordJobCdSeqNo=&viewType=&exJobsCd=&templateDepthNmInfo=&region=&employGbn=&empTpGbcd=&computerPreferential=&infaYn=&cloDateStdtParam=&siteClcd=WORK&searchMode=Y&birthFromYY=&indArea=&careerTypes=&subEmpHopeYn=&tlmgYn=&academicGbn=&templateDepthNoInfo=&foriegn=&entryRoute=&mealOfferClcd=&basicSetupYnChk=&station=&holidayGbn=&srcKeyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&academicGbnoEdu=noEdu&enterPriseGbn=all&cloTermSearchGbn=all&birthToYY=&keywordWantedTitle=&stationNm=&benefitGbn=&notSrcKeywordParam=&keywordFlag=&notSrcKeyword=&essCertChk=&depth2SelCode=&keywordBusiNm=&preferentialGbn=&rot3WorkYn=&regDateEndtParam=20241108&pfMatterPreferential=&pageIndex={pageIndex}&termContractMmcnt=&careerFrom=&laborHrShortYn=#scrollLoc" @@ -60,75 +60,78 @@ object WorknetCrawler { } fun run(): List? 
{ - try { - initializeDriver() - } catch (e: Exception) { - logger.error { e.toString() } - logError("run", e) - } - - logger.info { "=====초기화 완료, 크롤링 작업 시작" } - - val formatter = DateTimeFormatter.ofPattern("yyyyMMdd") - val yesterday = LocalDate.now().format(formatter) - val crawlingUrl = CRAWLING_TARGET_URL_FORMAT - .replace("{yesterday}", yesterday) - .replace("{pageIndex}", "1") - - driver.get(crawlingUrl) + return try { + try { + initializeDriver() + } catch (e: Exception) { + logger.error { e.toString() } + logError("run", e) + } - logger.info { "=====크롤링 url: $crawlingUrl" } + logger.info { "=====초기화 완료, 크롤링 작업 시작" } - val wait = WebDriverWait(driver, Duration.ofSeconds(15)) - wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span"))) + val formatter = DateTimeFormatter.ofPattern("yyyyMMdd") + val yesterday = LocalDate.now().format(formatter) + val crawlingUrl = CRAWLING_TARGET_URL_FORMAT + .replace("{yesterday}", yesterday) + .replace("{pageIndex}", "1") - val jobPostingCountText = - driver.findElement(By.xpath("//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span")).text + driver.get(crawlingUrl) - logger.info { "=====크롤링 대상 공고 수: $jobPostingCountText" } + logger.info { "=====크롤링 url: $crawlingUrl" } - val jobPostingCount = Integer.parseInt(jobPostingCountText.replace(",", "")) + val wait = WebDriverWait(driver, Duration.ofSeconds(15)) + wait.until(ExpectedConditions.visibilityOfElementLocated(By.xpath("//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span"))) - if (jobPostingCount == 0) { - driver.quit() - logger.info { "=====크롤링 할 공고가 없어 미리 종료합니다." 
} - return null - } + val jobPostingCountText = + driver.findElement(By.xpath("//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span")).text - val pageCount = jobPostingCount / JOB_POSTING_COUNT_PER_PAGE + logger.info { "=====크롤링 대상 공고 수: $jobPostingCountText" } - logger.warn { "===== 크롤링 페이지 수 " + pageCount } + val jobPostingCount = Integer.parseInt(jobPostingCountText.replace(",", "")) - for (i in 1..pageCount) { - if (i >= 2) { - val updatedCrawlingUrl = crawlingUrl - .replace("{yesterday}", yesterday) - .replace(Regex("pageIndex=\\d+"), "pageIndex=${i}") - driver.get(updatedCrawlingUrl) + if (jobPostingCount == 0) { + driver.quit() + logger.info { "=====크롤링 할 공고가 없어 미리 종료합니다." } + return emptyList() // 데이터가 없는 경우 빈 리스트 반환 } - wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))) + val pageCount = jobPostingCount / JOB_POSTING_COUNT_PER_PAGE + logger.warn { "===== 크롤링 페이지 수 $pageCount" } - crawlPosts(1, JOB_POSTING_COUNT_PER_PAGE, postings) - } + for (i in 1..pageCount) { + if (i >= 2) { + val updatedCrawlingUrl = crawlingUrl + .replace("{yesterday}", yesterday) + .replace(Regex("pageIndex=\\d+"), "pageIndex=${i}") + driver.get(updatedCrawlingUrl) + } - val lastPageJobPostingCount = jobPostingCount % JOB_POSTING_COUNT_PER_PAGE + wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))) + crawlPosts(1, JOB_POSTING_COUNT_PER_PAGE, postings) + } - if (lastPageJobPostingCount > 0) { - val updateCrawlingUrl = crawlingUrl - .replace("{yesterday}", yesterday) - .replace(Regex("pageIndex=\\d+"), "pageIndex=${pageCount + 1}") - driver.get(updateCrawlingUrl) + val lastPageJobPostingCount = jobPostingCount % JOB_POSTING_COUNT_PER_PAGE + if (lastPageJobPostingCount > 0) { + val updateCrawlingUrl = crawlingUrl + .replace("{yesterday}", yesterday) + .replace(Regex("pageIndex=\\d+"), "pageIndex=${pageCount + 1}") + driver.get(updateCrawlingUrl) - 
wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))) + wait.until(ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))) + crawlPosts(1, lastPageJobPostingCount, postings) + } - crawlPosts(1, lastPageJobPostingCount, postings) + postings // 정상적으로 크롤링이 완료되었을 경우 수집한 데이터를 반환 + } catch (e: Exception) { + logger.error { "Error occurred during crawling, returning collected data so far: ${e.message}" } + postings // 에러가 발생해도 현재까지 수집된 데이터를 반환 + } finally { + driver.quit() } - - driver.quit() - return postings } + private fun logError(method: String, e: Exception) { logger.error(e) { "Error occurred in $method: ${e.message}" } errorCountMap[method] = errorCountMap.getOrDefault(method, 0) + 1