Skip to content

Commit

Permalink
[IDLE-531] 배치 작업 정상화 (#260)
Browse files Browse the repository at this point in the history
* [IDLE-534] Flyway 재설정 및 배치 메타데이터 테이블 생성

* [IDLE-548] Tasklet을 Chunk로 변경

* [IDLE-549] 배치 실행 API 추가

* [IDLE-533] GeoCodeService 전환 메서드 static으로 전환

* [IDLE-533] 책임별 클래스 분리

* [IDLE-547] 멀티스레드 적용 및 공유자원 분리

* [IDLE-531] 크롤링 기준 날짜 변경
  • Loading branch information
mjj111 authored Jan 30, 2025
1 parent 1093626 commit ec970c1
Show file tree
Hide file tree
Showing 27 changed files with 851 additions and 553 deletions.
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
package com.swm.idle.batch.common.scheduler
package com.swm.idle.batch.common.launcher

import com.swm.idle.batch.job.CrawlingJobConfig
import com.swm.idle.batch.job.JobConfig
import org.springframework.batch.core.JobParameters
import org.springframework.batch.core.JobParametersBuilder
import org.springframework.batch.core.configuration.JobRegistry
import org.springframework.batch.core.launch.JobLauncher
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Component

@Component
class CrawlingJobScheduler(
class CrawlingJobLauncher(
private val jobLauncher: JobLauncher,
private val crawlingJobConfig: CrawlingJobConfig,
private val jobRegistry: JobRegistry,
private val crawlingJobConfig: JobConfig,
) {

@Scheduled(cron = "0 0 23 * * *")
Expand All @@ -22,4 +24,11 @@ class CrawlingJobScheduler(
jobLauncher.run(crawlingJobConfig.crawlingJob(), jobParameters)
}

/**
 * Runs the job registered under the name "crawlingJob" on demand.
 *
 * The job is looked up through [jobRegistry] and handed to [jobLauncher]
 * together with a single "timestamp" parameter holding the current epoch
 * milliseconds (presumably so that every launch carries distinct parameters —
 * confirm against the batch configuration).
 */
fun jobStart() {
    val launchParameters = JobParametersBuilder().run {
        addLong("timestamp", System.currentTimeMillis())
        toJobParameters()
    }
    val crawlingJob = jobRegistry.getJob("crawlingJob")

    jobLauncher.run(crawlingJob, launchParameters)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package com.swm.idle.batch.crawler

/**
 * String constants used by the crawler: the listing URL template, XPath
 * locators for the fields read from a posting, and Chrome command-line switches.
 *
 * @property location label identifying the constant; the post crawler uses it
 *   as the key when counting extraction failures. It mirrors the enum entry name.
 * @property value the payload itself (URL template, XPath expression, number
 *   rendered as text, or Chrome switch).
 */
enum class CrawlerConsts(val location: String, val value: String) {
    // Listing URL template; the `{yesterday}` and `{pageIndex}` placeholders are substituted by the caller.
    CRAWLING_TARGET_URL_FORMAT("CRAWLING_TARGET_URL_FORMAT","https://www.work24.go.kr/wk/a/b/1200/retriveDtlEmpSrchList.do?basicSetupYn=&careerTo=&keywordJobCd=&occupation=&seqNo=&cloDateEndtParam=&payGbn=&templateInfo=&rot2WorkYn=&shsyWorkSecd=&srcKeywordParam=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&resultCnt=50&keywordJobCont=&cert=&moreButtonYn=Y&minPay=&codeDepth2Info=11000&currentPageNo=1&eventNo=&mode=&major=&resrDutyExcYn=&eodwYn=&sortField=DATE&staArea=&sortOrderBy=DESC&keyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&termSearchGbn=all&carrEssYns=&benefitSrchAndOr=O&disableEmpHopeGbn=&actServExcYn=&keywordStaAreaNm=&maxPay=&emailApplyYn=&codeDepth1Info=11000&keywordEtcYn=&regDateStdtParam={yesterday}&publDutyExcYn=&keywordJobCdSeqNo=&viewType=&exJobsCd=&templateDepthNmInfo=&region=&employGbn=&empTpGbcd=&computerPreferential=&infaYn=&cloDateStdtParam=&siteClcd=WORK&searchMode=Y&birthFromYY=&indArea=&careerTypes=&subEmpHopeYn=&tlmgYn=&academicGbn=&templateDepthNoInfo=&foriegn=&entryRoute=&mealOfferClcd=&basicSetupYnChk=&station=&holidayGbn=&srcKeyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&academicGbnoEdu=noEdu&enterPriseGbn=all&cloTermSearchGbn=all&birthToYY=&keywordWantedTitle=&stationNm=&benefitGbn=&notSrcKeywordParam=&keywordFlag=&notSrcKeyword=&essCertChk=&depth2SelCode=&keywordBusiNm=&preferentialGbn=&rot3WorkYn=&regDateEndtParam={yesterday}&pfMatterPreferential=&pageIndex={pageIndex}&termContractMmcnt=&careerFrom=&laborHrShortYn=#scrollLoc"),
    // Numeric value stored as text; read it through getIntValue().
    JOB_POSTING_COUNT_PER_PAGE("JOB_POSTING_COUNT_PER_PAGE","50"),
    JOB_POSTING_COUNT("JOB_POSTING_COUNT","//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span"),

    // Posting info
    TITLE("TITLE", "//*[@id=\"contents\"]/div/div/div/div[1]/div[3]/div[1]/div[1]/strong"),
    CONTENT("CONTENT", "//*[@id=\"tab-panel01\"]/div[1]/div"),

    // Work info
    PAY_INFO("PAY_INFO", "//*[@id=\"tab-panel02\"]/div/table/tbody/tr[1]/td[2]"),
    WORK_TIME("WORK_TIME","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[2]/td"),
    WORK_SCHEDULE("WORK_SCHEDULE","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[3]/td[2]"),

    // Recruitment info
    // NOTE(review): RECRUITMENT_PROCESS and APPLY_METHOD use the same XPath — confirm this is intended.
    RECRUITMENT_PROCESS("RECRUITMENT_PROCESS","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[1]"),
    REQUIRED_DOCUMENT("REQUIRED_DOCUMENT","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[2]"),
    APPLY_METHOD("APPLY_METHOD","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[1]"),
    APPLY_DEADLINE("APPLY_DEADLINE","//*[@id=\"tab-panel05\"]/div[2]/div/div[1]/div[1]/p"),
    CREATED_AT("CREATED_AT","//*[@id=\"contents\"]/div/div/div/div[1]/div[5]/div[11]/div[2]/table/tbody/tr[1]/td[1]"),

    // Center info
    // NOTE(review): every CENTER_ADDRESS* and CLIENT_ADDRESS* entry currently holds the same XPath — confirm.
    CENTER_NAME("CENTER_NAME","//*[@id=\"contents\"]/div/div/div/div[1]/div[3]/div[1]/div[1]/p/strong"),
    CENTER_ADDRESS1("CENTER_ADDRESS1","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    // The label previously read "CENTER_ADDRESs2", which produced a stray key in failure statistics.
    CENTER_ADDRESS2("CENTER_ADDRESS2","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    CENTER_ADDRESS3("CENTER_ADDRESS3","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),

    // Client (elderly person) address
    CLIENT_ADDRESS1("CLIENT_ADDRESS1","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    CLIENT_ADDRESS2("CLIENT_ADDRESS2","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),

    // ChromeDriver options
    HEADLESS("HEADLESS","--headless"),
    NO_SANDBOX("NO_SANDBOX","--no-sandbox"),
    DISABLE_DEV_SHM_USAGE("DISABLE_DEV_SHM_USAGE","--disable-dev-shm-usage"),
    DISABLE_GPU("DISABLE_GPU","--disable-gpu"),
    // NOTE(review): unlike the other switches this one has no leading "--" — confirm Chrome accepts it.
    WINDOW_SIZE("WINDOW_SIZE","window-size=1920x1080"),
    DISABLE_SOFTWARE_RASTERIZER("DISABLE_SOFTWARE_RASTERIZER","--disable-software-rasterizer"),
    IGNORE_SSL_ERRORS("IGNORE_SSL_ERRORS","--ignore-ssl-errors=yes"),
    IGNORE_CERTIFICATE_ERRORS("IGNORE_CERTIFICATE_ERRORS","--ignore-certificate-errors");

    /**
     * Parses [value] as a decimal integer.
     *
     * @throws NumberFormatException when [value] is not numeric (true for most entries).
     */
    fun getIntValue(): Int = value.toInt()

    companion object {
        /** Returns the Chrome command-line switches, in declaration order, as a fresh array. */
        fun getChromOptions(): Array<String> = arrayOf(
            HEADLESS.value,
            NO_SANDBOX.value,
            DISABLE_DEV_SHM_USAGE.value,
            DISABLE_GPU.value,
            WINDOW_SIZE.value,
            DISABLE_SOFTWARE_RASTERIZER.value,
            IGNORE_SSL_ERRORS.value,
            IGNORE_CERTIFICATE_ERRORS.value,
        )
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package com.swm.idle.batch.crawler

import io.github.oshai.kotlinlogging.KotlinLogging
import org.openqa.selenium.chrome.ChromeDriver
import org.openqa.selenium.chrome.ChromeDriverService
import org.openqa.selenium.chrome.ChromeOptions
import java.io.File

/**
 * Factory for [ChromeDriver] instances configured from the environment.
 *
 * The chromedriver executable is read from the `CHROMEDRIVER_BIN` environment
 * variable and the Chrome binary from `CHROME_BIN`; command-line switches come
 * from [CrawlerConsts.getChromOptions].
 */
object DriverInitializer {
    private val logger = KotlinLogging.logger { }

    private const val CHROMEDRIVER_BIN_ENV = "CHROMEDRIVER_BIN"
    private const val CHROME_BIN_ENV = "CHROME_BIN"

    /**
     * Creates a new [ChromeDriver]. Every call starts a new driver; the caller
     * owns it and is responsible for quitting it.
     *
     * @throws RuntimeException when either environment variable is missing or
     *   the driver cannot be started; the underlying failure is attached as the cause.
     */
    fun init(): ChromeDriver {
        return runCatching {
            // Resolve both paths up front so a missing variable fails with a clear
            // message instead of an opaque null failure inside File()/setBinary().
            val driverPath = requireEnv(CHROMEDRIVER_BIN_ENV)
            val chromePath = requireEnv(CHROME_BIN_ENV)

            val service = ChromeDriverService.Builder()
                .usingDriverExecutable(File(driverPath))
                .build()
                .also { logger.info { driverPath } }
            val options = ChromeOptions().apply {
                addArguments(*CrawlerConsts.getChromOptions())
                setBinary(chromePath)
            }.also { logger.info { chromePath } }

            ChromeDriver(service, options)
        }.getOrElse { cause ->
            logger.error { "ChromeDriver initialization failed: ${cause.message}" }
            // Throw so that no subsequent code runs; keep the root cause for diagnosis.
            throw RuntimeException("ChromeDriver initialization failed, application will exit.", cause)
        }
    }

    /** Returns the value of environment variable [name], failing when it is unset or blank. */
    private fun requireEnv(name: String): String =
        System.getenv(name)?.takeIf { it.isNotBlank() }
            ?: throw IllegalStateException("Environment variable $name is not set")
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package com.swm.idle.batch.crawler

import com.swm.idle.batch.step.PostingReader
import org.openqa.selenium.By
import org.openqa.selenium.WebDriver
import org.openqa.selenium.support.ui.ExpectedConditions
import org.openqa.selenium.support.ui.WebDriverWait
import java.time.Duration
import java.time.LocalDate
import java.time.format.DateTimeFormatter

/**
 * Opens the first listing page and records paging information on a [PostingReader].
 *
 * A WebDriver is started when the instance is constructed and is quit at the
 * end of [initCounts], so an instance supports a single [initCounts] call.
 */
class WorknetPageCrawler {
    private val driver: WebDriver = DriverInitializer.init()

    /**
     * Fills in `crawlingUrl`, `postingCount`, `pageCount` and
     * `lastPageJobPostingCount` on [reader].
     *
     * The driver is always quit before this method returns or throws.
     *
     * @param reader target whose paging fields are populated.
     * @throws Exception when the page reports no postings, and whatever the
     *   driver throws when the page or the counter element cannot be loaded.
     */
    fun initCounts(reader: PostingReader) {
        try {
            // The placeholder is named {yesterday}, but today's date is what gets substituted.
            reader.crawlingUrl = CrawlerConsts.CRAWLING_TARGET_URL_FORMAT.value
                .replace("{yesterday}", LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd")))
                .replace("{pageIndex}", "1")

            moveToPage(reader)

            reader.postingCount = readPostingCount()
                .takeIf { it > 0 }
                ?: throw Exception("크롤링 할 공고가 없습니다.")

            val postsPerPage = CrawlerConsts.JOB_POSTING_COUNT_PER_PAGE.getIntValue()
            // Ceiling division: a partially filled last page still counts as a page.
            reader.pageCount = (reader.postingCount + postsPerPage - 1) / postsPerPage
            // NOTE(review): this is 0 when postingCount is an exact multiple of the page
            // size, although the last page is then full — confirm how the reader treats 0.
            reader.lastPageJobPostingCount = reader.postingCount % postsPerPage
        } finally {
            // Release the browser on every path, including navigation/parsing failures.
            driver.quit()
        }
    }

    /** Reads the total posting counter from the current page, tolerating a thousands separator. */
    private fun readPostingCount(): Int =
        driver.findElement(By.xpath(CrawlerConsts.JOB_POSTING_COUNT.value))
            .text
            .trim()
            .replace(",", "")
            .toInt()

    /** Navigates to the reader's URL and waits up to 10 seconds for the posting counter to be visible. */
    private fun moveToPage(reader: PostingReader) {
        driver.get(reader.crawlingUrl)
        WebDriverWait(driver, Duration.ofSeconds(10))
            .until(ExpectedConditions.visibilityOfElementLocated(By.xpath(CrawlerConsts.JOB_POSTING_COUNT.value)))
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package com.swm.idle.batch.crawler

import com.swm.idle.batch.common.dto.CrawledJobPostingDto
import io.github.oshai.kotlinlogging.KotlinLogging
import org.openqa.selenium.By
import org.openqa.selenium.WebDriver
import org.openqa.selenium.support.ui.ExpectedConditions
import org.openqa.selenium.support.ui.WebDriverWait
import java.time.Duration
import java.time.LocalDate
import java.time.format.DateTimeFormatter
import org.openqa.selenium.WebElement

/**
 * Crawls the postings of one listing page by opening each posting's detail
 * window and reading its fields through the XPaths in [CrawlerConsts].
 *
 * A WebDriver is started when the instance is constructed and is quit at the
 * end of [crawlPosts], so an instance supports a single [crawlPosts] call.
 */
class WorknetPostCrawler {
    private val logger = KotlinLogging.logger { }
    private val driver: WebDriver = DriverInitializer.init()

    // Number of failed extractions per CrawlerConsts.location label.
    private val errorCountMap: MutableMap<String, Int> = mutableMapOf()

    /**
     * Crawls rows `list1`..`list<end>` of the listing page at [url].
     *
     * Rows whose title link is missing, whose detail window cannot be opened,
     * or whose fields cannot be extracted are skipped. Failure counts per field
     * are printed to standard output once all rows were processed. The driver
     * is always quit before this method returns or throws.
     *
     * @param end number of rows to visit on the page.
     * @param url listing page URL.
     * @return the postings that were extracted successfully, in row order.
     */
    fun crawlPosts(end: Int, url: String): List<CrawledJobPostingDto> {
        try {
            moveToPage(url)

            val crawledPostings = (1..end).mapNotNull { rowNumber -> crawlPost(rowNumber) }
            errorCountMap.forEach { (key, value) -> println("$key -> $value") }
            return crawledPostings
        } finally {
            // Release the browser even when loading the listing page fails.
            driver.quit()
        }
    }

    /** Opens the detail window of row [rowNumber] (1-based), extracts it, and returns to the listing. */
    private fun crawlPost(rowNumber: Int): CrawledJobPostingDto? {
        val originalWindow = driver.windowHandle
        val titleElement = findElementSafe(By.xpath("//*[@id=\"list${rowNumber}\"]/td[1]/div/div[2]/a"))
            ?: return null

        return try {
            moveToPostDetailWindow(titleElement, originalWindow)
            createPost()
        } catch (e: Exception) {
            logger.warn { "실패: ${e.message}" }
            null
        } finally {
            backWindow(originalWindow)
        }
    }

    /** Loads [url] and waits up to 10 seconds for the first listing row to be visible. */
    private fun moveToPage(url: String) {
        driver.get(url)
        WebDriverWait(driver, Duration.ofSeconds(10))
            .until(
                ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))
            )
    }

    /** Builds a DTO from the detail window the driver is currently focused on. */
    private fun createPost(): CrawledJobPostingDto {
        return CrawledJobPostingDto(
            title = extractText(CrawlerConsts.TITLE),
            content = extractText(CrawlerConsts.CONTENT),
            createdAt = extractText(CrawlerConsts.CREATED_AT),
            payInfo = extractText(CrawlerConsts.PAY_INFO),
            workSchedule = extractText(CrawlerConsts.WORK_SCHEDULE),
            recruitmentProcess = extractText(CrawlerConsts.RECRUITMENT_PROCESS),
            applyMethod = extractText(CrawlerConsts.APPLY_METHOD),
            requiredDocument = extractText(CrawlerConsts.REQUIRED_DOCUMENT),
            centerName = extractText(CrawlerConsts.CENTER_NAME),
            applyDeadline = extractApplyDeadline(CrawlerConsts.APPLY_DEADLINE),
            workTime = extractWorkTime(CrawlerConsts.WORK_TIME),
            // Each address uses its own locator set so failures are counted under the right label
            // (the two sets were previously swapped).
            centerAddress = extractAddress(
                CrawlerConsts.CENTER_ADDRESS1,
                CrawlerConsts.CENTER_ADDRESS2,
                CrawlerConsts.CENTER_ADDRESS3,
            ),
            clientAddress = extractAddress(
                CrawlerConsts.CLIENT_ADDRESS1,
                CrawlerConsts.CLIENT_ADDRESS2,
            ),
            directUrl = driver.currentUrl
        )
    }

    /** Runs [action]; on failure counts it under [location] and rethrows the original error. */
    private inline fun <T> errorRecord(location: String, action: () -> T): T {
        return runCatching { action() }
            .getOrElse { e ->
                logError(location)
                throw e
            }
    }

    /** Looks up an element, returning `null` instead of throwing when the lookup fails. */
    private fun findElementSafe(by: By): WebElement? {
        return runCatching { driver.findElement(by) }.getOrNull()
    }

    /** Clicks the title link, waits up to 10 seconds for a second window, and switches to it. */
    private fun moveToPostDetailWindow(titleElement: WebElement, originalWindow: String) {
        titleElement.click()
        WebDriverWait(driver, Duration.ofSeconds(10))
            .until(ExpectedConditions.numberOfWindowsToBe(2))
        driver.switchTo().window(driver.windowHandles.first { it != originalWindow })
    }

    /** Reads the text of the element located by [con]. */
    private fun extractText(con: CrawlerConsts): String {
        return errorRecord(con.location) { driver.findElement(By.xpath(con.value)).text }
    }

    /**
     * Reads the application deadline. Text containing "채용시까지" is replaced
     * by the date 15 days from today in `yyyyMMdd` form; any other text is
     * returned unchanged.
     */
    private fun extractApplyDeadline(con: CrawlerConsts): String {
        return errorRecord(con.location) {
            driver.findElement(By.xpath(con.value)).text.let {
                if (it.contains("채용시까지"))
                    LocalDate.now().plusDays(15).format(DateTimeFormatter.ofPattern("yyyyMMdd"))
                else
                    it
            }
        }
    }

    /**
     * Returns the address from the first locator in [cons] that resolves, with
     * the "지도보기" label and a parenthesised five-digit postal code removed.
     *
     * Each failing locator is counted and the next one is tried.
     *
     * @throws NoSuchElementException when none of the locators resolves.
     */
    private fun extractAddress(vararg cons: CrawlerConsts): String {
        for (con in cons) {
            val rawAddress = runCatching { driver.findElement(By.xpath(con.value)).text }
                .onFailure { logError(con.location) }
                .getOrNull()
                ?: continue // fall through to the next candidate locator

            return rawAddress.replace("지도보기", "").trim().replace(POSTAL_CODE_REGEX, "").trim()
        }
        throw NoSuchElementException("Center address not found using any of the provided XPaths")
    }

    /** Reads the working-time text with the help label, the "(근무시간)" marker and line breaks removed. */
    private fun extractWorkTime(con: CrawlerConsts): String {
        return errorRecord(con.location) {
            driver.findElement(By.xpath(con.value)).text
                .replace("도움말", "")
                .replace("(근무시간)", "")
                .replace("\n", "")
        }
    }

    /** Increments the failure counter for [location]. */
    private fun logError(location: String) {
        errorCountMap[location] = errorCountMap.getOrDefault(location, 0) + 1
    }

    /**
     * Closes every window except [originalWindow] and focuses [originalWindow] again.
     *
     * Closing by handle (rather than closing the focused window) keeps the
     * listing window alive when the detail window never opened.
     */
    private fun backWindow(originalWindow: String?) {
        driver.windowHandles
            .filter { it != originalWindow }
            .forEach { handle ->
                driver.switchTo().window(handle)
                driver.close()
            }
        driver.switchTo().window(originalWindow)
    }

    private companion object {
        // Matches a parenthesised five-digit postal code such as "(12345)".
        val POSTAL_CODE_REGEX = Regex("\\(\\d{5}\\)")
    }
}

This file was deleted.

Loading

0 comments on commit ec970c1

Please sign in to comment.