-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [IDLE-534] Flyway 재설정 및 배치 메타데이터 테이블 생성 * [IDLE-548] Tasklet을 Chunk로 변경 * [IDLE-549] 배치 실행 API 추가 * [IDLE-533] GeoCodeService 전환 메서드 static으로 전환 * [IDLE-533] 책임별 클래스 분리 * [IDLE-547] 멀티스레드 적용 및 공유자원 분리 * [IDLE-531] 크롤링 기준 날짜 변경
- Loading branch information
Showing
27 changed files
with
851 additions
and
553 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
62 changes: 62 additions & 0 deletions
62
idle-batch/src/main/kotlin/com/swm/idle/batch/crawler/CrawlerConsts.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
package com.swm.idle.batch.crawler | ||
|
||
/**
 * Constants used by the Work24 job-posting crawler: the search URL template,
 * XPath selectors for the list and detail pages, and ChromeDriver arguments.
 *
 * @property location label identifying the constant; the post crawler uses it
 *   as the key when counting extraction failures, so it must match the entry name.
 * @property value the payload: a URL template, an XPath expression, a numeric
 *   string, or a Chrome command-line argument, depending on the entry.
 */
enum class CrawlerConsts(val location: String, val value: String) {
    // Search URL template. `{yesterday}` and `{pageIndex}` are placeholders that
    // callers substitute with String.replace before navigating.
    CRAWLING_TARGET_URL_FORMAT("CRAWLING_TARGET_URL_FORMAT","https://www.work24.go.kr/wk/a/b/1200/retriveDtlEmpSrchList.do?basicSetupYn=&careerTo=&keywordJobCd=&occupation=&seqNo=&cloDateEndtParam=&payGbn=&templateInfo=&rot2WorkYn=&shsyWorkSecd=&srcKeywordParam=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&resultCnt=50&keywordJobCont=&cert=&moreButtonYn=Y&minPay=&codeDepth2Info=11000&currentPageNo=1&eventNo=&mode=&major=&resrDutyExcYn=&eodwYn=&sortField=DATE&staArea=&sortOrderBy=DESC&keyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&termSearchGbn=all&carrEssYns=&benefitSrchAndOr=O&disableEmpHopeGbn=&actServExcYn=&keywordStaAreaNm=&maxPay=&emailApplyYn=&codeDepth1Info=11000&keywordEtcYn=&regDateStdtParam={yesterday}&publDutyExcYn=&keywordJobCdSeqNo=&viewType=&exJobsCd=&templateDepthNmInfo=&region=&employGbn=&empTpGbcd=&computerPreferential=&infaYn=&cloDateStdtParam=&siteClcd=WORK&searchMode=Y&birthFromYY=&indArea=&careerTypes=&subEmpHopeYn=&tlmgYn=&academicGbn=&templateDepthNoInfo=&foriegn=&entryRoute=&mealOfferClcd=&basicSetupYnChk=&station=&holidayGbn=&srcKeyword=%EC%9A%94%EC%96%91%EB%B3%B4%ED%98%B8%EC%82%AC&academicGbnoEdu=noEdu&enterPriseGbn=all&cloTermSearchGbn=all&birthToYY=&keywordWantedTitle=&stationNm=&benefitGbn=&notSrcKeywordParam=&keywordFlag=&notSrcKeyword=&essCertChk=&depth2SelCode=&keywordBusiNm=&preferentialGbn=&rot3WorkYn=&regDateEndtParam={yesterday}&pfMatterPreferential=&pageIndex={pageIndex}&termContractMmcnt=&careerFrom=&laborHrShortYn=#scrollLoc"),
    // Must stay numeric: it is read through getIntValue().
    JOB_POSTING_COUNT_PER_PAGE("JOB_POSTING_COUNT_PER_PAGE","50"),
    JOB_POSTING_COUNT("JOB_POSTING_COUNT","//*[@id=\"mForm\"]/div[2]/div/div[1]/div[1]/span/span"),

    // Posting information
    TITLE("TITLE", "//*[@id=\"contents\"]/div/div/div/div[1]/div[3]/div[1]/div[1]/strong"),
    CONTENT("CONTENT", "//*[@id=\"tab-panel01\"]/div[1]/div"),

    // Work information
    PAY_INFO("PAY_INFO", "//*[@id=\"tab-panel02\"]/div/table/tbody/tr[1]/td[2]"),
    WORK_TIME("WORK_TIME","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[2]/td"),
    WORK_SCHEDULE("WORK_SCHEDULE","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[3]/td[2]"),

    // Recruitment information
    // NOTE(review): RECRUITMENT_PROCESS and APPLY_METHOD share the same XPath — confirm this is intended.
    RECRUITMENT_PROCESS("RECRUITMENT_PROCESS","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[1]"),
    REQUIRED_DOCUMENT("REQUIRED_DOCUMENT","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[2]"),
    APPLY_METHOD("APPLY_METHOD","//*[@id=\"tab-panel05\"]/div[2]/div/div[2]/p[1]"),
    APPLY_DEADLINE("APPLY_DEADLINE","//*[@id=\"tab-panel05\"]/div[2]/div/div[1]/div[1]/p"),
    CREATED_AT("CREATED_AT","//*[@id=\"contents\"]/div/div/div/div[1]/div[5]/div[11]/div[2]/table/tbody/tr[1]/td[1]"),

    // Center information
    // NOTE(review): all CENTER_ADDRESS* and CLIENT_ADDRESS* entries currently hold the
    // same XPath, so they are not yet real alternatives to one another.
    CENTER_NAME("CENTER_NAME","//*[@id=\"contents\"]/div/div/div/div[1]/div[3]/div[1]/div[1]/p/strong"),
    CENTER_ADDRESS1("CENTER_ADDRESS1","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    CENTER_ADDRESS2("CENTER_ADDRESS2","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    CENTER_ADDRESS3("CENTER_ADDRESS3","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),

    // Elderly client address
    CLIENT_ADDRESS1("CLIENT_ADDRESS1","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),
    CLIENT_ADDRESS2("CLIENT_ADDRESS2","//*[@id=\"tab-panel02\"]/div/table/tbody/tr[5]/td/div[1]/p"),

    // ChromeDriver options
    HEADLESS("HEADLESS","--headless"),
    NO_SANDBOX("NO_SANDBOX","--no-sandbox"),
    DISABLE_DEV_SHM_USAGE("DISABLE_DEV_SHM_USAGE","--disable-dev-shm-usage"),
    DISABLE_GPU("DISABLE_GPU","--disable-gpu"),
    WINDOW_SIZE("WINDOW_SIZE","window-size=1920x1080"),
    DISABLE_SOFTWARE_RASTERIZER("DISABLE_SOFTWARE_RASTERIZER","--disable-software-rasterizer"),
    IGNORE_SSL_ERRORS("IGNORE_SSL_ERRORS","--ignore-ssl-errors=yes"),
    IGNORE_CERTIFICATE_ERRORS("IGNORE_CERTIFICATE_ERRORS","--ignore-certificate-errors");

    /**
     * Parses [value] as an integer.
     *
     * @throws NumberFormatException if [value] is not a valid integer, which is
     *   the case for every entry whose value is a URL, XPath or Chrome argument.
     */
    fun getIntValue(): Int = value.toInt()

    companion object {
        /**
         * Returns the Chrome command-line arguments in the order they are declared.
         * A fresh array is created on every call.
         */
        fun getChromOptions(): Array<String> = arrayOf(
            HEADLESS.value,
            NO_SANDBOX.value,
            DISABLE_DEV_SHM_USAGE.value,
            DISABLE_GPU.value,
            WINDOW_SIZE.value,
            DISABLE_SOFTWARE_RASTERIZER.value,
            IGNORE_SSL_ERRORS.value,
            IGNORE_CERTIFICATE_ERRORS.value,
        )
    }
}
29 changes: 29 additions & 0 deletions
29
idle-batch/src/main/kotlin/com/swm/idle/batch/crawler/DriverInitializer.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package com.swm.idle.batch.crawler | ||
|
||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import org.openqa.selenium.chrome.ChromeDriver | ||
import org.openqa.selenium.chrome.ChromeDriverService | ||
import org.openqa.selenium.chrome.ChromeOptions | ||
import java.io.File | ||
|
||
/**
 * Creates [ChromeDriver] instances configured from the environment.
 *
 * Reads the ChromeDriver executable path from `CHROMEDRIVER_BIN` and the Chrome
 * binary path from `CHROME_BIN`, and applies the arguments returned by
 * [CrawlerConsts.getChromOptions].
 */
object DriverInitializer {
    private const val CHROMEDRIVER_BIN_ENV = "CHROMEDRIVER_BIN"
    private const val CHROME_BIN_ENV = "CHROME_BIN"

    private val logger = KotlinLogging.logger { }

    /**
     * Starts a new Chrome session. The caller owns the returned driver and is
     * responsible for calling `quit()` on it.
     *
     * @return a started [ChromeDriver].
     * @throws RuntimeException if a required environment variable is missing or
     *   the driver cannot be started; the underlying failure is attached as the cause.
     */
    fun init(): ChromeDriver {
        return runCatching {
            // Resolve both paths up front so a missing variable is reported by name
            // instead of surfacing as a NullPointerException from File(null).
            val driverPath = requiredEnv(CHROMEDRIVER_BIN_ENV)
            val chromePath = requiredEnv(CHROME_BIN_ENV)
            logger.info { driverPath }
            logger.info { chromePath }

            val service = ChromeDriverService.Builder()
                .usingDriverExecutable(File(driverPath))
                .build()
            val options = ChromeOptions().apply {
                addArguments(*CrawlerConsts.getChromOptions())
                setBinary(chromePath)
            }
            ChromeDriver(service, options)
        }.getOrElse { cause ->
            logger.error(cause) { "ChromeDriver initialization failed: ${cause.message}" }
            // Rethrow so no later code runs against a missing driver; keep the cause for diagnosis.
            throw RuntimeException("ChromeDriver initialization failed, application will exit.", cause)
        }
    }

    /** Returns the value of environment variable [name], failing with a descriptive message if unset. */
    private fun requiredEnv(name: String): String =
        System.getenv(name) ?: error("Environment variable $name is not set")
}
44 changes: 44 additions & 0 deletions
44
idle-batch/src/main/kotlin/com/swm/idle/batch/crawler/WorknetPageCrawler.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
package com.swm.idle.batch.crawler | ||
|
||
import com.swm.idle.batch.step.PostingReader | ||
import org.openqa.selenium.By | ||
import org.openqa.selenium.WebDriver | ||
import org.openqa.selenium.support.ui.ExpectedConditions | ||
import org.openqa.selenium.support.ui.WebDriverWait | ||
import java.time.Duration | ||
import java.time.LocalDate | ||
import java.time.format.DateTimeFormatter | ||
|
||
/**
 * Opens the first page of the search result list and derives paging counters
 * from the total posting count shown on that page.
 *
 * Each instance starts its own browser session on construction and shuts it
 * down at the end of [initCounts], so an instance can be used only once.
 */
class WorknetPageCrawler {
    private val driver: WebDriver = DriverInitializer.init()

    /**
     * Populates [reader] with the crawl URL, total posting count, page count and
     * the number of postings on the last page.
     *
     * The browser session is always closed before this method returns or throws.
     *
     * @param reader the reader whose `crawlingUrl`, `postingCount`, `pageCount`
     *   and `lastPageJobPostingCount` are assigned.
     * @throws Exception if the page reports no postings; failures while loading
     *   the page or parsing the count propagate as thrown.
     */
    fun initCounts(reader: PostingReader) {
        try {
            // NOTE(review): the `{yesterday}` placeholder is filled with today's date here.
            reader.crawlingUrl = CrawlerConsts.CRAWLING_TARGET_URL_FORMAT.value
                .replace("{yesterday}", LocalDate.now().format(DateTimeFormatter.ofPattern("yyyyMMdd")))
                .replace("{pageIndex}", "1")

            moveToPage(reader)

            val postingsPerPage = CrawlerConsts.JOB_POSTING_COUNT_PER_PAGE.getIntValue()

            reader.postingCount = driver
                .findElement(By.xpath(CrawlerConsts.JOB_POSTING_COUNT.value))
                .text
                // Tolerate thousands separators and surrounding whitespace in the displayed count.
                .replace(",", "")
                .trim()
                .toInt()
                .takeIf { it > 0 }
                ?: throw Exception("크롤링 할 공고가 없습니다.")

            // Ceiling division: a partially filled last page still counts as a page.
            reader.pageCount = (reader.postingCount + postingsPerPage - 1) / postingsPerPage
            // NOTE(review): this is 0 when postingCount is an exact multiple of the page
            // size — confirm PostingReader treats 0 as a full last page.
            reader.lastPageJobPostingCount = reader.postingCount % postingsPerPage
        } finally {
            // Release the browser on every path, including timeouts and parse failures.
            driver.quit()
        }
    }

    /** Navigates to the reader's crawl URL and waits up to 10 seconds for the posting count element to be visible. */
    private fun moveToPage(reader: PostingReader) {
        driver.get(reader.crawlingUrl)
        WebDriverWait(driver, Duration.ofSeconds(10))
            .until(ExpectedConditions.visibilityOfElementLocated(By.xpath(CrawlerConsts.JOB_POSTING_COUNT.value)))
    }
}
142 changes: 142 additions & 0 deletions
142
idle-batch/src/main/kotlin/com/swm/idle/batch/crawler/WorknetPostCrawler.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
package com.swm.idle.batch.crawler | ||
|
||
import com.swm.idle.batch.common.dto.CrawledJobPostingDto | ||
import io.github.oshai.kotlinlogging.KotlinLogging | ||
import org.openqa.selenium.By | ||
import org.openqa.selenium.WebDriver | ||
import org.openqa.selenium.support.ui.ExpectedConditions | ||
import org.openqa.selenium.support.ui.WebDriverWait | ||
import java.time.Duration | ||
import java.time.LocalDate | ||
import java.time.format.DateTimeFormatter | ||
import org.openqa.selenium.WebElement | ||
|
||
/**
 * Crawls the detail pages of the postings listed on one search result page.
 *
 * Each instance starts its own browser session on construction and shuts it
 * down at the end of [crawlPosts], so an instance can be used only once.
 */
class WorknetPostCrawler {
    private val logger = KotlinLogging.logger { }
    private val driver: WebDriver = DriverInitializer.init()

    // Number of extraction failures per CrawlerConsts.location, reported after the crawl.
    private val errorCountMap: MutableMap<String, Int> = mutableMapOf()

    /**
     * Opens [url] and visits the first [end] rows of the result list, collecting
     * one DTO per posting whose fields could all be extracted.
     *
     * Rows whose title link cannot be found are skipped. Postings whose field
     * extraction fails are logged and skipped. The browser session is always
     * closed before this method returns or throws.
     *
     * @param end number of list rows (`list1` .. `list{end}`) to visit.
     * @param url the search result page to load.
     * @return the successfully crawled postings, in list order.
     */
    fun crawlPosts(end: Int, url: String): List<CrawledJobPostingDto> {
        val crawledPostings = mutableListOf<CrawledJobPostingDto>()
        try {
            moveToPage(url)

            repeat(end) { i ->
                val originalWindow = driver.windowHandle
                val titleElement = findElementSafe(By.xpath("//*[@id=\"list${i+1}\"]/td[1]/div/div[2]/a")) ?: return@repeat

                moveToPostDetailWindow(titleElement, originalWindow)

                try {
                    crawledPostings.add(createPost())
                } catch (e: Exception) {
                    // Keep going with the next posting, but record why this one failed.
                    logger.warn(e) { "실패" }
                }

                backWindow(originalWindow)
            }
            errorCountMap.forEach { (key, value) -> logger.info { "$key -> $value" } }
        } finally {
            // Release the browser on every path, including wait timeouts.
            driver.quit()
        }
        return crawledPostings
    }

    /** Navigates to [url] and waits until the first list row (`#list1`) is visible. */
    private fun moveToPage(url: String) {
        driver.get(url)
        WebDriverWait(driver, WAIT_TIMEOUT)
            .until(
                ExpectedConditions.visibilityOfElementLocated(By.cssSelector("#list1"))
            )
    }

    /**
     * Builds a DTO from the detail page currently focused by the driver.
     * Any field that cannot be extracted makes the whole call throw.
     */
    private fun createPost(): CrawledJobPostingDto {
        return CrawledJobPostingDto(
            title = extractText(CrawlerConsts.TITLE),
            content = extractText(CrawlerConsts.CONTENT),
            createdAt = extractText(CrawlerConsts.CREATED_AT),
            payInfo = extractText(CrawlerConsts.PAY_INFO),
            workSchedule = extractText(CrawlerConsts.WORK_SCHEDULE),
            recruitmentProcess = extractText(CrawlerConsts.RECRUITMENT_PROCESS),
            applyMethod = extractText(CrawlerConsts.APPLY_METHOD),
            requiredDocument = extractText(CrawlerConsts.REQUIRED_DOCUMENT),
            centerName = extractText(CrawlerConsts.CENTER_NAME),
            applyDeadline = extractApplyDeadline(CrawlerConsts.APPLY_DEADLINE),
            workTime = extractWorkTime(CrawlerConsts.WORK_TIME),
            // Each address field uses its own constants so failures are counted under the right key.
            centerAddress = extractAddress(
                CrawlerConsts.CENTER_ADDRESS1,
                CrawlerConsts.CENTER_ADDRESS2,
                CrawlerConsts.CENTER_ADDRESS3
            ),
            clientAddress = extractAddress(
                CrawlerConsts.CLIENT_ADDRESS1,
                CrawlerConsts.CLIENT_ADDRESS2
            ),
            directUrl = driver.currentUrl
        )
    }

    /** Runs [action]; on any failure, counts it under [location] and rethrows the original throwable. */
    private inline fun <T> errorRecord(location: String, action: () -> T): T {
        return try {
            action()
        } catch (e: Throwable) {
            logError(location)
            throw e
        }
    }

    /** Returns the element matching [by], or `null` if the lookup fails for any reason. */
    private fun findElementSafe(by: By): WebElement? {
        return runCatching { driver.findElement(by) }.getOrNull()
    }

    /** Clicks [titleElement], waits for a second window to open, and switches the driver to it. */
    private fun moveToPostDetailWindow(titleElement: WebElement, originalWindow: String) {
        titleElement.click()
        WebDriverWait(driver, WAIT_TIMEOUT)
            .until(ExpectedConditions.numberOfWindowsToBe(2))
        driver.switchTo().window(driver.windowHandles.first { it != originalWindow })
    }

    /** Returns the text of the element located by [con]'s XPath. */
    private fun extractText(con: CrawlerConsts): String {
        return errorRecord(con.location) { driver.findElement(By.xpath(con.value)).text }
    }

    /**
     * Returns the deadline text, except that a text containing "채용시까지" is
     * replaced by the date 15 days from today in `yyyyMMdd` format.
     */
    private fun extractApplyDeadline(con: CrawlerConsts): String {
        return errorRecord(con.location) {
            val deadlineText = driver.findElement(By.xpath(con.value)).text
            if (deadlineText.contains("채용시까지")) {
                LocalDate.now().plusDays(15).format(DateTimeFormatter.ofPattern("yyyyMMdd"))
            } else {
                deadlineText
            }
        }
    }

    /**
     * Returns the cleaned address from the first of [cons] whose XPath resolves.
     * The "지도보기" label and a parenthesised 5-digit code are stripped.
     *
     * Every failed candidate is counted under its own location before the next
     * one is tried.
     *
     * @throws Exception the failure of the last candidate when none resolves, or
     *   [NoSuchElementException] when [cons] is empty.
     */
    private fun extractAddress(vararg cons: CrawlerConsts): String {
        var lastFailure: Exception? = null
        for (con in cons) {
            try {
                val address = driver.findElement(By.xpath(con.value)).text
                return address.replace("지도보기", "").trim().replace(POSTAL_CODE_PATTERN, "").trim()
            } catch (e: Exception) {
                logError(con.location)
                lastFailure = e
            }
        }
        throw lastFailure ?: NoSuchElementException("Center address not found using any of the provided XPaths")
    }

    /** Returns the work-time text with the "도움말" and "(근무시간)" labels and all newlines removed. */
    private fun extractWorkTime(con: CrawlerConsts): String {
        return errorRecord(con.location) {
            driver.findElement(By.xpath(con.value)).text
                .replace("도움말", "")
                .replace("(근무시간)", "")
                .replace("\n", "")
        }
    }

    /** Increments the failure counter for [location]. */
    private fun logError(location: String) {
        errorCountMap[location] = errorCountMap.getOrDefault(location, 0) + 1
    }

    /** Closes the currently focused window and switches back to [originalWindow]. */
    private fun backWindow(originalWindow: String) {
        driver.close()
        driver.switchTo().window(originalWindow)
    }

    private companion object {
        // Upper bound for explicit waits on page load and on the detail window opening.
        val WAIT_TIMEOUT: Duration = Duration.ofSeconds(10)

        // Compiled once; matches a parenthesised 5-digit code such as "(12345)".
        val POSTAL_CODE_PATTERN = Regex("\\(\\d{5}\\)")
    }
}
34 changes: 0 additions & 34 deletions
34
idle-batch/src/main/kotlin/com/swm/idle/batch/job/CrawlingJobConfig.kt
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.