Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added merge functionality of final documents. #6

Merged
merged 2 commits into from
Sep 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ replay_pid*
# Ignore Gradle build output directory
build
/.idea/
/data/
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ This application is designed to manage OCR (Optical Character Recognition) tasks
- **User wants to OCR one image containing some English text and wants a PDF document as output. User wants to run OCR immediately.**
- **User wants to OCR multiple images. User wants to run OCR immediately.**
- **User wants to OCR a multipage PDF document and wants a PDF document as output. User wants to start execution at a specified time.**


## License
This project is licensed under the Apache License - see the LICENSE file for details.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package com.github.nenadjakic.ocr.studio.executor
import com.github.nenadjakic.ocr.studio.config.OcrProperties
import com.github.nenadjakic.ocr.studio.entity.OcrConfig
import com.github.nenadjakic.ocr.studio.entity.OutDocument
import com.github.nenadjakic.ocr.studio.entity.Task
import com.github.nenadjakic.ocr.studio.extension.toOcrProgress
import com.github.nenadjakic.ocr.studio.handler.sax.HocrSaxHandler
import com.github.nenadjakic.ocr.studio.repository.TaskRepository
import com.github.nenadjakic.ocr.studio.service.TaskFileSystemService
import net.sourceforge.tess4j.ITesseract
Expand All @@ -13,12 +15,12 @@ import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.ImageType
import org.apache.pdfbox.rendering.PDFRenderer
import org.slf4j.LoggerFactory
import java.io.File
import java.io.IOException
import java.io.*
import java.nio.file.Path
import java.time.ZonedDateTime
import java.util.*
import javax.imageio.ImageIO
import javax.xml.parsers.SAXParserFactory

class OcrExecutor(
override val id: UUID,
Expand All @@ -35,12 +37,13 @@ class OcrExecutor(
logger.info("Started OCR for task: {}. Number of documents: {}", task.id, task.inDocuments.size)
progressInfo.progressInfoStatus = ProgressInfo.ProgressInfoStatus.IN_PROGRESS
progressInfo.totalTasks = task.inDocuments.size
progressInfo.description = "In progress..."
progressInfo.description = "Starting ocr process..."

task.ocrProgress = progressInfo.toOcrProgress()
taskRepository.save(task)
try {
for (document in task.inDocuments) {
progressInfo.description = "Starting ocr of documents..."
for (document in task.inDocuments.sortedBy { it.originalFileName }) {
val inFile =
Path.of(ocrProperties.taskPath, task.id.toString(), "input", document.randomizedFileName).toFile()
if (inFile.exists()) {
Expand All @@ -53,6 +56,7 @@ class OcrExecutor(

val filesToOcr = preProcessDocument(task.ocrConfig.preProcessing, inFile)
if (filesToOcr.size > 1) {
progressInfo.description = "Starting ocr of documents..."
logger.info("Starting ocr of multi paged document.")
PDDocument().use { pdDocument ->
filesToOcr.entries.sortedBy { it.key }.forEach { filesToOcrEntry ->
Expand Down Expand Up @@ -81,7 +85,6 @@ class OcrExecutor(
val index = it.key
val fileToOcr = it.value

val tempOutFile = File.createTempFile("___", ".tmp")
tesseract.createDocuments(
fileToOcr.absolutePath,
outFile.absolutePath,
Expand All @@ -91,11 +94,24 @@ class OcrExecutor(
}
progressInfo.taskDone++
}
task.ocrProgress = progressInfo.toOcrProgress()
taskRepository.save(task)
}
if (task.ocrConfig.mergeDocuments) {
progressInfo.description = "Starting merging of documents..."
val mergedFile =
Path.of(ocrProperties.taskPath, task.id.toString(), "output", "merged_" + UUID.randomUUID() + "." + task.ocrConfig.fileFormat.getExtension())
.toFile()

when (task.ocrConfig.fileFormat) {
OcrConfig.FileFormat.TEXT -> {
mergeTextDocuments(mergedFile, task)
}
OcrConfig.FileFormat.PDF -> {
mergePdfDocuments(mergedFile, task)
}
OcrConfig.FileFormat.HOCR -> {
mergeHocrDocuments(mergedFile, task)
}
}
}
progressInfo.progressInfoStatus = ProgressInfo.ProgressInfoStatus.FINISHED
task.ocrProgress = progressInfo.toOcrProgress()
Expand All @@ -106,6 +122,67 @@ class OcrExecutor(
}
}

/**
 * Concatenates the text output of every input document of [task] into [mergedFile].
 *
 * Documents are processed in ascending order of their original file name. Each
 * document's lines are copied one by one, and an additional empty line is written
 * after every document as a separator.
 *
 * Files are read and written explicitly as UTF-8 instead of the platform default
 * charset, so the merged result does not depend on the host locale.
 * NOTE(review): assumes the per-document OCR text files are UTF-8 encoded - confirm.
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document output files are merged.
 * @throws IllegalStateException if an input document has no output document assigned.
 * @throws IOException if an output file cannot be read or [mergedFile] cannot be written.
 */
private fun mergeTextDocuments(mergedFile: File, task: Task) {
    val extension = task.ocrConfig.fileFormat.getExtension()

    mergedFile.bufferedWriter(Charsets.UTF_8).use { writer ->
        for (document in task.inDocuments.sortedBy { it.originalFileName }) {
            val outDocument = checkNotNull(document.outDocument) {
                "Document '${document.originalFileName}' has no output document to merge."
            }
            val file = Path.of(
                ocrProperties.taskPath,
                task.id.toString(),
                "output",
                outDocument.outputFileName + "." + extension
            ).toFile()

            // useLines closes the reader even when writing fails half-way through.
            file.bufferedReader(Charsets.UTF_8).useLines { lines ->
                lines.forEach { line ->
                    writer.write(line)
                    writer.newLine()
                }
            }
            // Blank separator line between consecutive documents.
            writer.newLine()
        }
    }
}

/**
 * Merges the hOCR output of every input document of [task] into a single [mergedFile].
 *
 * Each per-document file is parsed with [HocrSaxHandler]. The `<head>` content of the
 * first document (ascending order of original file name) becomes the head of the merged
 * file; the `<body>` content of every document is appended in order. No XML declaration
 * or DOCTYPE is written to the merged file.
 *
 * The merged file is written as UTF-8 rather than in the platform default charset.
 * When the task has no input documents, an empty but well-formed
 * `<html><head/><body/></html>` skeleton is produced.
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document hOCR files are merged.
 * @throws IllegalStateException if an input document has no output document assigned.
 * @throws IOException if a file cannot be read or written.
 * @throws org.xml.sax.SAXException if a per-document file is not well-formed XML.
 */
private fun mergeHocrDocuments(mergedFile: File, task: Task) {
    val saxParserFactory = SAXParserFactory.newInstance()
    try {
        // Do not download the external DTD named in the source DOCTYPE: parsing must not
        // depend on network access. Best effort - parsers that do not know this
        // Xerces feature keep their default behaviour.
        saxParserFactory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false)
    } catch (ex: javax.xml.parsers.ParserConfigurationException) {
        logger.warn("XML parser does not allow disabling of external DTD loading.", ex)
    } catch (ex: org.xml.sax.SAXException) {
        logger.warn("XML parser does not allow disabling of external DTD loading.", ex)
    }
    val saxParser = saxParserFactory.newSAXParser()
    val saxHandler = HocrSaxHandler()
    val extension = task.ocrConfig.fileFormat.getExtension()

    mergedFile.bufferedWriter(Charsets.UTF_8).use { writer ->
        writer.write("<html>")
        writer.newLine()

        var bodyOpened = false
        for (document in task.inDocuments.sortedBy { it.originalFileName }) {
            val outDocument = checkNotNull(document.outDocument) {
                "Document '${document.originalFileName}' has no output document to merge."
            }
            val file = Path.of(
                ocrProperties.taskPath,
                task.id.toString(),
                "output",
                outDocument.outputFileName + "." + extension
            ).toFile()

            // The handler resets its buffers at the start of every parse.
            saxParser.parse(file, saxHandler)
            if (!bodyOpened) {
                // Only the first document contributes the <head> section.
                writer.write(" <head>")
                writer.newLine()
                writer.write(saxHandler.head)
                writer.write(" </head>")
                writer.newLine()
                writer.write(" <body>")
                bodyOpened = true
            }
            writer.write(saxHandler.body)
        }

        if (!bodyOpened) {
            // No documents at all: still open the elements that are closed below.
            writer.write(" <head>")
            writer.newLine()
            writer.write(" </head>")
            writer.newLine()
            writer.write(" <body>")
        }
        writer.newLine()
        writer.write(" </body>")
        writer.newLine()
        writer.write("</html>")
    }
}

/**
 * Merges the PDF output of every input document of [task] into a single [mergedFile].
 *
 * Documents are appended in ascending order of their original file name; all pages of
 * each per-document PDF are added to the merged document.
 *
 * Every source PDF opened here is closed again once the merged document has been
 * saved. The sources are deliberately kept open until then because the added pages
 * still refer to data of their source document (see the PDFBox `PDDocument`
 * documentation on adding pages from another document).
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document PDF files are merged.
 * @throws IllegalStateException if an input document has no output document assigned.
 * @throws IOException if a PDF cannot be loaded or [mergedFile] cannot be written.
 */
private fun mergePdfDocuments(mergedFile: File, task: Task) {
    val extension = task.ocrConfig.fileFormat.getExtension()
    val openedSources = mutableListOf<PDDocument>()

    try {
        PDDocument().use { pdDocument ->
            for (document in task.inDocuments.sortedBy { it.originalFileName }) {
                val outDocument = checkNotNull(document.outDocument) {
                    "Document '${document.originalFileName}' has no output document to merge."
                }
                val file = Path.of(
                    ocrProperties.taskPath,
                    task.id.toString(),
                    "output",
                    outDocument.outputFileName + "." + extension
                ).toFile()

                // Register the source immediately so it is closed even if a later step fails.
                val sourceDocument = Loader.loadPDF(file).also { openedSources += it }
                sourceDocument.pages.forEach { page ->
                    pdDocument.addPage(page)
                }
            }
            pdDocument.save(mergedFile)
        }
    } finally {
        // Close all sources; a failure to close one must not prevent closing the others.
        openedSources.forEach { sourceDocument ->
            try {
                sourceDocument.close()
            } catch (ex: IOException) {
                logger.warn("Could not close source PDF document after merge.", ex)
            }
        }
    }
}

private data class InputData (
val fileFormat: OcrConfig.FileFormat,
val file: File
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package com.github.nenadjakic.ocr.studio.handler.sax

import org.xml.sax.Attributes
import org.xml.sax.helpers.DefaultHandler

/**
 * SAX handler that collects the inner markup of the `<head>` and `<body>` elements
 * of a parsed document as strings.
 *
 * The `head` and `body` tags themselves are not part of the collected markup; every
 * element nested inside them is re-serialized with its attributes and text. Text and
 * attribute values are escaped again on output, so markup characters in the source
 * (`&amp;`, `&lt;`, ...) stay valid in the collected strings.
 *
 * The handler can be reused: every [startDocument] discards the previous result.
 * Before the first parse [head] and [body] are empty strings.
 */
class HocrSaxHandler : DefaultHandler() {
    /** Section of the document the parser is currently in. */
    enum class InsideElement { HEAD, BODY, OTHER }

    private var insideElement = InsideElement.OTHER

    private val bodyBuilder = StringBuilder()
    private val headBuilder = StringBuilder()

    /** Markup collected between `<body>` and `</body>` during the last parse. */
    val body: String
        get() = bodyBuilder.toString()

    /** Markup collected between `<head>` and `</head>` during the last parse. */
    val head: String
        get() = headBuilder.toString()

    /** Resets all state so that results of a previous (possibly failed) parse do not leak. */
    override fun startDocument() {
        insideElement = InsideElement.OTHER
        bodyBuilder.setLength(0)
        headBuilder.setLength(0)
    }

    override fun startElement(uri: String?, localName: String?, qName: String?, attributes: Attributes?) {
        when {
            qName.equals("head", ignoreCase = true) -> insideElement = InsideElement.HEAD
            qName.equals("body", ignoreCase = true) -> insideElement = InsideElement.BODY
            else -> activeBuilder()?.let { appendStartElement(it, qName, attributes) }
        }
    }

    override fun endElement(uri: String?, localName: String?, qName: String?) {
        if (qName.equals("body", ignoreCase = true) || qName.equals("head", ignoreCase = true)) {
            insideElement = InsideElement.OTHER
        } else {
            activeBuilder()?.append("</$qName>")
        }
    }

    override fun characters(ch: CharArray?, start: Int, length: Int) {
        val target = activeBuilder() ?: return
        if (ch == null) return
        // The parser delivers decoded text; escape it again to keep the output well-formed.
        appendEscaped(target, String(ch, start, length), null)
    }

    /** Returns the buffer of the section currently being read, or `null` outside head/body. */
    private fun activeBuilder(): StringBuilder? = when (insideElement) {
        InsideElement.HEAD -> headBuilder
        InsideElement.BODY -> bodyBuilder
        InsideElement.OTHER -> null
    }

    /** Appends the start tag `<qName attr='value' ...>` to [stringBuilder]. */
    private fun appendStartElement(stringBuilder: StringBuilder, qName: String?, attributes: Attributes?) {
        stringBuilder.append("<$qName")

        if (attributes != null) {
            for (i in 0 until attributes.length) {
                val attributeName = attributes.getQName(i)
                val attributeValue = attributes.getValue(i)
                // Single quotes by default; double quotes when the value itself contains
                // a single quote. The chosen quote character is escaped inside the value.
                val quote = if (attributeValue.contains('\'')) '"' else '\''
                stringBuilder.append(' ').append(attributeName).append('=').append(quote)
                appendEscaped(stringBuilder, attributeValue, quote)
                stringBuilder.append(quote)
            }
        }
        stringBuilder.append('>')
    }

    /**
     * Appends [text] to [target] with XML markup characters escaped.
     *
     * @param quote quote character delimiting an attribute value, or `null` for element text.
     */
    private fun appendEscaped(target: StringBuilder, text: String, quote: Char?) {
        for (c in text) {
            when {
                c == '&' -> target.append("&amp;")
                c == '<' -> target.append("&lt;")
                c == '>' && quote == null -> target.append("&gt;")
                c == '"' && quote == '"' -> target.append("&quot;")
                else -> target.append(c)
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.github.nenadjakic.ocr.studio.service

import com.github.nenadjakic.ocr.studio.config.OcrProperties
import com.github.nenadjakic.ocr.studio.entity.Document
import com.github.nenadjakic.ocr.studio.entity.OcrConfig
import com.github.nenadjakic.ocr.studio.entity.SchedulerConfig
Expand All @@ -17,8 +16,7 @@ import java.util.*
@Service
class TaskService(
private val taskRepository: TaskRepository,
private val taskFileSystemService: TaskFileSystemService,
private val ocrProperties: OcrProperties
private val taskFileSystemService: TaskFileSystemService
) {

fun findAll(): List<Task> = taskRepository.findAll(Sort.by(Sort.Order.asc("id")))
Expand All @@ -37,7 +35,7 @@ class TaskService(

fun insert(entity: Task, files: Collection<MultipartFile>? = emptyList()): Task {
val createdEntity = insert(entity)
if (files != null && !files.isEmpty()) {
if (!files.isNullOrEmpty()) {
upload(createdEntity.id!!, files)
}
return createdEntity
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ logging.level.org.springframework.data.mongodb.core.MongoTemplate=DEBUG
logging.level.com.github.nenadjakic.ocr.studio=DEBUG


ocr.task-path=
ocr.task-path=./data/tasks
ocr.tesseract.data-path=\\\\wsl.localhost\\Ubuntu-22.04\\usr\\share\\tesseract-ocr\\4.00\\tessdata
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,13 @@ class TaskServiceTest {
private lateinit var taskService: TaskService
private lateinit var taskRepository: TaskRepository
private lateinit var taskFileSystemService: TaskFileSystemService
private lateinit var ocrProperties: OcrProperties

@BeforeEach
fun setUp() {
taskRepository = mock(TaskRepository::class.java)
taskFileSystemService = mock(TaskFileSystemService::class.java)
ocrProperties = mock(OcrProperties::class.java)

taskService = TaskService(taskRepository, taskFileSystemService, ocrProperties)
taskService = TaskService(taskRepository, taskFileSystemService)
}

@AfterEach
Expand Down