Skip to content

Commit

Permalink
Merge pull request #6 from nenadjakic/feature/merge_final_documents
Browse files Browse the repository at this point in the history
Added merge functionality of final documents.
  • Loading branch information
nenadjakic authored Sep 7, 2024
2 parents 4062bf7 + 755c45e commit a87a5a4
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 16 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ replay_pid*
# Ignore Gradle build output directory
build
/.idea/
/data/
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,6 @@ This application is designed to manage OCR (Optical Character Recognition) tasks
- **User wants to OCR one image containing English text and receive a PDF document as output. User wants to run OCR immediately.**
- **User wants to OCR multiple images. User wants to run OCR immediately.**
- **User wants to OCR a multi-page PDF document and receive a PDF document as output. User wants execution to start at a specified time.**


## License
This project is licensed under the Apache License - see the LICENSE file for details.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package com.github.nenadjakic.ocr.studio.executor
import com.github.nenadjakic.ocr.studio.config.OcrProperties
import com.github.nenadjakic.ocr.studio.entity.OcrConfig
import com.github.nenadjakic.ocr.studio.entity.OutDocument
import com.github.nenadjakic.ocr.studio.entity.Task
import com.github.nenadjakic.ocr.studio.extension.toOcrProgress
import com.github.nenadjakic.ocr.studio.handler.sax.HocrSaxHandler
import com.github.nenadjakic.ocr.studio.repository.TaskRepository
import com.github.nenadjakic.ocr.studio.service.TaskFileSystemService
import net.sourceforge.tess4j.ITesseract
Expand All @@ -13,12 +15,12 @@ import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.ImageType
import org.apache.pdfbox.rendering.PDFRenderer
import org.slf4j.LoggerFactory
import java.io.File
import java.io.IOException
import java.io.*
import java.nio.file.Path
import java.time.ZonedDateTime
import java.util.*
import javax.imageio.ImageIO
import javax.xml.parsers.SAXParserFactory

class OcrExecutor(
override val id: UUID,
Expand All @@ -35,12 +37,13 @@ class OcrExecutor(
logger.info("Started OCR for task: {}. Number of documents: {}", task.id, task.inDocuments.size)
progressInfo.progressInfoStatus = ProgressInfo.ProgressInfoStatus.IN_PROGRESS
progressInfo.totalTasks = task.inDocuments.size
progressInfo.description = "In progress..."
progressInfo.description = "Starting ocr process..."

task.ocrProgress = progressInfo.toOcrProgress()
taskRepository.save(task)
try {
for (document in task.inDocuments) {
progressInfo.description = "Starting ocr of documents..."
for (document in task.inDocuments.sortedBy { it.originalFileName }) {
val inFile =
Path.of(ocrProperties.taskPath, task.id.toString(), "input", document.randomizedFileName).toFile()
if (inFile.exists()) {
Expand All @@ -53,6 +56,7 @@ class OcrExecutor(

val filesToOcr = preProcessDocument(task.ocrConfig.preProcessing, inFile)
if (filesToOcr.size > 1) {
progressInfo.description = "Starting ocr of documents..."
logger.info("Starting ocr of multi paged document.")
PDDocument().use { pdDocument ->
filesToOcr.entries.sortedBy { it.key }.forEach { filesToOcrEntry ->
Expand Down Expand Up @@ -81,7 +85,6 @@ class OcrExecutor(
val index = it.key
val fileToOcr = it.value

val tempOutFile = File.createTempFile("___", ".tmp")
tesseract.createDocuments(
fileToOcr.absolutePath,
outFile.absolutePath,
Expand All @@ -91,11 +94,24 @@ class OcrExecutor(
}
progressInfo.taskDone++
}
task.ocrProgress = progressInfo.toOcrProgress()
taskRepository.save(task)
}
if (task.ocrConfig.mergeDocuments) {
progressInfo.description = "Starting merging of documents..."
val mergedFile =
Path.of(ocrProperties.taskPath, task.id.toString(), "output", "merged_" + UUID.randomUUID() + "." + task.ocrConfig.fileFormat.getExtension())
.toFile()

when (task.ocrConfig.fileFormat) {
OcrConfig.FileFormat.TEXT -> {
mergeTextDocuments(mergedFile, task)
}
OcrConfig.FileFormat.PDF -> {
mergePdfDocuments(mergedFile, task)
}
OcrConfig.FileFormat.HOCR -> {
mergeHocrDocuments(mergedFile, task)
}
}
}
progressInfo.progressInfoStatus = ProgressInfo.ProgressInfoStatus.FINISHED
task.ocrProgress = progressInfo.toOcrProgress()
Expand All @@ -106,6 +122,67 @@ class OcrExecutor(
}
}

/**
 * Concatenates the text output of every input document of [task] into [mergedFile].
 *
 * Documents are processed in ascending order of their original file name. Each source
 * line is copied followed by a line separator, and one additional empty line is written
 * after every document to separate it from the next one.
 *
 * Both reading and writing are done explicitly as UTF-8 so that non-ASCII characters are
 * preserved regardless of the JVM's platform-default charset.
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document output files (located under
 *   `<taskPath>/<taskId>/output`) are merged.
 */
private fun mergeTextDocuments(mergedFile: File, task: Task) {
    mergedFile.bufferedWriter(Charsets.UTF_8).use { writer ->
        for (document in task.inDocuments.sortedBy { it.originalFileName }) {
            // `!!`: merging requires that an output document has been registered for every input document.
            val file = Path.of(ocrProperties.taskPath, task.id.toString(), "output", document.outDocument!!.outputFileName + "." + task.ocrConfig.fileFormat.getExtension()).toFile()
            file.useLines(Charsets.UTF_8) { lines ->
                lines.forEach { line ->
                    writer.write(line)
                    writer.newLine()
                }
            }
            // Blank separator line between consecutive documents.
            writer.newLine()
        }
    }
}

/**
 * Merges the hOCR output of every input document of [task] into a single document
 * written to [mergedFile].
 *
 * Documents are processed in ascending order of their original file name. Each one is
 * parsed with a shared [HocrSaxHandler]; the `<head>` content of the first document is
 * used for the merged file, and the `<body>` content of every document is appended in order.
 * No XML prolog or DOCTYPE declaration is emitted.
 *
 * The file is written explicitly as UTF-8 so the result does not depend on the JVM's
 * platform-default charset, and `<body>` is always opened so the markup stays balanced
 * even when the task has no input documents.
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document output files (located under
 *   `<taskPath>/<taskId>/output`) are merged.
 */
private fun mergeHocrDocuments(mergedFile: File, task: Task) {
    val saxParser = SAXParserFactory.newInstance().newSAXParser()
    val saxHandler = HocrSaxHandler()

    mergedFile.bufferedWriter(Charsets.UTF_8).use { writer ->
        writer.write("<html>")
        writer.newLine()

        var bodyOpened = false
        for (document in task.inDocuments.sortedBy { it.originalFileName }) {
            // `!!`: merging requires that an output document has been registered for every input document.
            val file = Path.of(ocrProperties.taskPath, task.id.toString(), "output", document.outDocument!!.outputFileName + "." + task.ocrConfig.fileFormat.getExtension()).toFile()

            saxParser.parse(file, saxHandler)
            if (!bodyOpened) {
                // Only the first document contributes the <head> section.
                writer.write(" <head>")
                writer.newLine()
                writer.write(saxHandler.head)
                writer.write(" </head>")
                writer.newLine()
                writer.write(" <body>")
                bodyOpened = true
            }
            writer.write(saxHandler.body)
        }
        if (!bodyOpened) {
            // No documents were merged: still open <body> so the closing tag below is balanced.
            writer.write(" <body>")
        }
        writer.newLine()
        writer.write(" </body>")
        writer.newLine()
        writer.write("</html>")
    }
}

/**
 * Merges the PDF output of every input document of [task] into a single PDF written
 * to [mergedFile].
 *
 * Documents are processed in ascending order of their original file name and their pages
 * are appended in page order.
 *
 * Pages added with `addPage` still reference data owned by the document they were loaded
 * from, so every source document is kept open until the merged document has been saved
 * and is closed afterwards (previously the source documents were never closed).
 *
 * @param mergedFile destination file; it is created or overwritten.
 * @param task task whose per-document output files (located under
 *   `<taskPath>/<taskId>/output`) are merged.
 */
private fun mergePdfDocuments(mergedFile: File, task: Task) {
    val sourceDocuments = mutableListOf<PDDocument>()
    try {
        PDDocument().use { pdDocument ->
            for (document in task.inDocuments.sortedBy { it.originalFileName }) {
                // `!!`: merging requires that an output document has been registered for every input document.
                val file = Path.of(ocrProperties.taskPath, task.id.toString(), "output", document.outDocument!!.outputFileName + "." + task.ocrConfig.fileFormat.getExtension()).toFile()
                val outDocument = Loader.loadPDF(file)
                sourceDocuments.add(outDocument)
                outDocument.pages.forEach { page ->
                    pdDocument.addPage(page)
                }
            }
            pdDocument.save(mergedFile)
        }
    } finally {
        // Release the source documents only after the merged document was saved (or merging failed).
        sourceDocuments.forEach { sourceDocument ->
            try {
                sourceDocument.close()
            } catch (ex: IOException) {
                logger.warn("Could not close source PDF document after merging.", ex)
            }
        }
    }
}

private data class InputData (
val fileFormat: OcrConfig.FileFormat,
val file: File
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package com.github.nenadjakic.ocr.studio.handler.sax

import org.xml.sax.Attributes
import org.xml.sax.helpers.DefaultHandler

/**
 * SAX handler that captures the inner markup of the `<head>` and `<body>` elements of a
 * parsed document and exposes it as strings through [head] and [body].
 *
 * The handler can be reused for several parses: [startDocument] clears the captured
 * content and resets the parsing state. Element names are matched case-insensitively.
 *
 * Because the SAX parser delivers character data and attribute values with entities
 * already decoded, markup-significant characters are escaped again when they are
 * re-serialized so the captured fragments remain well-formed.
 */
class HocrSaxHandler : DefaultHandler() {
    /** Section of the document the parser is currently positioned in. */
    enum class InsideElement { HEAD, BODY, OTHER }

    private var insideElement = InsideElement.OTHER

    private val bodyBuilder = StringBuilder()
    private val headBuilder = StringBuilder()

    /** Serialized content found inside `<body>` during the most recent parse. */
    val body: String
        get() = bodyBuilder.toString()

    /** Serialized content found inside `<head>` during the most recent parse. */
    val head: String
        get() = headBuilder.toString()

    /** Discards content captured by a previous parse and resets the section state. */
    override fun startDocument() {
        bodyBuilder.setLength(0)
        headBuilder.setLength(0)
        // A previous parse may have been aborted while inside <head> or <body>.
        insideElement = InsideElement.OTHER
    }

    override fun startElement(uri: String?, localName: String?, qName: String?, attributes: Attributes?) {
        when {
            qName.equals("head", true) -> insideElement = InsideElement.HEAD
            qName.equals("body", true) -> insideElement = InsideElement.BODY
            else -> activeBuilder()?.let { appendStartElement(it, qName, attributes) }
        }
    }

    override fun endElement(uri: String?, localName: String?, qName: String?) {
        if (qName.equals("body", true) || qName.equals("head", true)) {
            insideElement = InsideElement.OTHER
        } else {
            activeBuilder()?.append("</$qName>")
        }
    }

    override fun characters(ch: CharArray?, start: Int, length: Int) {
        if (ch == null) return
        val target = activeBuilder() ?: return
        for (i in start until start + length) {
            when (val c = ch[i]) {
                '&' -> target.append("&amp;")
                '<' -> target.append("&lt;")
                '>' -> target.append("&gt;")
                else -> target.append(c)
            }
        }
    }

    /** Returns the builder for the section currently being read, or `null` outside head/body. */
    private fun activeBuilder(): StringBuilder? = when (insideElement) {
        InsideElement.BODY -> bodyBuilder
        InsideElement.HEAD -> headBuilder
        InsideElement.OTHER -> null
    }

    /**
     * Appends the start tag `<qName attr='value' ...>` to [stringBuilder].
     *
     * Values are wrapped in single quotes unless they contain a single quote themselves,
     * in which case double quotes are used and embedded double quotes are escaped.
     */
    private fun appendStartElement(stringBuilder: StringBuilder, qName: String?, attributes: Attributes?) {
        stringBuilder.append("<$qName")

        if (attributes != null) {
            for (i in 0 until attributes.length) {
                val attributeName = attributes.getQName(i)
                val attributeValue = attributes.getValue(i)
                val quote = if (attributeValue.contains("'")) '"' else '\''
                stringBuilder.append(" $attributeName=")
                    .append(quote)
                    .append(escapeAttributeValue(attributeValue, quote))
                    .append(quote)
            }
        }
        stringBuilder.append(">")
    }

    /** Escapes `&`, `<` and, when [quote] is a double quote, embedded double quotes in [value]. */
    private fun escapeAttributeValue(value: String, quote: Char): String = buildString(value.length) {
        for (c in value) {
            when {
                c == '&' -> append("&amp;")
                c == '<' -> append("&lt;")
                c == '"' && quote == '"' -> append("&quot;")
                else -> append(c)
            }
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package com.github.nenadjakic.ocr.studio.service

import com.github.nenadjakic.ocr.studio.config.OcrProperties
import com.github.nenadjakic.ocr.studio.entity.Document
import com.github.nenadjakic.ocr.studio.entity.OcrConfig
import com.github.nenadjakic.ocr.studio.entity.SchedulerConfig
Expand All @@ -17,8 +16,7 @@ import java.util.*
@Service
class TaskService(
private val taskRepository: TaskRepository,
private val taskFileSystemService: TaskFileSystemService,
private val ocrProperties: OcrProperties
private val taskFileSystemService: TaskFileSystemService
) {

fun findAll(): List<Task> = taskRepository.findAll(Sort.by(Sort.Order.asc("id")))
Expand All @@ -37,7 +35,7 @@ class TaskService(

fun insert(entity: Task, files: Collection<MultipartFile>? = emptyList()): Task {
val createdEntity = insert(entity)
if (files != null && !files.isEmpty()) {
if (!files.isNullOrEmpty()) {
upload(createdEntity.id!!, files)
}
return createdEntity
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ logging.level.org.springframework.data.mongodb.core.MongoTemplate=DEBUG
logging.level.com.github.nenadjakic.ocr.studio=DEBUG


ocr.task-path=
ocr.task-path=./data/tasks
ocr.tesseract.data-path=\\\\wsl.localhost\\Ubuntu-22.04\\usr\\share\\tesseract-ocr\\4.00\\tessdata
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,13 @@ class TaskServiceTest {
private lateinit var taskService: TaskService
private lateinit var taskRepository: TaskRepository
private lateinit var taskFileSystemService: TaskFileSystemService
private lateinit var ocrProperties: OcrProperties

@BeforeEach
fun setUp() {
taskRepository = mock(TaskRepository::class.java)
taskFileSystemService = mock(TaskFileSystemService::class.java)
ocrProperties = mock(OcrProperties::class.java)

taskService = TaskService(taskRepository, taskFileSystemService, ocrProperties)
taskService = TaskService(taskRepository, taskFileSystemService)
}

@AfterEach
Expand Down

0 comments on commit a87a5a4

Please sign in to comment.