diff --git a/js/export/export.js b/js/export/export.js index 2279c58..20268aa 100644 --- a/js/export/export.js +++ b/js/export/export.js @@ -9,6 +9,7 @@ import { writePdf } from './pdf/writePdf.js'; import { writeHocr } from './writeHocr.js'; import { writeText } from './writeText.js'; import { writeHtml } from './writeHtml.js'; +import { writeAlto } from './writeAlto.js'; import { removeCircularRefsOcr } from '../objects/ocrObjects.js'; import { removeCircularRefsDataTables } from '../objects/layoutObjects.js'; import { FontCont } from '../containers/fontContainer.js'; @@ -16,7 +17,7 @@ import { FontCont } from '../containers/fontContainer.js'; /** * Export active OCR data to specified format. * @public - * @param {'pdf'|'hocr'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt'] + * @param {'pdf'|'hocr'|'alto'|'docx'|'html'|'xlsx'|'txt'|'text'|'scribe'} [format='txt'] * @param {number} [minPage=0] - First page to export. * @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page. * @returns {Promise} @@ -218,6 +219,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { } } else if (format === 'hocr') { content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage }); + } else if (format === 'alto') { + content = writeAlto({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage }); } else if (format === 'html') { const images = /** @type {Array} */ ([]); if (opt.includeImages) { @@ -291,14 +294,15 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { /** * Runs `exportData` and saves the result as a download (browser) or local file (Node.js). * @public - * @param {'pdf'|'hocr'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format + * @param {'pdf'|'hocr'|'alto'|'docx'|'xlsx'|'txt'|'text'|'html'|'scribe'} format * @param {string} fileName * @param {number} [minPage=0] - First page to export. 
* @param {number} [maxPage=-1] - Last page to export (inclusive). -1 exports through the last page. */ export async function download(format, fileName, minPage = 0, maxPage = -1) { if (format === 'text') format = 'txt'; - fileName = fileName.replace(/\.\w{1,6}$/, `.${format}`); + const ext = format === 'alto' ? 'xml' : format; + fileName = fileName.replace(/\.\w{1,6}$/, `.${ext}`); const content = await exportData(format, minPage, maxPage); await saveAs(content, fileName); } diff --git a/js/export/writeAlto.js b/js/export/writeAlto.js new file mode 100644 index 0000000..4533922 --- /dev/null +++ b/js/export/writeAlto.js @@ -0,0 +1,313 @@ +import { opt } from '../containers/app.js'; +import { pageMetricsAll } from '../containers/dataContainer.js'; +import ocr from '../objects/ocrObjects.js'; + +/** + * Converts Tesseract language codes to ISO 639-2 codes for ALTO XML + * @param {string} tesseractLang + */ +function tesseractToISO6392(tesseractLang) { + const langMap = { + eng: 'en-US', + fra: 'fr-FR', + deu: 'de-DE', + spa: 'es-ES', + ita: 'it-IT', + por: 'pt-PT', + nld: 'nl-NL', + rus: 'ru-RU', + pol: 'pl-PL', + ces: 'cs-CZ', + slk: 'sk-SK', + ukr: 'uk-UA', + hun: 'hu-HU', + ron: 'ro-RO', + hrv: 'hr-HR', + srp: 'sr-RS', + bul: 'bg-BG', + slv: 'sl-SI', + cat: 'ca-ES', + dan: 'da-DK', + fin: 'fi-FI', + nor: 'no-NO', + swe: 'sv-SE', + tur: 'tr-TR', + ell: 'el-GR', + ara: 'ar-SA', + heb: 'he-IL', + hin: 'hi-IN', + jpn: 'ja-JP', + kor: 'ko-KR', + chi_sim: 'zh-CN', + chi_tra: 'zh-TW', + tha: 'th-TH', + vie: 'vi-VN', + }; + return langMap[tesseractLang] || tesseractLang; +} + +/** + * Exports OCR data to ALTO XML format (v2.0) + * @param {Object} params + * @param {Array} params.ocrData - OCR data to export + * @param {number} [params.minValue] - First page to export (inclusive) + * @param {number} [params.maxValue] - Last page to export (inclusive) + * @returns {string} ALTO XML formatted string + */ +export function writeAlto({ ocrData, minValue, maxValue }) { + if 
(minValue === null || minValue === undefined) minValue = 0; + if (maxValue === null || maxValue === undefined || maxValue < 0) maxValue = ocrData.length - 1; + + const stylesMap = new Map(); + let styleIdCounter = 0; + + /** + * Get or create a style ID for a given font family and size + * @param {string} fontFamily + * @param {number} fontSize + * @returns {string} + */ + function getStyleId(fontFamily, fontSize) { + const key = `${fontFamily || 'Default'}_${fontSize || 10}`; + if (!stylesMap.has(key)) { + const styleId = `font${styleIdCounter++}`; + stylesMap.set(key, { id: styleId, fontFamily: fontFamily || 'Default', fontSize: fontSize || 10 }); + } + return stylesMap.get(key).id; + } + + for (let i = minValue; i <= maxValue; i++) { + const pageObj = ocrData[i]; + if (!pageObj) continue; + + for (const lineObj of pageObj.lines) { + for (const wordObj of lineObj.words) { + if (wordObj.style.font || wordObj.style.size) { + getStyleId(wordObj.style.font, wordObj.style.size); + } + } + } + } + + let altoOut = '\n'; + altoOut += '${today}`; + altoOut += ''; + altoOut += 'scribeocr'; + altoOut += 'scribe.js'; + altoOut += ''; + altoOut += '\n'; + altoOut += '\n'; + + if (stylesMap.size > 0) { + altoOut += ''; + for (const [, style] of stylesMap) { + altoOut += ``; + } + altoOut += '\n\n'; + } + + altoOut += '\n'; + + for (let pageIndex = minValue; pageIndex <= maxValue; pageIndex++) { + const pageObj = ocrData[pageIndex]; + + let pageHeight = 0; + let pageWidth = 0; + if (pageObj) { + pageHeight = pageObj.dims.height; + pageWidth = pageObj.dims.width; + } else if (pageMetricsAll[pageIndex]) { + pageHeight = pageMetricsAll[pageIndex].dims.height; + pageWidth = pageMetricsAll[pageIndex].dims.width; + } + + altoOut += `\n`; + + if (!pageObj || pageObj.lines.length === 0) { + altoOut += '\n'; + continue; + } + + altoOut += `\n`; + + let parCurrent = null; + let blockIndex = 0; + let blockStyleRef = null; + let blockLang = null; + + for (let lineIndex = 0; lineIndex < 
pageObj.lines.length; lineIndex++) { + const lineObj = pageObj.lines[lineIndex]; + + if (lineObj.words.length === 0) continue; + + if (blockIndex === 0 || lineObj.par !== parCurrent) { + if (blockIndex > 0) { + altoOut += '\n'; + } + + parCurrent = lineObj.par; + + let blockLeft = Math.round(lineObj.bbox.left); + let blockTop = Math.round(lineObj.bbox.top); + let blockRight = Math.round(lineObj.bbox.right); + let blockBottom = Math.round(lineObj.bbox.bottom); + + const blockStyleCounts = new Map(); + const blockLangCounts = new Map(); + for (let j = lineIndex; j < pageObj.lines.length; j++) { + const nextLine = pageObj.lines[j]; + if (nextLine.words.length === 0) continue; + if (j > lineIndex && nextLine.par !== parCurrent) break; + + if (j > lineIndex) { + blockLeft = Math.min(blockLeft, Math.round(nextLine.bbox.left)); + blockTop = Math.min(blockTop, Math.round(nextLine.bbox.top)); + blockRight = Math.max(blockRight, Math.round(nextLine.bbox.right)); + blockBottom = Math.max(blockBottom, Math.round(nextLine.bbox.bottom)); + } + + for (const word of nextLine.words) { + if (word.style.font || word.style.size) { + const styleId = getStyleId(word.style.font || '', word.style.size || 0); + blockStyleCounts.set(styleId, (blockStyleCounts.get(styleId) || 0) + 1); + } + if (word.lang) { + blockLangCounts.set(word.lang, (blockLangCounts.get(word.lang) || 0) + 1); + } + } + } + + blockStyleRef = null; + let maxCount = 0; + for (const [styleId, count] of blockStyleCounts) { + if (count > maxCount) { + maxCount = count; + blockStyleRef = styleId; + } + } + + blockLang = null; + let maxLangCount = 0; + for (const [lang, count] of blockLangCounts) { + if (count > maxLangCount) { + maxLangCount = count; + blockLang = lang; + } + } + + const blockWidth = blockRight - blockLeft; + const blockHeight = blockBottom - blockTop; + + altoOut += ``; + + for (let wordIndex = 0; wordIndex < lineObj.words.length; wordIndex++) { + const wordObj = lineObj.words[wordIndex]; + + const wordLeft 
= Math.round(wordObj.bbox.left); + const wordTop = Math.round(wordObj.bbox.top); + const wordRight = Math.round(wordObj.bbox.right); + const wordBottom = Math.round(wordObj.bbox.bottom); + const wordWidth = wordRight - wordLeft; + const wordHeight = wordBottom - wordTop; + + let styleAttr = ''; + const styleAttrs = []; + if (wordObj.style.bold) styleAttrs.push('bold'); + if (wordObj.style.italic) styleAttrs.push('italic'); + if (wordObj.style.underline) styleAttrs.push('underline'); + if (wordObj.style.sup) styleAttrs.push('superscript'); + if (wordObj.style.smallCaps) styleAttrs.push('smallCaps'); + + if (styleAttrs.length > 0) { + styleAttr = ` STYLE="${styleAttrs.join(' ')}"`; + } + + let styleRefsAttr = ''; + if (wordObj.style.font || wordObj.style.size) { + const styleId = getStyleId(wordObj.style.font || '', wordObj.style.size || 0); + // Only add STYLEREFS if it differs from the block-level style + if (styleId !== blockStyleRef) { + styleRefsAttr = ` STYLEREFS="${styleId}"`; + } + } + + let langAttr = ''; + if (wordObj.lang) { + // Only add language if it differs from the block-level language + if (wordObj.lang !== blockLang) { + langAttr = ` language="${tesseractToISO6392(wordObj.lang)}"`; + } + } + + let wcAttr = ''; + if (wordObj.conf !== undefined && wordObj.conf !== null) { + const confNormalized = wordObj.conf / 100; + wcAttr = ` WC="${confNormalized.toFixed(2)}"`; + } + + altoOut += ``; + + // The ALTO XML format uses explicit SP elements to denote spaces between words. + // While this seems redundant if we understand each element to represent a word, + // it is encouraged by Library of Congress standards. 
+ // "The use of SP and HYP are encouraged" + // https://www.loc.gov/ndnp/guidelines/NDNP_202628TechNotes.pdf + if (wordIndex < lineObj.words.length - 1) { + const nextWord = lineObj.words[wordIndex + 1]; + const spaceWidth = Math.round(nextWord.bbox.left) - wordRight - 2; + if (spaceWidth > 0) { + altoOut += ``; + } + } + } + + altoOut += '\n'; + } + + altoOut += '\n'; + + altoOut += '\n'; + altoOut += '\n'; + + opt.progressHandler({ n: pageIndex, type: 'export', info: {} }); + } + + altoOut += '\n'; + altoOut += '\n'; + + return altoOut; +} diff --git a/js/export/writeDocx.js b/js/export/writeDocx.js index c0d186c..3199ef3 100644 --- a/js/export/writeDocx.js +++ b/js/export/writeDocx.js @@ -32,8 +32,7 @@ export function writeDocxContent({ const pageObj = ocrCurrent[g]; - // Do not overwrite paragraphs from Abbyy or Textract. - if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) { + if (reflowText && (!pageObj.textSource || !['textract', 'abbyy', 'google_vision', 'azure_doc_intel', 'docx'].includes(pageObj.textSource))) { const angle = pageMetricsAll[g].angle || 0; assignParagraphs(pageObj, angle); } @@ -78,6 +77,10 @@ export function writeDocxContent({ fontStyle += ''; } + if (wordObj.style.font) { + fontStyle += ``; + } + if (newLine || fontStyle !== fontStylePrev || (h === 0 && g === 0 && i === 0)) { const styleStr = fontStyle === '' ? '' : `${fontStyle}`; diff --git a/js/export/writeText.js b/js/export/writeText.js index f408a08..f3961bf 100644 --- a/js/export/writeText.js +++ b/js/export/writeText.js @@ -27,8 +27,7 @@ export function writeText({ const pageObj = ocrCurrent[g]; - // Do not overwrite paragraphs from Abbyy or Textract. 
- if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) { + if (reflowText && (!pageObj.textSource || !['textract', 'abbyy', 'google_vision', 'azure_doc_intel', 'docx'].includes(pageObj.textSource))) { const angle = pageMetricsAll[g].angle || 0; assignParagraphs(pageObj, angle); } diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js index 62e357e..b1a6163 100644 --- a/js/generalWorkerMain.js +++ b/js/generalWorkerMain.js @@ -100,6 +100,7 @@ export async function initGeneralWorker() { obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel'); obj.convertPageGoogleVision = wrap('convertPageGoogleVision'); obj.convertPageText = wrap('convertPageText'); + obj.convertDocDocx = wrap('convertDocDocx'); obj.optimizeFont = wrap('optimizeFont'); @@ -212,6 +213,15 @@ export class gs { return gs.schedulerInner.addJob('convertDocAzureDocIntel', args); }; + /** + * @param {Parameters[0]} args + * @returns {ReturnType} + */ + static convertDocDocx = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertDocDocx', args); + }; + /** * @param {Parameters[0]} args * @returns {ReturnType} diff --git a/js/global.d.ts b/js/global.d.ts index 23cc267..9d20d81 100644 --- a/js/global.d.ts +++ b/js/global.d.ts @@ -13,7 +13,7 @@ declare global { // Strings representing supported sources of text. // `stext` indicates the text was extracted directly from a PDF using mupdf. 
- type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'alto' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel'; + type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'alto' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel' | 'docx'; type FontState = { enableOpt: boolean; diff --git a/js/import/convertDocDocx.js b/js/import/convertDocDocx.js new file mode 100644 index 0000000..1c3f44e --- /dev/null +++ b/js/import/convertDocDocx.js @@ -0,0 +1,337 @@ +import ocr from '../objects/ocrObjects.js'; +import { LayoutDataTablePage } from '../objects/layoutObjects.js'; +import { calcWordCharMetrics } from '../utils/fontUtils.js'; +import { FontCont } from '../containers/fontContainer.js'; +import { unescapeXml } from '../utils/miscUtils.js'; + +const FONT_FAMILY = 'Times New Roman'; +const FONT_SIZE = 14; +const CHAR_SPACING = 0; +const WORD_SPACING = 0; +const LINE_HEIGHT = 14.4; +const MARGIN_VERTICAL = 30; +const MARGIN_HORIZONTAL = 20; + +/** @type {?opentype.Font} */ +let fontOpentype = null; + +/** + * Calculates the advance of a string in pixels. 
+ * @param {string} text + * @param {number} size + * @param {opentype.Font} font + */ +function getTextWidth(text, size, font) { + const { advanceArr, kerningArr } = calcWordCharMetrics(text, font); + + const advanceTotal = advanceArr.reduce((a, b) => a + b, 0); + const kerningTotal = kerningArr.reduce((a, b) => a + b, 0); + + const wordLastGlyphMetrics = font.charToGlyph(text.at(-1)).getMetrics(); + const wordFirstGlyphMetrics = font.charToGlyph(text[0]).getMetrics(); + + const wordLeftBearing = wordFirstGlyphMetrics.xMin || 0; + const lastGlyphMax = wordLastGlyphMetrics.xMax || 0; + const wordRightBearing = advanceArr[advanceArr.length - 1] - lastGlyphMax; + + const wordWidth1 = (advanceTotal + kerningTotal - wordLeftBearing - wordRightBearing); + const wordWidth1Px = wordWidth1 * (size / font.unitsPerEm); + const spacingTotalPx = (text.length - 1) * CHAR_SPACING; + const wordWidth = wordWidth1Px + spacingTotalPx; + + return wordWidth; +} + +/** + * Parse XML text content from a docx run element + * @param {string} runXml - XML string of a element + * @returns {{text: string, styles: {bold: boolean, italic: boolean, smallCaps: boolean, underline: boolean, sup: boolean, font: string | null}}} + */ +function parseRunElement(runXml) { + const styles = { + bold: //.test(runXml) || //.test(runXml) || //.test(runXml) || //.test(runXml) && !/ element + const fontMatch = runXml.match(/]*w:ascii="([^"]+)"/); + if (fontMatch) { + styles.font = unescapeXml(fontMatch[1]); + } else { + // Try w:hAnsi if ascii not found + const fontMatchHAnsi = runXml.match(/]*w:hAnsi="([^"]+)"/); + if (fontMatchHAnsi) { + styles.font = unescapeXml(fontMatchHAnsi[1]); + } + } + + const textMatches = runXml.matchAll(/]*>([^<]*)<\/w:t>/g); + let text = ''; + for (const match of textMatches) { + text += unescapeXml(match[1]); + } + + return { text, styles }; +} + +/** + * Parse paragraphs from docx document.xml content + * @param {string} docXml - The content of word/document.xml + * @returns 
{Array>} + */ +function parseParagraphs(docXml) { + const paragraphs = []; + + const paragraphMatches = docXml.matchAll(/]*>(.*?)<\/w:p>/gs); + + for (const parMatch of paragraphMatches) { + const parContent = parMatch[1]; + const runs = []; + + const runMatches = parContent.matchAll(/]*>(.*?)<\/w:r>/gs); + + for (const runMatch of runMatches) { + const runContent = runMatch[1]; + const parsed = parseRunElement(runContent); + + if (parsed.text) { + runs.push(parsed); + } + } + + if (runs.length > 0) { + paragraphs.push(runs); + } + } + + return paragraphs; +} + +/** + * Convert a docx file to internal OCR format + * @param {Object} params + * @param {ArrayBuffer} params.docxData - The docx file data + * @param {?{width: number, height: number}} [params.pageDims] - Page dimensions (will be calculated if not provided) + */ +export async function convertDocDocx({ docxData, pageDims = null }) { + const { BlobReader, BlobWriter, ZipReader } = await import('../../lib/zip.js/index.js'); + + const blob = new Blob([docxData]); + + const zipReader = new ZipReader(new BlobReader(blob)); + const entries = await zipReader.getEntries(); + + const documentEntry = entries.find((entry) => entry.filename === 'word/document.xml'); + if (!documentEntry) { + throw new Error('No word/document.xml found in docx file'); + } + + const writer = new BlobWriter(); + await documentEntry.getData(writer); + const documentBlob = await writer.getData(); + const documentXml = await documentBlob.text(); + + await zipReader.close(); + + const pagesOut = await convertDocumentXML({ documentXml, pageDims }); + + return pagesOut; +} + +/** + * Convert a docx file to internal OCR format + * @param {Object} params + * @param {string} params.documentXml + * @param {?{width: number, height: number}} [params.pageDims] - Page dimensions (will be calculated if not provided) + */ +const convertDocumentXML = async ({ documentXml, pageDims = null }) => { + if (!fontOpentype) { + fontOpentype = (await 
FontCont.getFont({ font: FONT_FAMILY })).opentype; + } + + const ASCENDER_HEIGHT = fontOpentype.ascender * (FONT_SIZE / fontOpentype.unitsPerEm); + const DESCENDER_HEIGHT = fontOpentype.descender * (FONT_SIZE / fontOpentype.unitsPerEm); + + if (!pageDims) { + pageDims = { width: 612, height: 792 }; + } + + const paragraphs = parseParagraphs(documentXml); + + const pagesOut = []; + let pageIndex = 0; + let pageObj = new ocr.OcrPage(pageIndex, pageDims); + pageObj.textSource = 'docx'; + let tablesPage = new LayoutDataTablePage(0); + pagesOut.push({ pageObj, dataTables: tablesPage }); + + const availableWidth = pageDims.width - MARGIN_HORIZONTAL * 2; + let currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2; + + for (const paragraph of paragraphs) { + const parLines = []; + let parRight = MARGIN_HORIZONTAL; + let runIndex = 0; + let charIndexInRun = 0; + + while (runIndex < paragraph.length) { + if (currentY + FONT_SIZE > pageDims.height - MARGIN_VERTICAL) { + if (parLines.length > 0) { + const parBbox = { + left: MARGIN_HORIZONTAL, + top: parLines[0].bbox.top, + right: parRight, + bottom: parLines[parLines.length - 1].bbox.bottom, + }; + const parObj = new ocr.OcrPar(pageObj, parBbox); + parObj.lines = parLines; + for (const ln of parLines) ln.par = parObj; + pageObj.pars.push(parObj); + parLines.length = 0; + parRight = MARGIN_HORIZONTAL; + } + pageIndex++; + const newPage = new ocr.OcrPage(pageIndex, pageDims); + newPage.textSource = 'docx'; + const newTables = new LayoutDataTablePage(pageIndex); + pagesOut.push({ pageObj: newPage, dataTables: newTables }); + pageObj = newPage; + tablesPage = newTables; + currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2; + } + + const baseline = [0, DESCENDER_HEIGHT]; + const lineTop = Math.round(currentY - ASCENDER_HEIGHT); + const lineBottom = Math.round(currentY + DESCENDER_HEIGHT); + + const lineBbox = { + left: MARGIN_HORIZONTAL, + top: lineTop, + right: MARGIN_HORIZONTAL, + bottom: lineBottom, + }; + const lineObj = new ocr.OcrLine( + 
pageObj, + lineBbox, + baseline, + ASCENDER_HEIGHT, + null, + ); + + let currentX = MARGIN_HORIZONTAL; + let lineComplete = false; + let lastItemWasWhitespace = false; + + while (runIndex < paragraph.length && !lineComplete) { + const run = paragraph[runIndex]; + const remainingText = run.text.substring(charIndexInRun); + + const words = remainingText.split(/(\s+)/); + + for (let wordIdx = 0; wordIdx < words.length; wordIdx++) { + const word = words[wordIdx]; + if (word.length === 0) continue; + + const isWhitespace = /^\s+$/.test(word); + + if (isWhitespace) { + const spaceWidth = getTextWidth(' ', FONT_SIZE, fontOpentype) + WORD_SPACING; + currentX += spaceWidth * word.length; + charIndexInRun += word.length; + lastItemWasWhitespace = true; + } else { + // Check if we should append to the previous word (word continues across runs) + // Only append if: we're at the start of a new run AND the last item was NOT whitespace + const lastWord = lineObj.words[lineObj.words.length - 1]; + const shouldAppend = lastWord && wordIdx === 0 && charIndexInRun === 0 && !lastItemWasWhitespace; + + if (shouldAppend) { + const combinedText = lastWord.text + word; + const combinedWidth = getTextWidth(combinedText, FONT_SIZE, fontOpentype); + + if (lastWord.bbox.left + combinedWidth > MARGIN_HORIZONTAL + availableWidth) { + lineComplete = true; + break; + } + + lastWord.text = combinedText; + lastWord.bbox.right = Math.round(lastWord.bbox.left + combinedWidth); + currentX = lastWord.bbox.right; + charIndexInRun += word.length; + } else { + const wordWidth = getTextWidth(word, FONT_SIZE, fontOpentype); + + if (lineObj.words.length > 0 && currentX + wordWidth > MARGIN_HORIZONTAL + availableWidth) { + lineComplete = true; + break; + } + + const wordBbox = { + left: Math.round(currentX), + top: lineTop, + right: Math.round(currentX + wordWidth), + bottom: lineBottom, + }; + const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`; + const wordObj = 
new ocr.OcrWord(lineObj, wordId, word, wordBbox); + wordObj.conf = 100; + wordObj.style.font = run.styles.font || FONT_FAMILY; + + wordObj.style.bold = run.styles.bold; + wordObj.style.italic = run.styles.italic; + wordObj.style.smallCaps = run.styles.smallCaps; + wordObj.style.underline = run.styles.underline; + wordObj.style.sup = run.styles.sup; + + lineObj.words.push(wordObj); + currentX += wordWidth; + charIndexInRun += word.length; + } + lastItemWasWhitespace = false; + } + } + + if (charIndexInRun >= run.text.length) { + runIndex++; + charIndexInRun = 0; + } + + if (lineComplete) break; + } + + if (lineObj.words.length > 0) { + lineObj.bbox = { + left: lineObj.words[0].bbox.left, + top: lineTop, + right: lineObj.words[lineObj.words.length - 1].bbox.right, + bottom: lineBottom, + }; + + pageObj.lines.push(lineObj); + parLines.push(lineObj); + parRight = Math.max(parRight, lineObj.bbox.right); + } + + currentY += LINE_HEIGHT; + } + + if (parLines.length > 0) { + const parBbox = { + left: MARGIN_HORIZONTAL, + top: parLines[0].bbox.top, + right: parRight, + bottom: parLines[parLines.length - 1].bbox.bottom, + }; + const parObj = new ocr.OcrPar(pageObj, parBbox); + parObj.lines = parLines; + for (const ln of parLines) ln.par = parObj; + pageObj.pars.push(parObj); + } + } + + return pagesOut; +}; diff --git a/js/import/convertPageAbbyy.js b/js/import/convertPageAbbyy.js index 75c9933..b14765f 100644 --- a/js/import/convertPageAbbyy.js +++ b/js/import/convertPageAbbyy.js @@ -85,9 +85,6 @@ export async function convertPageAbbyy({ ocrStr, n }) { /** * Convert Abbyy XML paragraph to internal format. - * Note that Abbyy XML paragraphs are not preserved because paragraphs are re-assigned by the `assignParagraphs` function. - * Even if this function call was skipped in the code, when saving/restoring the state using .scribe files, paragraph data is not saved. - * Further development would be needed to preserve paragraph data. 
* @param {string} xmlPar */ function convertParAbbyy(xmlPar) { diff --git a/js/import/convertPageAlto.js b/js/import/convertPageAlto.js index ed40332..65b32ab 100644 --- a/js/import/convertPageAlto.js +++ b/js/import/convertPageAlto.js @@ -1,6 +1,7 @@ import ocr from '../objects/ocrObjects.js'; import { + calcBboxUnion, unescapeXml, } from '../utils/miscUtils.js'; @@ -115,7 +116,6 @@ export async function convertPageAlto({ ocrStr, n }) { wordObj.conf = Math.round(parseFloat(wcStr) * 100); } - // Parse style attributes const styleAttr = getAttr(contentMatch, 'STYLE'); if (styleAttr) { if (/bold/i.test(styleAttr)) wordObj.style.bold = true; @@ -129,7 +129,6 @@ export async function convertPageAlto({ ocrStr, n }) { // Use String's STYLEREFS first, fall back to TextBlock's STYLEREFS const styleRefs = getAttr(contentMatch, 'STYLEREFS') || blockStyleRefs; if (styleRefs) { - // Look up the TextStyle definition in the document const styleRegex = new RegExp(`]*>`, 'i'); const styleMatch = ocrStr.match(styleRegex); if (styleMatch) { @@ -161,9 +160,28 @@ export async function convertPageAlto({ ocrStr, n }) { const blockStyleRefs = blockTag ? 
getAttr(blockTag, 'STYLEREFS') : null; const blockContent = blockMatch[1]; + /** @type {Array} */ + const parLineArr = []; + const textLinesInBlock = [...blockContent.matchAll(textLineRegex)]; for (const lineMatch of textLinesInBlock) { + const lineCountBefore = pageObj.lines.length; convertLine(lineMatch[0], blockStyleRefs); + if (pageObj.lines.length > lineCountBefore) { + parLineArr.push(pageObj.lines[pageObj.lines.length - 1]); + } + } + + if (parLineArr.length > 0) { + const parbox = calcBboxUnion(parLineArr.map((x) => x.bbox)); + const parObj = new ocr.OcrPar(pageObj, parbox); + + parLineArr.forEach((x) => { + x.par = parObj; + }); + + parObj.lines = parLineArr; + pageObj.pars.push(parObj); } } diff --git a/js/import/import.js b/js/import/import.js index efbb0ea..686bfd0 100644 --- a/js/import/import.js +++ b/js/import/import.js @@ -104,7 +104,7 @@ export async function sortInputFiles(files) { if (['png', 'jpeg', 'jpg'].includes(fileExt)) { imageFilesAll.push(file); // All .gz files are assumed to be OCR data (xml) since all other file types can be compressed already - } else if (['hocr', 'xml', 'html', 'gz', 'stext', 'json', 'txt'].includes(fileExt)) { + } else if (['hocr', 'xml', 'html', 'gz', 'stext', 'json', 'txt', 'docx'].includes(fileExt)) { ocrFilesAll.push(file); } else if (['scribe'].includes(fileExt)) { scribeFilesAll.push(file); @@ -400,7 +400,7 @@ export async function importFiles(files) { format = /** @type {("hocr" | "abbyy" | "alto" | "stext" | "textract" | "text")} */ (ocrData.format); // The text import function requires built-in fonts to be loaded. 
- if (format === 'text') { + if (['text', 'docx'].includes(format)) { await loadBuiltInFontsRaw(); } diff --git a/js/import/importOCR.js b/js/import/importOCR.js index 5a2b197..e287d3d 100644 --- a/js/import/importOCR.js +++ b/js/import/importOCR.js @@ -67,6 +67,10 @@ const detectOcrFormat = (ocrStr, ext) => { return 'text'; } + if (ext && ext.toLowerCase() === 'docx') { + return 'docx'; + } + return null; }; @@ -98,9 +102,15 @@ export async function importOCRFiles(ocrFilesAll) { let serifFont; if (singleHOCRMode) { - const hocrStrAll = await readOcrFile(ocrFilesAll[0]); - - format = detectOcrFormat(hocrStrAll, ocrFilesAll[0]?.name?.split('.').pop()); + const fileExt = ocrFilesAll[0]?.name?.split('.').pop(); + let ocrFilesContent; + if (fileExt === 'docx') { + ocrFilesContent = await ocrFilesAll[0].arrayBuffer(); + format = 'docx'; + } else { + ocrFilesContent = await readOcrFile(ocrFilesAll[0]); + format = detectOcrFormat(ocrFilesContent, fileExt); + } if (!format) { console.error(ocrFilesAll[0]); @@ -108,37 +118,40 @@ export async function importOCRFiles(ocrFilesAll) { } if (format === 'textract') { - hocrRaw = [hocrStrAll]; + hocrRaw = [ocrFilesContent]; } else if (format === 'google_vision') { - hocrRaw = [hocrStrAll]; - if (hocrStrAll.substring(0, 500).includes('"responses"')) { - const responses = JSON.parse(hocrStrAll).responses; + hocrRaw = [ocrFilesContent]; + if (ocrFilesContent.substring(0, 500).includes('"responses"')) { + const responses = JSON.parse(ocrFilesContent).responses; hocrRaw = responses .sort((a, b) => a.context.pageNumber - b.context.pageNumber) .map((resp) => JSON.stringify(resp)); } } else if (format === 'azure_doc_intel') { - hocrRaw = [hocrStrAll]; + hocrRaw = [ocrFilesContent]; } else if (format === 'alto') { // Extract the Styles section to prepend to each page - const stylesMatch = hocrStrAll.match(/[\s\S]*?<\/Styles>/i); + const stylesMatch = ocrFilesContent.match(/[\s\S]*?<\/Styles>/i); const stylesSection = stylesMatch ? 
stylesMatch[0] : ''; // Split by Page elements - const pages = hocrStrAll.split(/(?= stylesSection + page); } else if (format === 'abbyy') { - hocrRaw = hocrStrAll.split(/(?=/)?.[0]; - hocrRaw = splitHOCRStr(hocrStrAll); + hocrStrStart = ocrFilesContent.match(/[\s\S]*?/)?.[0]; + hocrRaw = splitHOCRStr(ocrFilesContent); } pageCountHOCR = hocrRaw.length; diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js index 142ea66..d5e69be 100644 --- a/js/recognizeConvert.js +++ b/js/recognizeConvert.js @@ -314,6 +314,9 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc res = await gs.convertPageStext({ ocrStr: ocrRaw, n }); } else if (format === 'text') { res = await gs.convertPageText({ textStr: ocrRaw }); + } else if (format === 'docx') { + console.error('format does not support page-level import.'); + // res = await gs.convertDocDocx({ docxData: ocrRaw }); } else { throw new Error(`Invalid format: ${format}`); } @@ -421,6 +424,25 @@ export async function convertOCR(ocrRawArr, mainData, format, engineName, scribe return; } + if (format === 'docx') { + const res = await gs.convertDocDocx({ docxData: ocrRawArr[0] }); + + if (res.length > inputData.pageCount) inputData.pageCount = res.length; + + for (let i = 0; i < res.length; i++) { + if (!layoutRegions.pages[i]) layoutRegions.pages[i] = new LayoutPage(i); + } + + for (let i = 0; i < res.length; i++) { + if (!layoutDataTables.pages[i]) layoutDataTables.pages[i] = new LayoutDataTablePage(i); + } + + for (let n = 0; n < res.length; n++) { + await convertPageCallback(res[n], n, mainData, engineName); + } + return; + } + for (let n = 0; n < ocrRawArr.length; n++) { promiseArr.push(convertOCRPage(ocrRawArr[n], n, mainData, format, engineName, scribeMode)); } diff --git a/js/worker/generalWorker.js b/js/worker/generalWorker.js index 9dacda9..1d93d2e 100644 --- a/js/worker/generalWorker.js +++ b/js/worker/generalWorker.js @@ -7,6 +7,7 @@ import { convertDocTextract } from 
'../import/convertDocTextract.js'; import { convertDocAzureDocIntel } from '../import/convertDocAzureDocIntel.js'; import { convertPageGoogleVision } from '../import/convertPageGoogleVision.js'; import { convertPageText } from '../import/convertPageText.js'; +import { convertDocDocx } from '../import/convertDocDocx.js'; import { FontCont, loadFontsFromSource } from '../containers/fontContainer.js'; import { @@ -409,6 +410,7 @@ const handleMessage = async (data) => { convertPageGoogleVision, convertPageBlocks, convertPageText, + convertDocDocx, // Optimize font functions optimizeFont, diff --git a/tests/assets/simple_paragraph.alto.xml b/tests/assets/simple_paragraph.alto.xml new file mode 100644 index 0000000..272283e --- /dev/null +++ b/tests/assets/simple_paragraph.alto.xml @@ -0,0 +1,39 @@ + + + +pixel +2025-12-06ABBYYABBYY FineReader Engine12 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/module/exportAlto.spec.js b/tests/module/exportAlto.spec.js new file mode 100644 index 0000000..730b058 --- /dev/null +++ b/tests/module/exportAlto.spec.js @@ -0,0 +1,187 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +import { writeAlto } from '../../js/export/writeAlto.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +/** + * Reads a text file in any environment (browser or Node.js). 
+ * @param {string} filePath + */ +async function readTextFileUniversal(filePath) { + if (typeof process !== 'undefined') { + const { promises: fsPromises } = await import('node:fs'); + const contents = await fsPromises.readFile(filePath, 'utf-8'); + return contents; + } + + const response = await fetch(filePath); + if (!response.ok) { + throw new Error(`Failed to fetch file: ${filePath}`); + } + return await response.text(); +} + +/** + * Function to normalize and extract content for comparison + * @param {string} xmlStr + */ +const normalizeAlto = (xmlStr) => { + xmlStr = xmlStr.replace(/[^<]*<\/processingDateTime>/g, ''); + xmlStr = xmlStr.replace(/[^<]*<\/softwareCreator>/g, ''); + xmlStr = xmlStr.replace(/[^<]*<\/softwareName>/g, ''); + xmlStr = xmlStr.replace(/[^<]*<\/softwareVersion>/g, ''); + + xmlStr = xmlStr.replace(/]*>\s*<\/TopMargin>/g, ''); + xmlStr = xmlStr.replace(/]*>\s*<\/LeftMargin>/g, ''); + xmlStr = xmlStr.replace(/]*>\s*<\/RightMargin>/g, ''); + xmlStr = xmlStr.replace(/]*>\s*<\/BottomMargin>/g, ''); + + xmlStr = xmlStr.replace(/]*>/g, ''); + + // Remove position attributes from TextBlock. + // Some of the test data is from Abbyy, + // which does not tightly enclose TextBlock elements around text content. + // Therefore, position attributes may differ after re-exporting. 
+ xmlStr = xmlStr.replace(/]*)>/g, (_match, attrs) => { + const newAttrs = attrs.replace(/\s*VPOS="[^"]*"\s*/g, ' ') + .replace(/\s*HPOS="[^"]*"\s*/g, ' ') + .replace(/\s*WIDTH="[^"]*"\s*/g, ' ') + .replace(/\s*HEIGHT="[^"]*"\s*/g, ' ') + .trim(); + return ``; + }); + + // Normalize confidence values to 2 decimal places + xmlStr = xmlStr.replace(/WC="(\d+(?:\.\d+)?)"/g, (_match, value) => { + const numValue = parseFloat(value); + if (Number.isNaN(numValue)) { + throw new Error(`Invalid WC value: "${value}" cannot be parsed to a number`); + } + return `WC="${numValue.toFixed(2)}"`; + }); + + xmlStr = xmlStr.replace(/]*)\/>/g, (_match, attrs) => { + const newAttrs = attrs.replace(/\s*VPOS="[^"]*"\s*/g, ' ').trim(); + return ``; + }); + + xmlStr = xmlStr.replace(/\s+/g, ' ').trim(); + xmlStr = xmlStr.replace(/>/g, '>\n'); + return xmlStr; +}; + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check .alto export function.', function () { + this.timeout(10000); + + it('Should correctly export and reimport text content', async () => { + await scribe.terminate(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/the_past.alto.xml`]); + + const text1Before = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text3Before = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + const altoOutStr = writeAlto({ ocrData: scribe.data.ocr.active }); + + const encoder = new TextEncoder(); + const encoded = encoder.encode(altoOutStr); + + await scribe.terminate(); + await scribe.importFiles({ ocrFiles: [encoded.buffer] }); + + const text1After = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text3After = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1Before, text1After); + assert.strictEqual(text3Before, text3After); + }).timeout(10000); + + it('Should 
correctly export and reimport confidence scores', async () => { + await scribe.terminate(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/the_past.alto.xml`]); + + const word1Before = scribe.data.ocr.active[0].lines[0].words[0]; + const word2Before = scribe.data.ocr.active[0].lines[0].words[1]; + const conf1Before = word1Before.conf; + const conf2Before = word2Before.conf; + + const altoOutStr = writeAlto({ ocrData: scribe.data.ocr.active }); + + const encoder = new TextEncoder(); + const encoded = encoder.encode(altoOutStr); + + await scribe.terminate(); + await scribe.importFiles({ ocrFiles: [encoded.buffer] }); + + const word1After = scribe.data.ocr.active[0].lines[0].words[0]; + const word2After = scribe.data.ocr.active[0].lines[0].words[1]; + + assert.approximately(word1After.conf, conf1Before, 1, 'Word 1 confidence should be approximately the same'); + assert.approximately(word2After.conf, conf2Before, 1, 'Word 2 confidence should be approximately the same'); + }).timeout(10000); + + it('Should correctly export and reimport font styles', async () => { + await scribe.terminate(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/the_past.alto.xml`]); + + const boldBefore1 = scribe.data.ocr.active[0].lines[0].words[0].style.bold; + const boldBefore2 = scribe.data.ocr.active[0].lines[0].words[1].style.bold; + + const altoOutStr = writeAlto({ ocrData: scribe.data.ocr.active }); + + const encoder = new TextEncoder(); + const encoded = encoder.encode(altoOutStr); + + await scribe.terminate(); + await scribe.importFiles({ ocrFiles: [encoded.buffer] }); + + const boldAfter1 = scribe.data.ocr.active[0].lines[0].words[0].style.bold; + const boldAfter2 = scribe.data.ocr.active[0].lines[0].words[1].style.bold; + + assert.strictEqual(boldBefore1, boldAfter1, 'Word 1 bold style should be preserved'); + assert.strictEqual(boldBefore2, boldAfter2, 'Word 2 bold style should be preserved'); + }).timeout(10000); + + it('Should correctly export and reimport font family', 
async () => { + await scribe.terminate(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/the_past.alto.xml`]); + + const fontBefore = scribe.data.ocr.active[0].lines[0].words[0].style.font; + + const altoOutStr = writeAlto({ ocrData: scribe.data.ocr.active }); + + const encoder = new TextEncoder(); + const encoded = encoder.encode(altoOutStr); + + await scribe.terminate(); + await scribe.importFiles({ ocrFiles: [encoded.buffer] }); + + const fontAfter = scribe.data.ocr.active[0].lines[0].words[0].style.font; + + assert.strictEqual(fontBefore, fontAfter, 'Font family should be preserved'); + }).timeout(10000); + + it('Should match original ALTO XML structure after round-trip (content-only comparison)', async () => { + await scribe.terminate(); + await scribe.importFiles([`${ASSETS_PATH_KARMA}/simple_paragraph.alto.xml`]); + + const originalAltoStr = await readTextFileUniversal(`${ASSETS_PATH_KARMA}/simple_paragraph.alto.xml`); + const altoOutStr = writeAlto({ ocrData: scribe.data.ocr.active }); + + const normalizedOriginal = normalizeAlto(originalAltoStr); + const normalizedExported = normalizeAlto(altoOutStr); + + assert.strictEqual(normalizedExported, normalizedOriginal, 'Exported ALTO should match original after normalization'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); diff --git a/tests/module/importDocx.spec.js b/tests/module/importDocx.spec.js new file mode 100644 index 0000000..d9ee7d3 --- /dev/null +++ b/tests/module/importDocx.spec.js @@ -0,0 +1,240 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. 
/* eslint-disable prefer-arrow-callback */
/* eslint-disable func-names */

// Skip these tests on Node.js versions older than 20.x, which are already EOL
// and do not provide the native `File` class as a global.
// While the library should be compatible with earlier versions of Node.js,
// getting every test to run on versions that are already EOL is not a priority.
// In browsers `process` is undefined, so the tests always run there.
const itSkipNodeEOL = typeof process === 'undefined' || Number.parseInt(process.versions.node.split('.')[0], 10) >= 20 ? it : xit;

const DOCX_MIME_TYPE = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';

/**
 * Exports the currently loaded document as docx, resets scribe, and imports the exported docx file,
 * so that `scribe.data.ocr.active` afterwards reflects only what survived the round-trip.
 * @returns {Promise<void>}
 */
async function roundTripDocx() {
  const docxData = await scribe.exportData('docx');
  const docxFile = new File([docxData], 'test.docx', { type: DOCX_MIME_TYPE });

  await scribe.terminate();
  await scribe.importFiles([docxFile]);
}

/**
 * Returns the first word (in page, line, word order) of the active OCR data that satisfies `predicate`.
 * @param {(word: any) => boolean} predicate
 * @returns {any} The first matching word, or `undefined` when no word matches.
 */
function findFirstWord(predicate) {
  for (const page of scribe.data.ocr.active) {
    for (const line of page.lines) {
      const match = line.words.find(predicate);
      if (match) return match;
    }
  }
  return undefined;
}

/**
 * Joins the text of every word in the active OCR data:
 * words separated by spaces, lines by newlines, and pages by blank lines.
 * @returns {string}
 */
function getActiveText() {
  return scribe.data.ocr.active
    .map((page) => page.lines.map((line) => line.words.map((word) => word.text).join(' ')).join('\n'))
    .join('\n\n');
}

describe('Check docx import function.', function () {
  this.timeout(10000);

  // The two tests below inspect the data left loaded by this first test.
  itSkipNodeEOL('Should import docx file', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/testocr.abbyy.xml`]);
    await roundTripDocx();
  });

  itSkipNodeEOL('Should correctly import text content from docx', async () => {
    const text1 = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' ');

    assert.include(text1, 'This is a lot of 12 point text');
  }).timeout(10000);

  itSkipNodeEOL('Should correctly import paragraphs from docx', async () => {
    assert.isTrue(scribe.data.ocr.active[0].lines.length > 0);
    assert.isTrue(scribe.data.ocr.active[0].pars.length > 0);
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check export -> import round-trip for docx files.', function () {
  this.timeout(10000);

  itSkipNodeEOL('Exporting and importing docx should preserve text content', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/testocr.abbyy.xml`]);

    await roundTripDocx();

    // Line breaks are not guaranteed to survive the round-trip, so known phrases are checked
    // rather than comparing against the full original text.
    const importedText = getActiveText();

    assert.include(importedText, 'This is a lot of 12 point text');
    assert.include(importedText, 'The quick brown dog jumped');
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that font styles are preserved in docx round-trip.', function () {
  this.timeout(10000);

  itSkipNodeEOL('Bold style is preserved in round-trip', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/complaint_1.abbyy.xml`]);

    // Precondition: the source fixture actually contains a bold word.
    const originalBoldWord = scribe.data.ocr.active[1].lines[3].words[0];
    assert.isTrue(originalBoldWord.style.bold);

    await roundTripDocx();

    const foundBoldWord = findFirstWord((word) => word.style.bold) !== undefined;

    assert.isTrue(foundBoldWord, 'Should have at least one bold word after round-trip');
  }).timeout(10000);

  itSkipNodeEOL('Italic style is preserved in round-trip', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/E.D.Mich._2_12-cv-13821-AC-DRG_1_0.xml`]);

    // Precondition: the source fixture actually contains an italic word.
    const originalItalicWord = scribe.data.ocr.active[0].lines[30].words[0];
    assert.isTrue(originalItalicWord.style.italic);

    await roundTripDocx();

    const foundItalicWord = findFirstWord((word) => word.style.italic) !== undefined;

    assert.isTrue(foundItalicWord, 'Should have at least one italic word after round-trip');
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that small caps are preserved in docx round-trip.', function () {
  this.timeout(10000);

  itSkipNodeEOL('Small caps style is preserved in round-trip', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example.abbyy.xml`]);

    // Precondition: the word this test was written around must exist in the source fixture.
    assert.isOk(scribe.data.ocr.active[0].lines[4].words[0], 'Source fixture should contain the reference word');

    await roundTripDocx();

    const foundSmallCapsWord = findFirstWord((word) => word.style.smallCaps) !== undefined;

    assert.isTrue(foundSmallCapsWord, 'Should have at least one small caps word after round-trip');
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check multi-page docx import.', function () {
  this.timeout(10000);

  itSkipNodeEOL('Should correctly handle multi-page documents', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.abbyy.xml`]);

    await roundTripDocx();

    assert.isTrue(scribe.data.ocr.active.length > 0);

    // Only the first page is required to contain text; later pages may be empty after re-import.
    assert.isTrue(scribe.data.ocr.active[0].lines.length > 0);
  }).timeout(20000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that font families are preserved in docx round-trip.', function () {
  this.timeout(10000);

  itSkipNodeEOL('Font family is preserved in round-trip', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/testocr.abbyy.xml`]);

    const originalFontWord = scribe.data.ocr.active[0].lines[0].words[0];
    const originalFont = originalFontWord.style.font;
    assert.isNotNull(originalFont, 'Original word should have a font');
    assert.isString(originalFont, 'Font should be a string');

    await roundTripDocx();

    // The first word that carries a font after re-import must carry the original font.
    const firstFontWord = findFirstWord((word) => Boolean(word.style.font));

    assert.isOk(firstFontWord, 'Should have at least one word with font family after round-trip');
    assert.strictEqual(firstFontWord.style.font, originalFont, `Font should be preserved as "${originalFont}"`);
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);