diff --git a/cloud-adapters b/cloud-adapters index 39de490..2a5882c 160000 --- a/cloud-adapters +++ b/cloud-adapters @@ -1 +1 @@ -Subproject commit 39de49069e68cb50c6a9a365f3dc08b54e832ef8 +Subproject commit 2a5882c838598176c008111e1ce684e99cbcf95a diff --git a/dev/createRotationTextPdf.js b/dev/createRotationTextPdf.js index 8bc2514..fadba82 100644 --- a/dev/createRotationTextPdf.js +++ b/dev/createRotationTextPdf.js @@ -7,7 +7,7 @@ import { imageUtils } from '../js/objects/imageObjects.js'; await scribe.init({ font: true }); const images = await importImageFilesP([ - './tests/assets/econometrica_example.png', + './tests/assets/testocr.png', ]); const pageMetricsImages = images.map((image) => { @@ -33,10 +33,11 @@ const pdfStr = await writePdf({ pageMetricsArr: pageMetricsImages, includeImages: true, rotateBackground: true, + rotateOrientation: true, }); const enc = new TextEncoder(); const pdfEnc = enc.encode(pdfStr); -await writeFile('rotation_text_test.pdf', pdfEnc); +await writeFile('./tests/assets/testocr_all_orientations.pdf', pdfEnc); await scribe.terminate(); diff --git a/js/export/pdf/writePdf.js b/js/export/pdf/writePdf.js index 8035be4..b0a49bc 100644 --- a/js/export/pdf/writePdf.js +++ b/js/export/pdf/writePdf.js @@ -114,6 +114,8 @@ const createPdfFontRefs = async (ocrArr) => { * @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] - * @param {boolean} [params.rotateText=false] - * @param {boolean} [params.rotateBackground=false] - + * @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation + * of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach. * @param {dims} [params.dimsLimit] - * @param {number} [params.confThreshHigh=85] - * @param {number} [params.confThreshMed=75] - @@ -131,6 +133,7 @@ export async function writePdf({ textMode = 'ebook', rotateText = false, rotateBackground = false, + rotateOrientation = false, dimsLimit = { width: -1, height: -1 }, confThreshHigh = 85, confThreshMed = 75, @@ -209,6 +212,7 @@ export async function writePdf({ pdfFonts, textMode, angle, + rotateOrientation, rotateText, rotateBackground, confThreshHigh, @@ -336,6 +340,8 @@ ${xrefOffset} * @param {Object} params.pdfFonts * @param {("ebook"|"eval"|"proof"|"invis")} params.textMode - * @param {number} params.angle + * @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation + * of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach. * @param {boolean} [params.rotateText=false] * @param {boolean} [params.rotateBackground=false] * @param {number} [params.confThreshHigh=85] @@ -354,6 +360,7 @@ async function ocrPageToPDF({ pdfFonts, textMode, angle, + rotateOrientation = false, rotateText = false, rotateBackground = false, confThreshHigh = 85, @@ -370,6 +377,11 @@ async function ocrPageToPDF({ const pageIndex = firstObjIndex; let pageObjStr = `${String(pageIndex)} 0 obj\n< 45 && angle < 135 || angle > 225 && angle < 315)) { + pageObjStr = `${String(pageIndex)} 0 obj\n< 0.05) { rotation = angle; } - imageContentObjStr += drawImageCommands(imageName, 0, 0, outputDims.width, outputDims.height, rotation); + + let x = 0; + let y = 0; + if (rotateOrientation && (rotation > 45 && rotation < 135 || rotation > 225 && rotation < 315)) { + x = (outputDims.height - outputDims.width) / 2; + y = (outputDims.width - outputDims.height) / 2; + } + + imageContentObjStr += drawImageCommands(imageName, x, y, outputDims.width, outputDims.height, rotation); } if (noTextContent) { diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js index 3c5d410..b92c2b2 100644 --- a/js/generalWorkerMain.js +++ b/js/generalWorkerMain.js @@ -96,6 +96,7 @@ export async function initGeneralWorker() { obj.convertPageAbbyy = wrap('convertPageAbbyy'); obj.convertPageStext = wrap('convertPageStext'); obj.convertDocTextract = wrap('convertDocTextract'); + obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel'); obj.convertPageGoogleVision = wrap('convertPageGoogleVision'); obj.convertPageText = wrap('convertPageText'); @@ -192,6 +193,15 @@ export class gs { return gs.schedulerInner.addJob('convertDocTextract', args); }; + /** + * @param {Parameters[0]} args + * @returns {ReturnType} + */ + static convertDocAzureDocIntel = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertDocAzureDocIntel', args); + }; + /** * @param {Parameters[0]} args * @returns {ReturnType} diff --git a/js/global.d.ts b/js/global.d.ts index ef03c94..1e19cdc 100644 --- a/js/global.d.ts +++ b/js/global.d.ts @@ -13,7 +13,7 @@ declare global { // Strings representing supported sources of text. // `stext` indicates the text was extracted directly from a PDF using mupdf. - type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text'; + type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel'; type FontState = { enableOpt: boolean; @@ -330,6 +330,62 @@ declare global { text: string; } + // Azure Document Intelligence types + interface AzureDocIntelPoint { + x: number; + y: number; + } + + interface AzureDocIntelSpan { + offset: number; + length: number; + } + + interface AzureDocIntelWord { + content: string; + polygon: AzureDocIntelPoint[]; + span: AzureDocIntelSpan; + confidence: number; + } + + interface AzureDocIntelLine { + content: string; + polygon: AzureDocIntelPoint[]; + spans: AzureDocIntelSpan[]; + } + + interface AzureDocIntelStyle { + isHandwritten?: boolean; + spans: AzureDocIntelSpan[]; + confidence: number; + } + + interface AzureDocIntelPage { + pageNumber: number; + angle: number; + width: number; + height: number; + unit: 'pixel' | 'inch'; + words: AzureDocIntelWord[]; + lines: AzureDocIntelLine[]; + spans: AzureDocIntelSpan[]; + } + + interface AzureDocIntelAnalyzeResult { + apiVersion: string; + modelId: string; + content: string; + pages: AzureDocIntelPage[]; + styles: AzureDocIntelStyle[]; + } + + interface AzureDocIntelResponse { + status: 'succeeded' | 'failed' | 'running'; + createdDateTime: string; + lastUpdatedDateTime: string; + analyzeResult: AzureDocIntelAnalyzeResult; + } + } export { }; diff --git a/js/import/convertDocAzureDocIntel.js b/js/import/convertDocAzureDocIntel.js new file mode 100644 index 0000000..a429605 --- /dev/null +++ b/js/import/convertDocAzureDocIntel.js @@ -0,0 +1,140 @@ +import ocr from '../objects/ocrObjects.js'; + +import { LayoutDataTablePage } from '../objects/layoutObjects.js'; +import { pass2, pass3 } from './convertPageShared.js'; + +// NOTE: This is a WIP and incomplete. +// The Azure Document Intelligence format is not yet supported. + +const debugMode = false; + +/** + * @param {Object} params + * @param {string} params.ocrStr + * @param {dims[]} params.pageDims - Page metrics to use for the pages + */ +export async function convertDocAzureDocIntel({ ocrStr, pageDims }) { + let ocrData; + try { + ocrData = JSON.parse(ocrStr); + } catch (error) { + throw new Error('Failed to parse Azure Document Intelligence JSON data.'); + } + + if (!ocrData.analyzeResult || !ocrData.analyzeResult.pages || !ocrData.analyzeResult.pages[0]) { + throw new Error('Invalid Azure Document Intelligence format: missing pages data.'); + } + + const analyzeResultPages = /** @type {AzureDocIntelPage[]} */ (ocrData.analyzeResult.pages); + + const resArr = []; + + for (let n = 0; n < analyzeResultPages.length; n++) { + const pageData = analyzeResultPages[n]; + const pageDimsN = pageDims[n]; + + if (!pageData.width || !pageData.height) { + throw new Error('Failed to parse page dimensions.'); + } + + const pageObj = new ocr.OcrPage(n, pageDimsN); + pageObj.textSource = 'azure_doc_intel'; + + if (!pageData.words || pageData.words.length === 0) { + const warn = { char: 'char_error' }; + resArr.push({ + pageObj, charMetricsObj: {}, dataTables: new LayoutDataTablePage(n), warn, + }); + } + + if (pageData.unit !== 'pixel') { + if (!pageDimsN || !pageDimsN.width || !pageDimsN.height) { + throw new Error('Page dimensions must be provided for non-pixel units.'); + } + + const pageDimsMult = { + width: pageDimsN.width / pageData.width, + height: pageDimsN.height / pageData.height, + }; + + pageData.lines.forEach((line) => { + line.polygon = line.polygon.map((val, idx) => (idx % 2 === 0 ? val * pageDimsMult.width : val * pageDimsMult.height)); + }); + + pageData.words.forEach((word) => { + word.polygon = word.polygon.map((val, idx) => (idx % 2 === 0 ? val * pageDimsMult.width : val * pageDimsMult.height)); + }); + } + + for (let i = 0; i < pageData.lines.length; i++) { + const lineWordsInput = /** @type {AzureDocIntelWord[]} */ ([]); + for (let j = 0; j < pageData.lines[i].spans.length; j++) { + const span = pageData.lines[i].spans[j]; + for (let k = 0; k < pageData.words.length; k++) { + const wordSpan = pageData.words[k].span; + if (wordSpan.offset >= span.offset && (wordSpan.offset + wordSpan.length) <= (span.offset + span.length)) { + lineWordsInput.push(pageData.words[k]); + } + } + } + + if (lineWordsInput.length === 0) continue; + + const allX = lineWordsInput.flatMap((w) => w.polygon.filter((_, i) => i % 2 === 0)); + const allY = lineWordsInput.flatMap((w) => w.polygon.filter((_, i) => i % 2 === 1)); + + const lineBbox = { + left: Math.min(...allX), + top: Math.min(...allY), + right: Math.max(...allX), + bottom: Math.max(...allY), + }; + + const baseline = [0, 0]; + + const lineObj = new ocr.OcrLine(pageObj, lineBbox, baseline); + if (debugMode) lineObj.raw = JSON.stringify(lineWordsInput); + + for (let j = 0; j < lineWordsInput.length; j++) { + const wordData = lineWordsInput[j]; + + if (!wordData.content || wordData.content.trim() === '') continue; + + const wordX = wordData.polygon.filter((_, i) => i % 2 === 0); + const wordY = wordData.polygon.filter((_, i) => i % 2 === 1); + + const wordBbox = { + left: Math.min(...wordX), + top: Math.min(...wordY), + right: Math.max(...wordX), + bottom: Math.max(...wordY), + }; + + const wordId = `word_${n + 1}_${pageObj.lines.length + 1}_${j + 1}`; + const wordObj = new ocr.OcrWord(lineObj, wordId, wordData.content, wordBbox); + + wordObj.conf = Math.round((wordData.confidence || 0) * 100); + + if (debugMode) wordObj.raw = JSON.stringify(wordData); + + lineObj.words.push(wordObj); + } + + if (lineObj.words.length > 0) { + pageObj.lines.push(lineObj); + } + } + + const pageAngle = pageData.angle || 0; + pageObj.angle = pageAngle; + + // pass2(pageObj, 0); + const langSet = pass3(pageObj); + + const dataTables = new LayoutDataTablePage(n); + + resArr.push({ pageObj, dataTables, langSet }); + } + + return resArr; +} diff --git a/js/import/convertDocTextract.js b/js/import/convertDocTextract.js index d651a22..aadc723 100644 --- a/js/import/convertDocTextract.js +++ b/js/import/convertDocTextract.js @@ -14,6 +14,30 @@ import { } from '../objects/layoutObjects.js'; import { pass3 } from './convertPageShared.js'; +/** + * + * @param {TextractPoint[]} poly + */ +const detectPolyOrientation = (poly) => { + // 90 degrees clockwise + if (poly[0].X > poly[2].X && poly[0].Y < poly[2].Y) { + return 1; + } + + // 180 degrees + if (poly[0].X > poly[2].X && poly[0].Y > poly[2].Y) { + return 2; + } + + // 90 degrees counter-clockwise + if (poly[0].X < poly[2].X && poly[1].X < poly[3].X && poly[0].Y > poly[2].Y) { + return 3; + } + + // Default + return 0; +}; + /** * @param {Object} params * @param {string|string[]} params.ocrStr - String or array of strings containing Textract JSON data. @@ -43,6 +67,8 @@ export async function convertDocTextract({ ocrStr, pageDims }) { const resArr = []; for (let n = 0; n < pageBlocks.length; n++) { + const pageBlock = pageBlocks[n]; + // Textract uses normalized coordinates (0-1), we need to convert to pixels // We'll assume standard page dimensions since Textract doesn't provide pixel dimensions const pageDimsN = pageDims[n]; @@ -50,6 +76,13 @@ export async function convertDocTextract({ ocrStr, pageDims }) { throw new Error(`No page dimensions provided for page ${n + 1}.`); } + const pagePoly = pageBlock.Geometry && pageBlock.Geometry.Polygon ? pageBlock.Geometry.Polygon : null; + if (!pagePoly) throw new Error(`No page polygon data for page ${n + 1}.`); + + const pageOrientation = detectPolyOrientation(pagePoly); + + console.log(`Page ${n + 1} orientation: ${pageOrientation * 90} degrees`); + const pageObj = new ocr.OcrPage(n, pageDimsN); const lineBlocks = blocks.filter((block) => block.BlockType === 'LINE' && (!block.Page && n === 0 || block.Page === n + 1)); @@ -81,9 +114,6 @@ export async function convertDocTextract({ ocrStr, pageDims }) { blockMap.set(block.Id, block); }); - /** @type {Array} */ - const angleRisePage = []; - // Process layout blocks (paragraphs) and their lines const layoutBlocks = blocks.filter((block) => block.BlockType && block.BlockType.startsWith('LAYOUT_'), ); @@ -104,22 +134,13 @@ export async function convertDocTextract({ ocrStr, pageDims }) { // Process lines and convert to OCR format const lineObjMap = new Map(); lineBlocks.forEach((lineBlock, lineIndex) => { - const lineObj = convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, n, lineIndex, pageDimsN); + const lineObj = convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, n, lineIndex, pageDimsN, pageOrientation); if (lineObj) { pageObj.lines.push(lineObj); lineObjMap.set(lineBlock.Id, lineObj); - - // Collect baseline slopes for angle calculation - if (lineObj.baseline && Math.abs(lineObj.baseline[0]) > 0.001) { - angleRisePage.push(lineObj.baseline[0]); - } } }); - // Calculate page angle from line baselines - const angleRiseMedian = mean50(angleRisePage) || 0; - const angleOut = Math.asin(angleRiseMedian) * (180 / Math.PI); - pageObj.angle = angleOut; pageObj.textSource = 'textract'; // Create paragraphs from Textract layout blocks @@ -155,45 +176,133 @@ export async function convertDocTextract({ ocrStr, pageDims }) { * @param {number} pageNum - Page number (0-indexed) * @param {number} lineIndex - Index of the line block on the page * @param {dims} pageDims - Dimensions of the page in pixels + * @param {number} pageOrientation - Orientation of the page (0-3) */ -function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, pageNum, lineIndex, pageDims) { +function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, pageNum, lineIndex, pageDims, pageOrientation) { // `lineBlock.Page` will be undefined when the entire document is a single page. if (!lineBlock.Text || !lineBlock.Geometry || (lineBlock.Page || 1) - 1 !== pageNum) return null; // Convert normalized coordinates to pixels const bboxLine = convertBoundingBox(lineBlock.Geometry.BoundingBox, pageDims); - const polyLine = convertPolygon(lineBlock.Geometry.Polygon, pageDims); - - // Calculate baseline from geometry - Textract doesn't provide explicit baseline - // We'll estimate it based on the polygon points if available - let baselineSlope = 0; - if (polyLine.br.x !== polyLine.bl.x) { - baselineSlope = (polyLine.br.y - polyLine.bl.y) / (polyLine.br.x - polyLine.bl.x); - } + const polyLine0 = convertPolygon(lineBlock.Geometry.Polygon, pageDims, pageOrientation); + let polyLine = /** @type {Polygon} */ (JSON.parse(JSON.stringify(polyLine0))); - const baseline = [baselineSlope, 0]; + const baseline = [0, 0]; const lineObj = new ocr.OcrLine(pageObj, bboxLine, baseline); const childIds = relationshipMap.get(lineBlock.Id) || []; - childIds.forEach((wordId, wordIndex) => { - const wordBlock = blockMap.get(wordId); - if (wordBlock && wordBlock.BlockType === 'WORD') { - if (!wordBlock.Text || !wordBlock.Geometry) return; + const wordBlocks = /** @type {TextractBlock[]} */ (childIds.map((wordId) => blockMap.get(wordId)).filter((block) => block && block.BlockType === 'WORD')); - const bboxWord = convertBoundingBox(wordBlock.Geometry.BoundingBox, pageDims); - const id = `word_${pageNum + 1}_${lineIndex + 1}_${wordIndex + 1}`; + wordBlocks.forEach((wordBlock, wordIndex) => { + const bboxWord = convertBoundingBox(wordBlock.Geometry.BoundingBox, pageDims); + const id = `word_${pageNum + 1}_${lineIndex + 1}_${wordIndex + 1}`; - const poly = convertPolygon(wordBlock.Geometry.Polygon, pageDims); + const poly = convertPolygon(wordBlock.Geometry.Polygon, pageDims, pageOrientation); - const wordObj = new ocr.OcrWord(lineObj, id, wordBlock.Text, bboxWord, poly); - wordObj.conf = wordBlock.Confidence || 100; + const wordObj = new ocr.OcrWord(lineObj, id, wordBlock.Text, bboxWord, poly); + wordObj.conf = wordBlock.Confidence || 100; - lineObj.words.push(wordObj); - } + lineObj.words.push(wordObj); }); + if (!wordBlocks.length || !lineObj.words.length) { + console.warn(`Warning: Line with no words on page ${pageNum + 1}, line index ${lineIndex + 1}. Skipping line.`); + return null; + } + + const lineOrientation = (wordBlocks[0].Geometry.RotationAngle || 0) / 90; + + // @ts-ignore + lineObj.orientation = pageOrientation - lineOrientation; + if (lineObj.orientation < 0) { + lineObj.orientation += 4; + } + + if (lineObj.orientation === 1) { + const lineBox = { ...lineObj.bbox }; + lineObj.bbox.left = lineBox.top; + lineObj.bbox.top = pageDims.width - lineBox.right; + lineObj.bbox.right = lineBox.bottom; + lineObj.bbox.bottom = pageDims.width - lineBox.left; + lineObj.words.forEach((word) => { + const wordBox = { ...word.bbox }; + word.bbox.left = word.bbox.top; + word.bbox.top = pageDims.width - wordBox.right; + word.bbox.right = wordBox.bottom; + word.bbox.bottom = pageDims.width - wordBox.left; + word.poly = { + tl: { x: word.poly.tr.y, y: pageDims.width - word.poly.tr.x }, + tr: { x: word.poly.br.y, y: pageDims.width - word.poly.br.x }, + br: { x: word.poly.bl.y, y: pageDims.width - word.poly.bl.x }, + bl: { x: word.poly.tl.y, y: pageDims.width - word.poly.tl.x }, + }; + }); + polyLine = { + tl: { x: polyLine0.tr.y, y: pageDims.width - polyLine0.tr.x }, + tr: { x: polyLine0.br.y, y: pageDims.width - polyLine0.br.x }, + br: { x: polyLine0.bl.y, y: pageDims.width - polyLine0.bl.x }, + bl: { x: polyLine0.tl.y, y: pageDims.width - polyLine0.tl.x }, + }; + } else if (lineObj.orientation === 2) { + const lineBox = { ...lineObj.bbox }; + lineObj.bbox.left = pageDims.width - lineBox.right; + lineObj.bbox.top = pageDims.height - lineBox.bottom; + lineObj.bbox.right = pageDims.width - lineBox.left; + lineObj.bbox.bottom = pageDims.height - lineBox.top; + lineObj.words.forEach((word) => { + const wordBox = { ...word.bbox }; + word.bbox.left = pageDims.width - wordBox.right; + word.bbox.top = pageDims.height - wordBox.bottom; + word.bbox.right = pageDims.width - wordBox.left; + word.bbox.bottom = pageDims.height - wordBox.top; + word.poly = { + tl: { x: pageDims.width - word.poly.br.x, y: pageDims.height - word.poly.br.y }, + tr: { x: pageDims.width - word.poly.bl.x, y: pageDims.height - word.poly.bl.y }, + br: { x: pageDims.width - word.poly.tl.x, y: pageDims.height - word.poly.tl.y }, + bl: { x: pageDims.width - word.poly.tr.x, y: pageDims.height - word.poly.tr.y }, + }; + }); + polyLine = { + tl: { x: pageDims.width - polyLine0.br.x, y: pageDims.height - polyLine0.br.y }, + tr: { x: pageDims.width - polyLine0.bl.x, y: pageDims.height - polyLine0.bl.y }, + br: { x: pageDims.width - polyLine0.tl.x, y: pageDims.height - polyLine0.tl.y }, + bl: { x: pageDims.width - polyLine0.tr.x, y: pageDims.height - polyLine0.tr.y }, + }; + } else if (lineObj.orientation === 3) { + const lineBox = { ...lineObj.bbox }; + lineObj.bbox.left = pageDims.height - lineBox.bottom; + lineObj.bbox.top = lineBox.left; + lineObj.bbox.right = pageDims.height - lineBox.top; + lineObj.bbox.bottom = lineBox.right; + lineObj.words.forEach((word) => { + const wordBox = { ...word.bbox }; + word.bbox.left = pageDims.height - wordBox.bottom; + word.bbox.top = wordBox.left; + word.bbox.right = pageDims.height - wordBox.top; + word.bbox.bottom = wordBox.right; + word.poly = { + tl: { x: pageDims.height - word.poly.bl.y, y: word.poly.bl.x }, + tr: { x: pageDims.height - word.poly.tl.y, y: word.poly.tl.x }, + br: { x: pageDims.height - word.poly.tr.y, y: word.poly.tr.x }, + bl: { x: pageDims.height - word.poly.br.y, y: word.poly.br.x }, + }; + }); + polyLine = { + tl: { x: pageDims.height - polyLine0.bl.y, y: polyLine0.bl.x }, + tr: { x: pageDims.height - polyLine0.tl.y, y: polyLine0.tl.x }, + br: { x: pageDims.height - polyLine0.tr.y, y: polyLine0.tr.x }, + bl: { x: pageDims.height - polyLine0.br.y, y: polyLine0.br.x }, + }; + } + + // Calculate baseline from geometry - Textract doesn't provide explicit baseline + // We'll estimate it based on the polygon points if available + if (polyLine.br.x !== polyLine.bl.x) { + lineObj.baseline[0] = (polyLine.br.y - polyLine.bl.y) / (polyLine.br.x - polyLine.bl.x); + } + const descCharRegex = new RegExp(`[${descCharArr.join('')}]`); const ascCharRegex = new RegExp(`[${ascCharArr.join('')}]`); const xCharRegex = new RegExp(`[${xCharArr.join('')}]`); @@ -257,11 +366,11 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page }); const nonDescBottomDelta = mean50(nonDescBottomDeltaArr); - const lineHeight = ((polyLine.tr.y - polyLine.br.y) + (polyLine.tr.y - polyLine.br.y)) / 2; - if (Number.isFinite(nonDescBottomDelta) && nonDescBottomDelta < lineObj.bbox.bottom && nonDescBottomDelta > (lineHeight / 2)) { + const lineHeight = ((polyLine.br.y - polyLine.tr.y) + (polyLine.bl.y - polyLine.tl.y)) / 2; + if (Number.isFinite(nonDescBottomDelta) && nonDescBottomDelta < lineObj.bbox.bottom && nonDescBottomDelta < (lineHeight / 2)) { lineObj.baseline[1] = nonDescBottomDelta - (lineObj.bbox.bottom - polyLine.bl.y); - } else if (descWords.length > 0) { - lineObj.baseline[1] = lineHeight / 3 - (lineObj.bbox.bottom - polyLine.bl.y); + } else { + lineObj.baseline[1] = lineHeight * -1 / 3 - (lineObj.bbox.bottom - polyLine.bl.y); } // TODO: Properly process metrics when these are negative. @@ -270,7 +379,7 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page if (xHeight && xHeight > 0) lineObj.xHeight = xHeight; if (ascHeight && ascHeight > 0) lineObj.ascHeight = ascHeight; - return lineObj.words.length > 0 ? lineObj : null; + return lineObj; } /** @@ -292,25 +401,48 @@ function convertBoundingBox(textractBbox, pageDims) { * * @param {TextractPoint[]} textractPolygon * @param {dims} pageDims + * @param {number} orientation * @return {Polygon} */ -function convertPolygon(textractPolygon, pageDims) { +function convertPolygon(textractPolygon, pageDims, orientation) { + let br = 2; + let bl = 3; + let tr = 1; + let tl = 0; + + if (orientation === 1) { + br = 1; + bl = 2; + tr = 0; + tl = 3; + } else if (orientation === 2) { + br = 0; + bl = 1; + tr = 3; + tl = 2; + } else if (orientation === 3) { + br = 3; + bl = 0; + tr = 2; + tl = 1; + } + return { br: { - x: Math.round(textractPolygon[2].X * pageDims.width), - y: Math.round(textractPolygon[2].Y * pageDims.height), + x: Math.round(textractPolygon[br].X * pageDims.width), + y: Math.round(textractPolygon[br].Y * pageDims.height), }, bl: { - x: Math.round(textractPolygon[3].X * pageDims.width), - y: Math.round(textractPolygon[3].Y * pageDims.height), + x: Math.round(textractPolygon[bl].X * pageDims.width), + y: Math.round(textractPolygon[bl].Y * pageDims.height), }, tr: { - x: Math.round(textractPolygon[1].X * pageDims.width), - y: Math.round(textractPolygon[1].Y * pageDims.height), + x: Math.round(textractPolygon[tr].X * pageDims.width), + y: Math.round(textractPolygon[tr].Y * pageDims.height), }, tl: { - x: Math.round(textractPolygon[0].X * pageDims.width), - y: Math.round(textractPolygon[0].Y * pageDims.height), + x: Math.round(textractPolygon[tl].X * pageDims.width), + y: Math.round(textractPolygon[tl].Y * pageDims.height), }, }; } diff --git a/js/import/import.js b/js/import/import.js index 591d467..4beee11 100644 --- a/js/import/import.js +++ b/js/import/import.js @@ -313,7 +313,7 @@ export async function importFiles(files) { let pageCount; let pageCountImage; - /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ + /** @type {TextSource} */ let format; let reimportHocrMode = false; @@ -457,7 +457,7 @@ export async function importFiles(files) { await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode, pageMetricsAll).then(async () => { // Skip this step if optimization info was already restored from a previous session, // or if using stext/textract (which are character-level but not visually accurate). - if (!existingOpt && !['stext', 'textract'].includes(format)) { + if (!existingOpt && !['stext', 'textract', 'google_vision', 'azure_doc_intel'].includes(format)) { await checkCharWarn(convertPageWarn); const charMetrics = calcCharMetricsFromPages(ocrAll.active); diff --git a/js/import/importOCR.js b/js/import/importOCR.js index cef15a8..f575887 100644 --- a/js/import/importOCR.js +++ b/js/import/importOCR.js @@ -12,7 +12,7 @@ export const splitHOCRStr = (hocrStrAll) => hocrStrAll.replace(/[\s\S]*?/, * * @param {string} ocrStr - The OCR string to detect the format of. * @param {string} [ext] - The file extension of the OCR file. - * @returns {"hocr" | "stext" | "abbyy" | "textract" | "google_vision" | "text" | null} + * @returns {?TextSource} */ const detectOcrFormat = (ocrStr, ext) => { if (ext) { @@ -31,22 +31,37 @@ const detectOcrFormat = (ocrStr, ext) => { if (!!node2 && !!/abbyy/i.test(node2)) { return 'abbyy'; - } if (!!node2 && !!/ a.context.pageNumber - b.context.pageNumber) + .map((resp) => JSON.stringify(resp)); + } + } else if (format === 'azure_doc_intel') { + hocrRaw = [hocrStrAll]; } else if (format === 'abbyy') { hocrRaw = hocrStrAll.split(/(?= (metrics.dims)); + const res = await gs.convertDocAzureDocIntel({ ocrStr: ocrRawArr, pageDims }); + for (let n = 0; n < res.length; n++) { + await convertPageCallback(res[n], n, mainData, engineName); + } + return; + } + if (format === 'text') { const res = await gs.convertPageText({ textStr: ocrRawArr[0] }); diff --git a/js/worker/generalWorker.js b/js/worker/generalWorker.js index 9ee67c7..6daae00 100644 --- a/js/worker/generalWorker.js +++ b/js/worker/generalWorker.js @@ -3,6 +3,7 @@ import { convertPageBlocks } from '../import/convertPageBlocks.js'; import { convertPageHocr } from '../import/convertPageHocr.js'; import { convertPageStext } from '../import/convertPageStext.js'; import { convertDocTextract } from '../import/convertDocTextract.js'; +import { convertDocAzureDocIntel } from '../import/convertDocAzureDocIntel.js'; import { convertPageGoogleVision } from '../import/convertPageGoogleVision.js'; import { convertPageText } from '../import/convertPageText.js'; @@ -402,6 +403,7 @@ const handleMessage = async (data) => { convertPageHocr, convertPageStext, convertDocTextract, + convertDocAzureDocIntel, convertPageGoogleVision, convertPageBlocks, convertPageText,