diff --git a/cli/main.js b/cli/main.js index 9a4db65..c323d57 100644 --- a/cli/main.js +++ b/cli/main.js @@ -103,7 +103,7 @@ async function main(func, params) { const debugDir = `${outputDir}/${outputStem}_debug`; fs.mkdirSync(debugDir, { recursive: true }); const outputPathCsv = `${debugDir}/_debug.csv`; - scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv); + scribe.utils.writeDebugCsv({ pages: scribe.data.ocr.active, fileName: outputPathCsv }); scribe.utils.dumpDebugImages(debugDir); scribe.utils.dumpHOCR(debugDir); diff --git a/js/clear.js b/js/clear.js index 7305b8f..104ec85 100644 --- a/js/clear.js +++ b/js/clear.js @@ -5,7 +5,7 @@ import { layoutRegions, ocrAll, ocrAllRaw, - pageMetricsArr, + pageMetricsAll, } from './containers/dataContainer.js'; import { FontCont } from './containers/fontContainer.js'; import { ImageCache } from './containers/imageContainer.js'; @@ -19,7 +19,7 @@ export function clearData() { ocrAllRaw.active = []; layoutRegions.pages.length = 0; layoutDataTables.pages.length = 0; - pageMetricsArr.length = 0; + pageMetricsAll.length = 0; convertPageWarn.length = 0; ImageCache.clear(); FontCont.clear(); diff --git a/js/containers/dataContainer.js b/js/containers/dataContainer.js index 345e2b8..f4b7b41 100644 --- a/js/containers/dataContainer.js +++ b/js/containers/dataContainer.js @@ -63,7 +63,7 @@ export const ocrAll = { active: [] }; export const ocrAllRaw = { active: [] }; /** @type {Array} */ -export const pageMetricsArr = []; +export const pageMetricsAll = []; /** * Class that stores various debug data. diff --git a/js/containers/imageContainer.js b/js/containers/imageContainer.js index 12d6dfa..a62b93f 100644 --- a/js/containers/imageContainer.js +++ b/js/containers/imageContainer.js @@ -5,7 +5,7 @@ import { import { initMuPDFWorker } from '../../mupdf/mupdf-async.js'; import { updateFontContWorkerMain } from '../fontContainerMain.js'; -import { pageMetricsArr } from './dataContainer.js'; +import { pageMetricsAll } from './dataContainer.js'; import { FontCont, FontContainerFont, @@ -13,7 +13,7 @@ import { } from './fontContainer.js'; import { gs } from '../generalWorkerMain.js'; -import { imageUtils } from '../objects/imageObjects.js'; +import { imageUtils, ImageWrapper } from '../objects/imageObjects.js'; import { range } from '../utils/miscUtils.js'; import { opt } from './app.js'; @@ -42,32 +42,6 @@ export class MuPDFScheduler { } } -export class ImageWrapper { - /** - * @param {number} n - Page number - * @param {string} imageStr - Base-64 encoded image string. Should start with "data:image/png" or "data:image/jpeg". - * @param {string} colorMode - Color mode ("color", "gray", or "binary"). - * @param {boolean} rotated - Whether image has been rotated. - * @param {boolean} upscaled - Whether image has been upscaled. - * - * All properties of this object must be serializable, as ImageWrapper objects are sent between threads. - * This means that no promises can be used. - */ - constructor(n, imageStr, colorMode, rotated = false, upscaled = false) { - this.n = n; - this.src = imageStr; - const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1]; - if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`); - this.format = format0; - this._dims = null; - this.rotated = rotated; - this.upscaled = upscaled; - this.colorMode = colorMode; - /** @type {?ImageBitmap} */ - this.imageBitmap = null; - } -} - /** * @typedef {Object} ImageProperties * @property {boolean} [rotated] @@ -126,7 +100,7 @@ export class ImageCache { colorMode = color ? 'color' : 'gray'; } - let pageAngle = pageMetricsArr[n].angle || 0; + let pageAngle = pageMetricsAll[n].angle || 0; if (Math.abs(pageAngle) < 0.05) pageAngle = 0; // If no preference is specified for rotation, default to true. @@ -213,7 +187,7 @@ export class ImageCache { if (ImageCache.inputModes.image) { return ImageCache.nativeSrc[n]; } if (ImageCache.inputModes.pdf) { - const pageMetrics = pageMetricsArr[n]; + const pageMetrics = pageMetricsAll[n]; const targetWidth = pageMetrics.dims.width; const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width); const muPDFScheduler = await ImageCache.getMuPDFScheduler(); @@ -232,7 +206,7 @@ export class ImageCache { * @param {boolean} [saveNativeImage=true] - Whether the native image should be saved. */ static transformImage = async (inputImage, n, props, saveNativeImage = true) => { - let pageAngle = pageMetricsArr[n].angle || 0; + let pageAngle = pageMetricsAll[n].angle || 0; if (Math.abs(pageAngle) < 0.05) pageAngle = 0; // If no preference is specified for rotation, default to true. @@ -245,8 +219,8 @@ export class ImageCache { await gs.getGeneralScheduler(); const resPromise = (async () => { - // Wait for non-rotated version before replacing with promise - if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true }); + // Wait for non-rotated version before replacing with promise + await gs.initTesseract({ anyOk: true }); return gs.recognize({ image: inputImage.src, options: { rotateRadians: angleArg, upscale: upscaleArg }, @@ -280,7 +254,7 @@ export class ImageCache { return { native: undefined, binary: undefined }; } - const significantRotation = Math.abs(pageMetricsArr[n].angle || 0) > 0.05; + const significantRotation = Math.abs(pageMetricsAll[n].angle || 0) > 0.05; const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation); const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation)); @@ -426,7 +400,7 @@ export class ImageCache { ImageCache.pdfDims300.forEach((x, i) => { const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) }; - pageMetricsArr[i] = new PageMetrics(pageDims); + pageMetricsAll[i] = new PageMetrics(pageDims); }); // WIP: Extract fonts embedded in PDFs. diff --git a/js/coordinates.js b/js/coordinates.js index 423d7ae..3ea58a4 100644 --- a/js/coordinates.js +++ b/js/coordinates.js @@ -3,7 +3,7 @@ // Image Coordinate Space: coordinate space of a particular image // Canvas Coordinate Space: coordinate space of canvas, used for user interactions -import { pageMetricsArr } from './containers/dataContainer.js'; +import { pageMetricsAll } from './containers/dataContainer.js'; import { ImageCache } from './containers/imageContainer.js'; /** @@ -27,7 +27,7 @@ function rotateBoundingBox(boundingBox, rotateAngle, n) { let angleAdjXRect = 0; let angleAdjYRect = 0; - const pageDims = pageMetricsArr[n].dims; + const pageDims = pageMetricsAll[n].dims; const sinAngle = Math.sin(rotateAngle * (Math.PI / 180)); const cosAngle = Math.cos(rotateAngle * (Math.PI / 180)); @@ -103,7 +103,7 @@ async function ocrToImage(ocrCoords, n, binary = false) { if (imageN.rotated) { // Otherwise, we must also account for rotation applied by the canvas - const rotateAngle = (pageMetricsArr[n].angle || 0) * -1; + const rotateAngle = (pageMetricsAll[n].angle || 0) * -1; rotateBoundingBox(ocrCoords, rotateAngle, n); } diff --git a/js/debug.js b/js/debug.js index 08097f5..eca510e 100644 --- a/js/debug.js +++ b/js/debug.js @@ -1,5 +1,5 @@ import { opt } from './containers/app.js'; -import { pageMetricsArr } from './containers/dataContainer.js'; +import { pageMetricsAll } from './containers/dataContainer.js'; import { ImageCache } from './containers/imageContainer.js'; import { gs } from './generalWorkerMain.js'; import { loadImageElem } from './utils/imageUtils.js'; @@ -125,7 +125,7 @@ export async function renderPageStatic(page) { const res = gs.renderPageStaticImp({ page, image, - angle: pageMetricsArr[page.n].angle, + angle: pageMetricsAll[page.n].angle, }); return res; diff --git a/js/export/export.js b/js/export/export.js index b59a77e..3dea5c9 100644 --- a/js/export/export.js +++ b/js/export/export.js @@ -1,11 +1,11 @@ import { inputData, opt } from '../containers/app.js'; import { - layoutDataTables, layoutRegions, ocrAll, pageMetricsArr, + layoutDataTables, layoutRegions, ocrAll, pageMetricsAll, } from '../containers/dataContainer.js'; import { ImageCache } from '../containers/imageContainer.js'; import { reorderOcrPage } from '../modifyOCR.js'; import { saveAs } from '../utils/miscUtils.js'; -import { writePdf } from './writePdf.js'; +import { writePdf } from './pdf/writePdf.js'; import { writeHocr } from './writeHocr.js'; import { writeText } from './writeText.js'; import { writeHtml } from './writeHtml.js'; @@ -45,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { const dimsLimit = { width: -1, height: -1 }; if (opt.standardizePageSize) { for (let i = minPage; i <= maxPage; i++) { - dimsLimit.height = Math.max(dimsLimit.height, pageMetricsArr[i].dims.height); - dimsLimit.width = Math.max(dimsLimit.width, pageMetricsArr[i].dims.width); + dimsLimit.height = Math.max(dimsLimit.height, pageMetricsAll[i].dims.height); + dimsLimit.width = Math.max(dimsLimit.width, pageMetricsAll[i].dims.width); } } @@ -58,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { const rotateText = !rotateBackground; + const includeImages = false; + /** @type {ImageWrapper[]} */ + let images = []; + if (includeImages) { + images = await Promise.all(ImageCache.nativeSrc); + } + // Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this, // and assume that the overlay PDF is the same size as the input images. - const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, rotateText, rotateBackground, - { width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100); + const pdfStr = await writePdf({ + ocrArr: ocrDownload, + pageMetricsArr: pageMetricsAll, + minpage: minPage, + maxpage: maxPage, + textMode: opt.displayMode, + rotateText, + rotateBackground, + dimsLimit: { width: -1, height: -1 }, + confThreshHigh: opt.confThreshHigh, + confThreshMed: opt.confThreshMed, + proofOpacity: opt.overlayOpacity / 100, + images, + includeImages, + }); const enc = new TextEncoder(); const pdfEnc = enc.encode(pdfStr); @@ -121,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { await w.convertImageStart({ humanReadable: opt.humanReadablePDF }); for (let i = minPage; i < maxPage + 1; i++) { - /** @type {import('../containers/imageContainer.js').ImageWrapper} */ + /** @type {ImageWrapper} */ let image; if (binary) { image = await ImageCache.getBinary(i, props); @@ -134,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { // Angle the PDF viewer is instructed to rotated the image by. // This method is currently only used when rotation is needed but the user's (unrotated) source images are being used. // If the images are being rendered, then rotation is expected to be applied within the rendering process. - const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsArr[i].angle || 0) * -1 : 0; + const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsAll[i].angle || 0) * -1 : 0; await w.convertImageAddPage({ image: image.src, i, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, angle: angleImagePdf, @@ -157,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { w.freeDocument(pdfOverlay); } else { - const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed, - opt.overlayOpacity / 100); + const pdfStr = await writePdf({ + ocrArr: ocrDownload, + pageMetricsArr: pageMetricsAll, + minpage: minPage, + maxpage: maxPage, + textMode: opt.displayMode, + rotateText: false, + rotateBackground: true, + dimsLimit, + confThreshHigh: opt.confThreshHigh, + confThreshMed: opt.confThreshMed, + proofOpacity: opt.overlayOpacity / 100, + }); // The PDF is still run through muPDF, even thought in eBook mode no background layer is added. // This is because muPDF cleans up the PDF we made in the previous step, including: @@ -186,7 +217,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { w.freeDocument(pdf); } } else if (format === 'hocr') { - content = writeHocr(ocrDownload, minPage, maxPage); + content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage }); } else if (format === 'html') { const images = /** @type {Array} */ ([]); if (opt.includeImages) { @@ -218,18 +249,29 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) { ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins, }); } else if (format === 'txt') { - content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false); + content = writeText({ + ocrCurrent: ocrDownload, + minpage: minPage, + maxpage: maxPage, + reflowText: opt.reflow, + docxMode: false, + }); // Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools. // @ts-ignore } else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx') { // Less common export formats are loaded dynamically to reduce initial load time. const writeDocx = (await import('./writeDocx.js')).writeDocx; - content = await writeDocx(ocrDownload, minPage, maxPage); + content = await writeDocx({ hocrCurrent: ocrDownload, minpage: minPage, maxpage: maxPage }); // @ts-ignore } else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx') { // Less common export formats are loaded dynamically to reduce initial load time. const writeXlsx = (await import('./writeTabular.js')).writeXlsx; - content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage); + content = await writeXlsx({ + ocrPageArr: ocrDownload, + layoutPageArr: layoutDataTables.pages, + minpage: minPage, + maxpage: maxPage, + }); } else if (format === 'scribe') { const data = { ocr: removeCircularRefsOcr(ocrDownload), diff --git a/js/export/exportDebugCsv.js b/js/export/exportDebugCsv.js index d665c58..6e08fb4 100644 --- a/js/export/exportDebugCsv.js +++ b/js/export/exportDebugCsv.js @@ -39,11 +39,12 @@ export const convertToCsv = (data) => { /** * - * @param {Array} pages - * @param {string} fileName + * @param {Object} params + * @param {Array} params.pages + * @param {string} params.fileName * @returns */ -export const writeDebugCsv = (pages, fileName) => { +export const writeDebugCsv = ({ pages, fileName }) => { let csvStr = ''; for (let i = 0; i < pages.length; i++) { diff --git a/js/export/pdf/writePdf.js b/js/export/pdf/writePdf.js new file mode 100644 index 0000000..000981f --- /dev/null +++ b/js/export/pdf/writePdf.js @@ -0,0 +1,389 @@ +import { FontCont } from '../../containers/fontContainer.js'; + +import { createEmbeddedFontType0, createEmbeddedFontType1 } from './writePdfFonts.js'; +import { createEmbeddedImages, createImageResourceDict, drawImageCommands } from './writePdfImages.js'; + +import { opt } from '../../containers/app.js'; +import { ocrPageToPDFStream } from './writePdfText.js'; +import { getDistinctCharsFont, subsetFont } from '../../utils/fontUtils.js'; + +// Creates 3 PDF objects necessary to embed font. +// These are (1) the font dictionary, (2) the font descriptor, and (3) the font file, +// which will be located at objects firstObjIndex, firstObjIndex + 1, and firstObjIndex + 2 (respectively). + +/** + * Create a PDF from an array of ocrPage objects. + * + * @param {Object} params + * @param {Array} params.ocrArr - + * @param {PageMetrics[]} params.pageMetricsArr - + * @param {number} [params.minpage=0] - + * @param {number} [params.maxpage=-1] - + * @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] - + * @param {boolean} [params.rotateText=false] - + * @param {boolean} [params.rotateBackground=false] - + * @param {dims} [params.dimsLimit] - + * @param {number} [params.confThreshHigh=85] - + * @param {number} [params.confThreshMed=75] - + * @param {number} [params.proofOpacity=0.8] - + * @param {Array} [params.images=[]] - Array of images to include in PDF + * @param {boolean} [params.includeImages=false] - Whether to include images in the PDF + * + * A valid PDF will be created if an empty array is provided for `ocrArr`, as long as `maxpage` is set manually. + */ +export async function writePdf({ + ocrArr, + pageMetricsArr, + minpage = 0, + maxpage = -1, + textMode = 'ebook', + rotateText = false, + rotateBackground = false, + dimsLimit = { width: -1, height: -1 }, + confThreshHigh = 85, + confThreshMed = 75, + proofOpacity = 0.8, + images = [], + includeImages = false, +}) { + if (!FontCont.raw) throw new Error('No fonts loaded.'); + + if (maxpage === -1) { + maxpage = ocrArr.length - 1; + } + + // This can happen if (1) `ocrArr` is length 0 and (2) `maxpage` is left as the default (-1). + if (maxpage < 0) throw new Error('PDF with negative page count requested.'); + + let fontI = 0; + let objectI = 3; + /** @type {Object} */ + const pdfFonts = {}; + /** @type {{familyKey: string, key: string}[]} */ + const pdfFontRefs = []; + /** @type {string[][]} */ + const pdfFontObjStrArr = []; + /** @type {Set} */ + const pdfFontsUsed = new Set(); + + /** + * + * @param {string} familyKey + * @param {FontContainerFamily} familyObj + */ + const addFontFamilyRef = async (familyKey, familyObj) => { + pdfFonts[familyKey] = {}; + for (const [key, value] of Object.entries(familyObj)) { + // This should include both (1) if this is a standard 14 font and (2) if characters outside of the Windows-1252 range are used. + // If the latter is true, then a composite font is needed, even if the font is a standard 14 font. + // TODO: We currently have no mechanism for resolving name conflicts between fonts in the base and overlay document. + // As a workaround, we use the names `/FO[n]` rather than the more common `/F[n]`. + // However, this likely will cause issues if this application is used to create visible text, and then the resulting PDF is uploaded. + // This would move the fonts from the overlay document to the base document, and the names would conflict. + const isStandardFont = false; + if (isStandardFont) { + pdfFonts[familyKey][key] = { + type: 1, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: value.opentype, + }; + pdfFontRefs.push({ familyKey, key }); + pdfFontObjStrArr.push(null); + objectI += 3; + } else { + pdfFonts[familyKey][key] = { + type: 0, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: value.opentype, + }; + pdfFontRefs.push({ familyKey, key }); + pdfFontObjStrArr.push(null); + objectI += 6; + } + fontI++; + } + }; + + // Create reference to all fonts. + // Only the fonts that are actually used will be included in the final PDF. + for (const familyKeyI of Object.keys(FontCont.raw)) { + const useOpt = FontCont.useOptFamily(familyKeyI); + const familyObjI = { + normal: useOpt && FontCont.opt?.[familyKeyI]?.normal ? FontCont.opt[familyKeyI].normal : FontCont.raw[familyKeyI].normal, + italic: useOpt && FontCont.opt?.[familyKeyI]?.italic ? FontCont.opt[familyKeyI].italic : FontCont.raw[familyKeyI].italic, + bold: useOpt && FontCont.opt?.[familyKeyI]?.bold ? FontCont.opt[familyKeyI].bold : FontCont.raw[familyKeyI].bold, + boldItalic: useOpt && FontCont.opt?.[familyKeyI]?.boldItalic ? FontCont.opt[familyKeyI].boldItalic : FontCont.raw[familyKeyI].boldItalic, + }; + await addFontFamilyRef(familyKeyI, familyObjI); + } + + if (FontCont.doc) { + for (const familyKeyI of Object.keys(FontCont.doc)) { + await addFontFamilyRef(familyKeyI, FontCont.doc[familyKeyI]); + } + } + + if (FontCont.supp.chi_sim) { + const charArr = getDistinctCharsFont(ocrArr, FontCont.supp.chi_sim.family); + + if (charArr.length > 0) { + const fontExport = await subsetFont(FontCont.supp.chi_sim.opentype, charArr); + + pdfFonts.NotoSansSC = {}; + pdfFonts.NotoSansSC.normal = { + type: 0, index: fontI, name: `/FO${String(fontI)}`, objN: objectI, opentype: fontExport, + }; + pdfFontRefs.push({ familyKey: 'NotoSansSC', key: 'normal' }); + pdfFontObjStrArr.push(null); + objectI += 6; + fontI++; + } + } + + // Add images [WIP] + /** @type {Array} */ + const pdfImageObjStrArr = []; + const imageObjIndices = []; + + if (includeImages && images && images.length > 0) { + const imageObjects = createEmbeddedImages(images, objectI); + for (let i = 0; i < imageObjects.length; i++) { + pdfImageObjStrArr.push(imageObjects[i]); + imageObjIndices.push(objectI + i); + } + objectI += imageObjects.length; + } + + /** @type {Array} */ + const pdfPageObjStrArr = []; + + // Add pages + const pageIndexArr = []; + for (let i = minpage; i <= maxpage; i++) { + const angle = pageMetricsArr[i].angle || 0; + const { dims } = pageMetricsArr[i]; + + // eslint-disable-next-line no-await-in-loop + const { pdfObj, pdfFontsUsed: pdfFontsUsedI } = (await ocrPageToPDF({ + pageObj: ocrArr[i], + inputDims: dims, + outputDims: dimsLimit, + firstObjIndex: objectI, + parentIndex: 2, + proofOpacity, + pdfFonts, + textMode, + angle, + rotateText, + rotateBackground, + confThreshHigh, + confThreshMed, + imageObjIndices, + includeImages, + })); + + for (const font of pdfFontsUsedI) { + pdfFontsUsed.add(font); + } + + for (let j = 0; j < pdfObj.length; j++) { + pdfPageObjStrArr.push(pdfObj[j]); + } + + // This assumes the "page" is always the first object returned by `ocrPageToPDF`. + pageIndexArr.push(objectI); + + objectI += pdfObj.length; + + opt.progressHandler({ n: i, type: 'export', info: { } }); + } + + // Create font objects for fonts that are used + for (const pdfFont of pdfFontsUsed) { + if (pdfFont.opentype?.names?.postScriptName?.en === 'NotoSansSC-Regular') continue; + const isStandardFont = false; + if (isStandardFont) { + pdfFontObjStrArr[pdfFont.index] = createEmbeddedFontType1(pdfFont.opentype, pdfFont.objN); + } else { + pdfFontObjStrArr[pdfFont.index] = createEmbeddedFontType0({ font: pdfFont.opentype, firstObjIndex: pdfFont.objN }); + } + } + + /** @type {Array} */ + const pdfObjStrArr = []; + + let pdfOut = '%PDF-1.7\n%µ¶n\n'; + + pdfObjStrArr.push('1 0 obj\n<>\nendobj\n\n'); + + let pagesObjStr = '2 0 obj\n<>\nendobj\n\n`; + + pdfObjStrArr.push(pagesObjStr); + + /** @type {{type: string, offset: number}[]} */ + const xrefArr = []; + for (let i = 0; i < pdfObjStrArr.length; i++) { + xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 }); + pdfOut += pdfObjStrArr[i]; + } + + for (let i = 0; i < pdfFontRefs.length; i++) { + if (pdfFontObjStrArr[i]) { + for (let j = 0; j < pdfFontObjStrArr[i].length; j++) { + xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 }); + pdfOut += pdfFontObjStrArr[i][j]; + } + } else { + xrefArr.push({ type: 'free', offset: 0 }); + xrefArr.push({ type: 'free', offset: 0 }); + xrefArr.push({ type: 'free', offset: 0 }); + xrefArr.push({ type: 'free', offset: 0 }); + xrefArr.push({ type: 'free', offset: 0 }); + xrefArr.push({ type: 'free', offset: 0 }); + } + } + + for (let i = 0; i < pdfImageObjStrArr.length; i++) { + xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 }); + pdfOut += pdfImageObjStrArr[i]; + } + + for (let i = 0; i < pdfPageObjStrArr.length; i++) { + xrefArr.push({ type: 'obj', offset: pdfOut.length + 2 }); + pdfOut += pdfPageObjStrArr[i]; + } + + // The 0th object always exists, and contains no meaningful data. + const objCount = pdfObjStrArr.length + pdfFontRefs.length * 6 + pdfImageObjStrArr.length + pdfPageObjStrArr.length + 1; + + const xrefOffset = pdfOut.length + 2; + + let xrefStr = `xref\n0 ${objCount}\n`; + + xrefStr += '0000000000 65535 f\n'; + + for (let i = 0; i < xrefArr.length; i++) { + if (xrefArr[i].type === 'obj') { + xrefStr += `${String(xrefArr[i].offset).padStart(10, '0')} 00000 n\n`; + } else { + xrefStr += '0000000000 65535 f\n'; + } + } + + xrefStr += `trailer + << /Root 1 0 R + /Size ${objCount} + >> +startxref +${xrefOffset} +%%EOF`; + + pdfOut += xrefStr; + + return pdfOut; +} + +/** + * Generates PDF objects for a single page of OCR data. + * Generally returns an array of 2 strings, the first being the text content object, and the second being the page object. + * If there is no text content, only the page object is returned. + * @param {Object} params - Parameters object + * @param {OcrPage} params.pageObj + * @param {dims} params.inputDims + * @param {dims} params.outputDims + * @param {number} params.firstObjIndex + * @param {number} params.parentIndex + * @param {number} params.proofOpacity + * @param {Object} params.pdfFonts + * @param {("ebook"|"eval"|"proof"|"invis")} params.textMode - + * @param {number} params.angle + * @param {boolean} [params.rotateText=false] + * @param {boolean} [params.rotateBackground=false] + * @param {number} [params.confThreshHigh=85] + * @param {number} [params.confThreshMed=75] + * @param {?import('opentype.js').Font} [params.fontChiSim=null] + * @param {Array} [params.imageObjIndices=[]] - Array of image object indices + * @param {boolean} [params.includeImages=false] - Whether to include images + */ +async function ocrPageToPDF({ + pageObj, + inputDims, + outputDims, + firstObjIndex, + parentIndex, + proofOpacity, + pdfFonts, + textMode, + angle, + rotateText = false, + rotateBackground = false, + confThreshHigh = 85, + confThreshMed = 75, + imageObjIndices = [], + includeImages = false, +}) { + if (outputDims.width < 1) { + outputDims = inputDims; + } + + const noTextContent = !pageObj || pageObj.lines.length === 0; + const noImageContent = !includeImages || imageObjIndices.length === 0; + + const pageIndex = firstObjIndex; + let pageObjStr = `${String(pageIndex)} 0 obj\n<>\nendobj\n\n`; + return { pdfObj: [pageObjStr], pdfFontsUsed: /** @type {Set} */ (new Set()) }; + } + + pageObjStr += `/Contents ${String(firstObjIndex + 2)} 0 R`; + + let imageContentObjStr = ''; + + if (includeImages && imageObjIndices.length > 0) { + if (imageObjIndices.length > 0) { + let rotation = 0; + if (rotateBackground && Math.abs(angle ?? 0) > 0.05) { + rotation = angle; + } + imageContentObjStr += drawImageCommands(0, 0, 0, outputDims.width, outputDims.height, rotation); + } + } + + const { textContentObjStr, pdfFontsUsed } = await ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle, + rotateText, rotateBackground, confThreshHigh, confThreshMed); + + let pdfFontsStr = ''; + for (const font of pdfFontsUsed) { + pdfFontsStr += `${String(font.name)} ${String(font.objN)} 0 R\n`; + } + + let resourceDictObjStr = `${String(firstObjIndex + 1)} 0 obj\n<<`; + + resourceDictObjStr += `/Font<<${pdfFontsStr}>>`; + + if (includeImages && imageObjIndices.length > 0) { + const imageResourceStr = createImageResourceDict(imageObjIndices); + resourceDictObjStr += imageResourceStr; + } + + // Use `GSO` prefix to avoid conflicts with other graphics states, which are normally named `/GS[n]` by convention. + resourceDictObjStr += '/ExtGState<<'; + resourceDictObjStr += '/GSO0 <>'; + resourceDictObjStr += `/GSO1 <>`; + resourceDictObjStr += '>>'; + + resourceDictObjStr += '>>\nendobj\n\n'; + + const pageResourceStr = `/Resources ${String(firstObjIndex + 1)} 0 R`; + + pageObjStr += `${pageResourceStr}/Parent ${parentIndex} 0 R>>\nendobj\n\n`; + + const pageContentObjStr = `${String(firstObjIndex + 2)} 0 obj\n<>\nstream\n${imageContentObjStr}${textContentObjStr}\nendstream\nendobj\n\n`; + + return { + pdfObj: [pageObjStr, resourceDictObjStr, pageContentObjStr], pdfFontsUsed, + }; +} diff --git a/js/export/pdf/writePdfFonts.js b/js/export/pdf/writePdfFonts.js new file mode 100644 index 0000000..893aa79 --- /dev/null +++ b/js/export/pdf/writePdfFonts.js @@ -0,0 +1,343 @@ +// Function for converting from bufferArray to hex (string) +// Taken from https://stackoverflow.com/questions/40031688/javascript-arraybuffer-to-hex + +import { win1252Chars } from '../../../fonts/encoding.js'; +import { determineSansSerif } from '../../utils/miscUtils.js'; + +/** @type {Array} */ +const byteToHex = []; + +for (let n = 0; n <= 0xff; ++n) { + const hexOctet = n.toString(16).padStart(2, '0'); + byteToHex.push(hexOctet); +} + +/** + * Converts an ArrayBuffer to a hexadecimal string. + * + * @param {ArrayBufferLike} arrayBuffer - The ArrayBuffer to be converted. + * @returns {string} The hexadecimal representation of the ArrayBuffer. + */ +export function hex(arrayBuffer) { + const buff = new Uint8Array(arrayBuffer); + let hexOctets = ''; + for (let i = 0; i < buff.length; ++i) { + if (i % 32 === 0 && i !== 0) hexOctets += '\n'; + hexOctets += byteToHex[buff[i]]; + } + + return hexOctets; +} + +/** + * Creates a ToUnicode CMap string for a font. + * The CMap maps character codes to Unicode values to enable text extraction. + * + * @param {import('opentype.js').Font} font - Opentype.js font object + * @returns {string} The ToUnicode CMap content string + */ +export function createToUnicode(font) { + let cmapStr = `/CIDInit /ProcSet findresource begin +12 dict begin +begincmap +/CIDSystemInfo +<< /Registry (Adobe) + /Ordering (UCS) + /Supplement 0 +>> def +/CMapName /Adobe-Identity-UCS def +/CMapType 2 def +1 begincodespacerange +<0000> +endcodespacerange\n`; + + // Get all glyphs and their unicode values + const entries = []; + for (let i = 0; i < font.glyphs.length; i++) { + const glyph = font.glyphs.glyphs[String(i)]; + if (glyph.unicode !== undefined) { + // Format the entry as: + const srcHex = i.toString(16).padStart(4, '0'); + const unicodeHex = glyph.unicode.toString(16).padStart(4, '0'); + entries.push(`<${srcHex}> <${unicodeHex}>`); + } + } + + // Write entries in chunks of 100 + const chunkSize = 100; + for (let i = 0; i < entries.length; i += chunkSize) { + const chunk = entries.slice(i, i + chunkSize); + cmapStr += `${chunk.length} beginbfchar\n`; + cmapStr += chunk.join('\n'); + cmapStr += '\nendbfchar\n'; + } + + cmapStr += `endcmap +CMapName currentdict /CMap defineresource pop +end +end`; + + return cmapStr; +} + +/** + * Generates the flags value for a PDF font descriptor. + * + * @param {boolean} serif - Whether the font has serifs. + * @param {boolean} italic - Whether the font is italicized. + * @param {boolean} smallcap - Whether the font uses small caps. + * @param {boolean} symbolic - Whether the font contains glyphs outside the Adobe standard Latin character set. + * @returns {number} The flags value as an unsigned 32-bit integer. + */ +const generateFontFlags = (serif, italic, smallcap, symbolic) => { /* eslint-disable no-bitwise */ + let flags = 0; + + // Set bits based on the input flags: + if (serif) flags |= (1 << 1); // Set bit 2 for serif + if (italic) flags |= (1 << 6); // Set bit 7 for italic + if (smallcap) flags |= (1 << 17); // Set bit 18 for smallcap + if (symbolic) { + flags |= (1 << 2); // Set bit 3 for symbolic + } else { + flags |= (1 << 5); // Set bit 6 for nonsymbolic + } + + return flags; +}; + +/** + * + * @param {opentype.Font} font - Opentype.js font object + * @param {number} objIndex - Index for font descriptor PDF object + * @param {boolean} italic + * @param {?number} embeddedObjIndex - Index for embedded font file PDF object. + * If not provided, the font will not be embedded in the PDF. + * @returns {string} The font descriptor object string. + */ +function createFontDescriptor(font, objIndex, italic, embeddedObjIndex = null) { + let objOut = `${String(objIndex)} 0 obj\n<} + */ +export function createEmbeddedFontType1(font, firstObjIndex, italic = false, isStandardFont = false) { + // Start 1st object: Font Dictionary + let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n< parseInt(x)); + + fontDictObjStr += '/Widths['; + for (let i = 0; i < win1252Chars.length; i++) { + const advance = font.charToGlyph(win1252Chars[i]).advanceWidth || font.unitsPerEm; + const advanceNorm = Math.round(advance * (1000 / font.unitsPerEm)); + fontDictObjStr += `${String(advanceNorm)} `; + } + fontDictObjStr += ']/FirstChar 32/LastChar 255'; + + fontDictObjStr += `/FontDescriptor ${String(firstObjIndex + 1)} 0 R>>\nendobj\n\n`; + + // Start 2nd object: Font Descriptor + const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, isStandardFont ? null : firstObjIndex + 2); + + // objOut += `${String(firstObjIndex + 1)} 0 obj\n<>\nstream\n`; + + fontFileObjStr += `${fontHexStr}\nendstream\nendobj\n\n`; + + return [fontDictObjStr, fontDescObjStr, fontFileObjStr]; +} + +/** + * Converts a Opentype.js font object into an array of strings for adding to a PDF. + * The font is represented as a composite "Type 0" font. + * + * @param {Object} options - Configuration object + * @param {opentype.Font} options.font - Opentype.js font object + * @param {number} options.firstObjIndex - Index for the first PDF object + * @param {boolean} [options.italic=false] - Whether the font is italic. + * + * This function does not produce "toUnicode" or "Widths" objects, + * so any PDF it creates directly will lack usable copy/paste. + * However, both of these objects will be created from the embedded file + * when the result is run through mupdf. + */ +export function createEmbeddedFontType0({ + font, firstObjIndex, italic = false, +}) { + // Start 1st object: Font Dictionary + let fontDictObjStr = `${String(firstObjIndex)} 0 obj\n<>\nstream\n`; + toUnicodeStr += toUnicodeStr0; + toUnicodeStr += '\nendstream\nendobj\n\n'; + + // Start 3rd object: FontDescriptor + const fontDescObjStr = createFontDescriptor(font, firstObjIndex + 1, italic, firstObjIndex + 3); + + // objOut += `${String(firstObjIndex + 2)} 0 obj\n`; + + // objOut += `<>`; + + // objOut += '\nendobj\n\n'; + + // Start 4th object: widths + let widthsObjStr = `${String(firstObjIndex + 2)} 0 obj\n`; + + // There are 2 ways to represent the widths of the glyphs in a CIDFontType2. + // (1) [first glyph index] [array of widths] + // (2) [first glyph index] [last glyph index] [single width for all glyphs in range] + // The smallest way to represent widths is to use both methods, + // with the second method used for ranges of glyphs with the same width. + // However, only the first method is used here, as mupdf rewrites the widths object. + // The widths object needs to be present and accurate, as otherwise the glyphs will not be displayed correctly, + // however it is not important that the widths be efficiently represented at this point. + widthsObjStr += '[ 0 ['; + for (let i = 0; i < font.glyphs.length; i++) { + const advanceNorm = Math.round(font.glyphs.glyphs[String(i)].advanceWidth * (1000 / font.unitsPerEm)); + widthsObjStr += `${String(advanceNorm)} `; + } + widthsObjStr += '] ]'; + + widthsObjStr += '\nendobj\n\n'; + + // Start 5th object: Font File + const fontBuffer = font.toArrayBuffer(); + const fontHexStr = hex(fontBuffer); + + let fontFileObjStr = `${String(firstObjIndex + 3)} 0 obj\n<>\nstream\n`; + + fontFileObjStr += `${fontHexStr}\nendstream\nendobj\n\n`; + + // Start 6th object: Font + let fontObjStr = `${String(firstObjIndex + 4)} 0 obj\n`; + + fontObjStr += '<>'; + + fontObjStr += `/BaseFont/${namesTable.postScriptName.en}/FontDescriptor ${String(firstObjIndex + 1)} 0 R`; + + fontObjStr += `/W ${String(firstObjIndex + 2)} 0 R`; + + fontObjStr += '>>\nendobj\n\n'; + + return [fontDictObjStr, fontDescObjStr, widthsObjStr, fontFileObjStr, fontObjStr, toUnicodeStr]; +} diff --git a/js/export/pdf/writePdfImages.js b/js/export/pdf/writePdfImages.js new file mode 100644 index 0000000..868b53f --- /dev/null +++ b/js/export/pdf/writePdfImages.js @@ -0,0 +1,218 @@ +/* eslint-disable no-bitwise */ +import { imageUtils } from '../../objects/imageObjects.js'; +import { base64ToBytes, getPngIHDRInfo } from '../../utils/imageUtils.js'; +import { hex } from './writePdfFonts.js'; + +/** + * Extracts the concatenated data from all IDAT chunks of a PNG file. + * @param {Uint8Array} pngBytes - The raw bytes of the PNG file. + * @returns {Uint8Array} The concatenated zlib-compressed image data. + */ +function extractPngIdatData(pngBytes) { + // PNG signature + const signature = [137, 80, 78, 71, 13, 10, 26, 10]; + for (let i = 0; i < 8; i++) { + if (pngBytes[i] !== signature[i]) { + throw new Error('Invalid PNG file signature'); + } + } + + let offset = 8; + const idatChunks = []; + + while (offset < pngBytes.length) { + // Read chunk length directly from bytes (big-endian) + const length = (pngBytes[offset] << 24) + | (pngBytes[offset + 1] << 16) + | (pngBytes[offset + 2] << 8) + | pngBytes[offset + 3]; + offset += 4; + + const type = String.fromCharCode( + pngBytes[offset], + pngBytes[offset + 1], + pngBytes[offset + 2], + pngBytes[offset + 3], + ); + offset += 4; + + if (type === 'IDAT') { + idatChunks.push(pngBytes.subarray(offset, offset + length)); + } else if (type === 'IEND') { + break; + } + + offset += length + 4; // Skip data and CRC + } + + if (idatChunks.length === 0) { + console.warn('No IDAT chunks found in PNG image.'); + return pngBytes; // Fallback if no IDAT chunks are found + } + + const totalLength = idatChunks.reduce((acc, chunk) => acc + chunk.length, 0); + const concatenated = new Uint8Array(totalLength); + let currentOffset = 0; + for (const chunk of idatChunks) { + concatenated.set(chunk, currentOffset); + currentOffset += chunk.length; + } + + return concatenated; +} + +/** + * Creates PDF XObject for an .jpeg image + * @param {number} objIndex - PDF object index + * @param {ArrayBufferLike} imageData - Raw image data + * @param {number} width - Image width + * @param {number} height - Image height + * @returns {string} PDF XObject string + */ +const createImageXObjectJpeg = (objIndex, imageData, width, height) => { + const imageBytes = new Uint8Array(imageData); + let objStr = `${String(objIndex)} 0 obj\n`; + objStr += '< { + const imageBytes = new Uint8Array(imageData); + let objStr = `${String(objIndex)} 0 obj\n`; + objStr += '< { + const objIndex = firstObjIndex + index; + const dims = imageUtils.getDims(image); + const imageBytes = base64ToBytes(image.src); + let objParts; + if (image.format === 'jpeg') { + objParts = createImageXObjectJpeg(objIndex, imageBytes.buffer, dims.width, dims.height); + } else { + objParts = createImageXObjectPng(objIndex, imageBytes.buffer); + } + imageObjArr.push(objParts); + }); + + return imageObjArr; +} + +/** + * Creates a resource dictionary entry for images + * @param {Array} imageObjIndices - Array of image object indices + * @returns {string} Resource dictionary XObject entries + */ +export function createImageResourceDict(imageObjIndices) { + if (imageObjIndices.length === 0) return ''; + + let resourceStr = '/XObject<<'; + imageObjIndices.forEach((objIndex, i) => { + resourceStr += `/Im${String(i)} ${String(objIndex)} 0 R\n`; + }); + resourceStr += '>>'; + + return resourceStr; +} + +/** + * Generates PDF drawing commands to place an image on a page with optional rotation + * @param {number} imageIndex - Index of the image (for /Im naming) + * @param {number} x - X position + * @param {number} y - Y position + * @param {number} width - Display width + * @param {number} height - Display height + * @param {number} rotation - Rotation angle in degrees (default: 0) + * @returns {string} PDF drawing commands + */ +export function drawImageCommands(imageIndex, x, y, width, height, rotation = 0) { + const angle = (rotation * Math.PI) / 180; + + const centerX = x + width / 2; + const centerY = y + height / 2; + + const cos = Math.cos(angle); + const sin = Math.sin(angle); + + const a = width * cos; + const b = width * sin; + const c = -height * sin; + const d = height * cos; + + const e = centerX - (width * cos - height * sin) / 2; + const f = centerY - (width * sin + height * cos) / 2; + + return `q\n${a} ${b} ${c} ${d} ${e} ${f} cm\n/Im${imageIndex} Do\nQ\n`; +} diff --git a/js/export/pdf/writePdfText.js b/js/export/pdf/writePdfText.js new file mode 100644 index 0000000..966d9dc --- /dev/null +++ b/js/export/pdf/writePdfText.js @@ -0,0 +1,377 @@ +import { FontCont } from '../../containers/fontContainer.js'; +import ocr from '../../objects/ocrObjects.js'; +import { calcWordMetrics } from '../../utils/fontUtils.js'; +import { getStyleLookup } from '../../utils/miscUtils.js'; + +/** + * @param {number} x + */ +const formatNum = (x) => String(Math.round(x * 1e6) / 1e6); + +/** + * + * @param {OcrPage} pageObj + * @param {dims} outputDims + * @param {Object} pdfFonts + * @param {("ebook"|"eval"|"proof"|"invis")} textMode - + * @param {number} angle + * @param {boolean} rotateText + * @param {boolean} rotateBackground + * @param {number} confThreshHigh + * @param {number} confThreshMed + */ +export async function ocrPageToPDFStream(pageObj, outputDims, pdfFonts, textMode, angle, + rotateText = false, rotateBackground = false, confThreshHigh = 85, confThreshMed = 75) { + if (!pageObj || pageObj.lines.length === 0) { + return { textContentObjStr: '', pdfFontsUsed: new Set() }; + } + + const cosAnglePage = Math.cos(angle * (Math.PI / 180)); + + /** @type {Set} */ + const pdfFontsUsed = new Set(); + + const underlines = /** @type {Array<{left: number, right: number, top: number, height: number, fontSize: number, bold: boolean}>} */ ([]); + + // Start 1st object: Text Content + let textContentObjStr = ''; + + if (textMode === 'invis') { + textContentObjStr += '/GSO0 gs\n'; + } else if (['proof', 'eval'].includes(textMode)) { + textContentObjStr += '/GSO1 gs\n'; + } + + textContentObjStr += 'BT\n'; + + // Move cursor to top of the page + textContentObjStr += `1 0 0 1 0 ${String(outputDims.height)} Tm\n`; + + let pdfFontNameCurrent = ''; + let pdfFontTypeCurrent = 0; + + for (let i = 0; i < pageObj.lines.length; i++) { + const lineObj = pageObj.lines[i]; + const { words } = lineObj; + + if (words.length === 0) continue; + + let wordJ = words[0]; + + let fillColor = '0 0 0 rg'; + if (textMode === 'proof') { + if (wordJ.conf > confThreshHigh) { + fillColor = '0 1 0.5 rg'; + } else if (wordJ.conf > confThreshMed) { + fillColor = '1 0.8 0 rg'; + } else { + fillColor = '1 0 0 rg'; + } + } + + const angleAdjLine = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? ocr.calcLineStartAngleAdj(lineObj) : { x: 0, y: 0 }; + + let fillColorCurrent = fillColor; + + textContentObjStr += `${fillColor}\n`; + + let wordFont = FontCont.getWordFont(wordJ); + + const word0Metrics = calcWordMetrics(wordJ, angle); + + let wordFontSize = word0Metrics.fontSize; + + // Set font and font size + const pdfFontCurrent = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)]; + pdfFontNameCurrent = pdfFontCurrent.name; + pdfFontTypeCurrent = pdfFontCurrent.type; + pdfFontsUsed.add(pdfFontCurrent); + + textContentObjStr += `${pdfFontNameCurrent} ${String(wordFontSize)} Tf\n`; + + // Reset baseline to line baseline + textContentObjStr += '0 Ts\n'; + + const word0LeftBearing = wordJ.visualCoords ? word0Metrics.leftSideBearing : 0; + + let tz = 100; + if (wordJ.style.dropcap) { + const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left; + tz = (wordWidthActual / word0Metrics.visualWidth) * 100; + } + + // Move to next line + const lineLeftAdj = wordJ.bbox.left - word0LeftBearing * (tz / 100) + angleAdjLine.x; + const lineTopAdj = lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y; + + const lineAngleDeg = Number(rotateText) * angle + 90 * lineObj.orientation; + + const sinAngleTm = Math.sin(lineAngleDeg * (Math.PI / 180)); + const cosAngleTm = Math.cos(lineAngleDeg * (Math.PI / 180)); + + if (lineObj.orientation === 1) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineTopAdj + 1)} ${formatNum(outputDims.height - lineLeftAdj)} Tm\n`; + } else if (lineObj.orientation === 2) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(outputDims.width - lineLeftAdj + 1)} ${formatNum(lineTopAdj)} Tm\n`; + } else if (lineObj.orientation === 3) { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineTopAdj)} ${formatNum(lineLeftAdj)} Tm\n`; + } else { + textContentObjStr += `${formatNum(cosAngleTm)} ${formatNum(-sinAngleTm)} ${formatNum(sinAngleTm)} ${formatNum(cosAngleTm)} ${formatNum(lineLeftAdj)} ${formatNum(outputDims.height - lineTopAdj + 1)} Tm\n`; + } + + textContentObjStr += '[ '; + + let wordBoxLast = { + left: 0, top: 0, right: 0, bottom: 0, + }; + let wordRightBearingLast = 0; + let charSpacingLast = 0; + let spacingAdj = 0; + let kernSpacing = false; + let wordLast = wordJ; + let underlineLeft = /** @type {?number} */ null; + let underlineRight = /** @type {?number} */ null; + let wordFontOpentypeLast = wordFont.opentype; + let fontSizeLast = wordFontSize; + let tsCurrent = 0; + let tzCurrent = 100; + let charLig = false; + + for (let j = 0; j < words.length; j++) { + wordJ = words[j]; + + const wordMetrics = calcWordMetrics(wordJ, angle); + wordFontSize = wordMetrics.fontSize; + const charSpacing = wordMetrics.charSpacing; + const charArr = wordMetrics.charArr; + const wordLeftBearing = wordJ.visualCoords ? wordMetrics.leftSideBearing : 0; + const kerningArr = wordMetrics.kerningArr; + + wordFont = FontCont.getWordFont(wordJ); + + fillColor = '0 0 0 rg'; + if (textMode === 'proof') { + const wordConf = wordJ.conf; + + if (wordConf > confThreshHigh) { + fillColor = '0 1 0.5 rg'; + } else if (wordConf > confThreshMed) { + fillColor = '1 0.8 0 rg'; + } else { + fillColor = '1 0 0 rg'; + } + } else if (textMode === 'eval') { + fillColor = wordJ.matchTruth ? '0 1 0.5 rg' : '1 0 0 rg'; + } + + const angleAdjWord = wordJ.style.sup ? ocr.calcWordAngleAdj(wordJ) : { x: 0, y: 0 }; + const angleAdjWordX = (rotateBackground && Math.abs(angle ?? 0) > 0.05) ? angleAdjWord.x : 0; + + let ts = 0; + if (wordJ.style.sup || wordJ.style.dropcap) { + ts = (lineObj.bbox.bottom + lineObj.baseline[1] + angleAdjLine.y) - (wordJ.bbox.bottom + angleAdjLine.y + angleAdjWord.y); + if (!wordJ.visualCoords) { + const fontDesc = wordFont.opentype.descender / wordFont.opentype.unitsPerEm * wordMetrics.fontSize; + ts -= fontDesc; + } + } else { + ts = 0; + } + + // TODO: This probably fails for Chinese, rethink. + tz = 100; + if (wordJ.style.dropcap) { + const wordWidthActual = wordJ.bbox.right - wordJ.bbox.left; + tz = (wordWidthActual / wordMetrics.visualWidth) * 100; + } + + const pdfFont = wordJ.lang === 'chi_sim' ? pdfFonts.NotoSansSC.normal : pdfFonts[wordFont.family][getStyleLookup(wordJ.style)]; + const pdfFontName = pdfFont.name; + const pdfFontType = pdfFont.type; + pdfFontsUsed.add(pdfFont); + + const wordWidthAdj = (wordJ.bbox.right - wordJ.bbox.left) / cosAnglePage; + const wordSpaceAdj = (wordJ.bbox.left - wordBoxLast.right) / cosAnglePage; + + // Add space character between words + if (j > 0 && !kernSpacing) { + // The space between words determined by: + // (1) The right bearing of the last word, (2) the left bearing of the current word, (3) the width of the space character between words, + // (4) the current character spacing value (applied twice--both before and after the space character). + const spaceAdvance = wordFontOpentypeLast.charToGlyph(' ').advanceWidth || wordFontOpentypeLast.unitsPerEm / 2; + const spaceWidthGlyph = spaceAdvance * (fontSizeLast / wordFontOpentypeLast.unitsPerEm); + + const wordSpaceExpectedPx = (spaceWidthGlyph + charSpacingLast * 2 + wordRightBearingLast) + wordLeftBearing; + + // Ad-hoc adjustment needed to replicate wordSpace + // const wordSpaceExtra = (wordSpace + angleSpaceAdjXWord - spaceWidth - charSpacing * 2 - wordLeftBearing - wordRightBearingLast + spacingAdj); + const wordSpaceExtraPx = (wordSpaceAdj - wordSpaceExpectedPx + spacingAdj + angleAdjWordX) * (100 / tzCurrent); + + if (pdfFontTypeCurrent === 0) { + const spaceChar = wordFont.opentype.charToGlyphIndex(' ').toString(16).padStart(4, '0'); + textContentObjStr += `<${spaceChar}> ${String(Math.round(wordSpaceExtraPx * (-1000 / fontSizeLast) * 1e6) / 1e6)}`; + } else { + textContentObjStr += `( ) ${String(Math.round(wordSpaceExtraPx * (-1000 / fontSizeLast) * 1e6) / 1e6)}`; + } + } + kernSpacing = false; + + wordBoxLast = wordJ.bbox; + + // In general, we assume that (given our adjustments to character spacing) the rendered word has the same width as the image of that word. + // However, this assumption does not hold for single-character words, as there is no space between character to adjust. + // Therefore, we calculate the difference between the rendered and actual word and apply an adjustment to the width of the next space. + // (This does not apply to drop caps as those have horizontal scaling applied to exactly match the image.) + if (charArr.length === 1 && !wordJ.style.dropcap) { + const wordLastGlyph = wordFont.opentype.charToGlyph(charArr.at(-1)); + const wordLastGlyphMetrics = wordLastGlyph.getMetrics(); + const lastCharAdvance = wordLast.visualCoords ? (wordLastGlyphMetrics.xMax - wordLastGlyphMetrics.xMin) : wordLastGlyph.advanceWidth || wordFont.opentype.unitsPerEm / 2; + const lastCharWidth = lastCharAdvance * (wordFontSize / wordFont.opentype.unitsPerEm); + spacingAdj = wordWidthAdj - lastCharWidth - angleAdjWordX; + } else { + spacingAdj = 0 - angleAdjWordX; + } + + textContentObjStr += ' ] TJ\n'; + + const fontSize = wordJ.style.smallCaps && wordJ.text[0] && wordJ.text[0] !== wordJ.text[0].toUpperCase() ? wordFontSize * wordFont.smallCapsMult : wordFontSize; + if (pdfFontName !== pdfFontNameCurrent || fontSize !== fontSizeLast) { + textContentObjStr += `${pdfFontName} ${String(fontSize)} Tf\n`; + pdfFontNameCurrent = pdfFontName; + pdfFontTypeCurrent = pdfFontType; + fontSizeLast = fontSize; + } + if (fillColor !== fillColorCurrent) { + textContentObjStr += `${fillColor}\n`; + fillColorCurrent = fillColor; + } + if (ts !== tsCurrent) { + textContentObjStr += `${String(ts)} Ts\n`; + tsCurrent = ts; + } + if (tz !== tzCurrent) { + textContentObjStr += `${String(tz)} Tz\n`; + tzCurrent = tz; + } + + textContentObjStr += `${String(Math.round(charSpacing * 1e6) / 1e6)} Tc\n`; + + textContentObjStr += '[ '; + + // Non-ASCII and special characters are encoded/escaped using winEncodingLookup + for (let k = 0; k < charArr.length; k++) { + const letterSrc = charArr[k]; + const letter = wordJ.style.smallCaps ? charArr[k].toUpperCase() : charArr[k]; + const fontSizeLetter = wordJ.style.smallCaps && letterSrc !== letter ? wordFontSize * wordFont.smallCapsMult : wordFontSize; + + // Encoding needs to come from `pdfFont`, not `wordFont`, as the `pdfFont` will have a different index when subset. + const letterEnc = pdfFontTypeCurrent === 0 ? pdfFont.opentype.charToGlyphIndex(letter)?.toString(16).padStart(4, '0') : winEncodingLookup[letter]; + if (letterEnc) { + let kern = (kerningArr[k] || 0) * (-1000 / fontSizeLetter); + + if (wordJ.lang === 'chi_sim' && j + 1 < words.length && words[j + 1].lang === 'chi_sim') { + kernSpacing = true; + const wordNext = words[j + 1]; + const wordSpaceNextAdj = (wordNext.bbox.left - wordJ.bbox.right) / cosAngleTm; + // const wordSpaceNextAdj = wordNext.bbox.left - wordBox.right; + + const wordGlyph = wordFont.opentype.charToGlyph(charArr.at(-1)); + const wordGlyphMetrics = wordGlyph.getMetrics(); + const wordNextGlyphMetrics = wordFont.opentype.charToGlyph(wordNext.text.substr(0, 1)).getMetrics(); + + const wordRightBearing = wordJ.visualCoords ? (wordGlyph.advanceWidth - wordGlyphMetrics.xMax) * (wordFontSize / wordFont.opentype.unitsPerEm) : 0; + + const wordNextLeftBearing = wordNext.visualCoords ? wordNextGlyphMetrics.xMin * (wordFontSize / wordFont.opentype.unitsPerEm) : 0; + + const wordSpaceExpected = charSpacing + wordRightBearing + wordNextLeftBearing; + + kern = Math.round((wordSpaceNextAdj - wordSpaceExpected + spacingAdj + angleAdjWordX) * (-1000 / wordFontSize)); + } + + // PDFs render text based on a "widths" PDF object, rather than the advance width in the embedded font file. + // The widths are in 1/1000 of a unit, and this PDF object is created by mupdf. + // The widths output in this object are converted to integers, which creates a rounding error when the font em size is not 1000. + // All built-in fonts are already 1000 to avoid this, however custom fonts may not be. + // This results in a small rounding error for the advance of each character, which adds up, as PDF positioning is cumulative. + // To correct for this, the error is calculated and added to the kerning value. + const charAdvance = wordFont.opentype.charToGlyph(letter).advanceWidth; + const charWidthPdfPrecise = charAdvance * (1000 / wordFont.opentype.unitsPerEm); + const charWidthPdfRound = Math.floor(charWidthPdfPrecise); + const charWidthError = charWidthPdfRound - charWidthPdfPrecise; + + const charAdj = kern + charWidthError; + + if (pdfFontName !== pdfFontNameCurrent || fontSizeLetter !== fontSizeLast) { + textContentObjStr += ' ] TJ\n'; + textContentObjStr += `${pdfFontName} ${String(fontSizeLetter)} Tf\n`; + fontSizeLast = fontSizeLetter; + textContentObjStr += `${String(Math.round(charSpacing * 1e6) / 1e6)} Tc\n`; + textContentObjStr += '[ '; + } + + if (pdfFontTypeCurrent === 0) { + textContentObjStr += `<${letterEnc}> ${String(Math.round(charAdj * 1e6) / 1e6)} `; + } else { + textContentObjStr += `(${letterEnc}) ${String(Math.round(kern * 1e6) / 1e6)} `; + } + + if (charLig) { + k++; + charLig = false; + } + } else { + // When the requested character could not be found, a space is inserted, with extra space to match the width of the missing character + const kern = (wordFont.opentype.charToGlyph(letter).advanceWidth - wordFont.opentype.charToGlyph(' ').advanceWidth) * (-1000 / wordFont.opentype.unitsPerEm) || 0; + + if (pdfFontTypeCurrent === 0) { + const spaceChar = wordFont.opentype.charToGlyphIndex(' ').toString(16).padStart(4, '0'); + textContentObjStr += `<${spaceChar}> ${String(Math.round(kern * 1e6) / 1e6)} `; + } else { + textContentObjStr += `( ) ${String(Math.round(kern * 1e6) / 1e6)} `; + } + } + } + + if (wordJ.style.underline && underlineLeft === null) { + underlineLeft = wordJ.bbox.left; + } + + if (wordJ.style.underline) { + underlineRight = wordJ.bbox.right; + } + + if (underlineLeft !== null && (!wordJ.style.underline || j === words.length - 1)) { + underlines.push({ + left: underlineLeft, + right: underlineRight, + top: lineTopAdj, + height: lineObj.bbox.bottom - lineObj.bbox.top, + fontSize: wordFontSize, + bold: wordJ.style.bold, + }); + + underlineLeft = null; + underlineRight = null; + } + + wordLast = wordJ; + wordRightBearingLast = wordLast.visualCoords ? wordMetrics.rightSideBearing : 0; + wordFontOpentypeLast = wordFont.opentype; + charSpacingLast = charSpacing; + } + + textContentObjStr += ' ] TJ\n'; + } + + textContentObjStr += 'ET'; + + // Add underlines + underlines.forEach((underline) => { + const underlineThickness = underline.bold ? Math.ceil(underline.fontSize / 12) : Math.ceil(underline.fontSize / 24); + const underlineOffset = Math.ceil(underline.fontSize / 12) + underlineThickness; + + textContentObjStr += `\n${String(underline.left)} ${String(outputDims.height - underline.top - underlineOffset)} ${String(underline.right - underline.left)} ${underlineThickness} re\nf\n`; + }); + + return { textContentObjStr, pdfFontsUsed }; +} diff --git a/js/export/writeDocx.js b/js/export/writeDocx.js index 1d68012..6a5be61 100644 --- a/js/export/writeDocx.js +++ b/js/export/writeDocx.js @@ -7,11 +7,12 @@ import { opt } from '../containers/app.js'; /** * Create a Word document from an array of ocrPage objects. * - * @param {Array} hocrCurrent - - * @param {number} minpage - The first page to include in the document. - * @param {number} maxpage - The last page to include in the document. + * @param {Object} params + * @param {Array} params.hocrCurrent - + * @param {number} [params.minpage=0] - The first page to include in the document. + * @param {number} [params.maxpage=-1] - The last page to include in the document. */ -export async function writeDocx(hocrCurrent, minpage = 0, maxpage = -1) { +export async function writeDocx({ hocrCurrent, minpage = 0, maxpage = -1 }) { const { Uint8ArrayWriter, TextReader, ZipWriter } = await import('../../lib/zip.js/index.js'); if (maxpage === -1) maxpage = hocrCurrent.length - 1; @@ -19,7 +20,13 @@ export async function writeDocx(hocrCurrent, minpage = 0, maxpage = -1) { const zipFileWriter = new Uint8ArrayWriter(); const zipWriter = new ZipWriter(zipFileWriter); - const textReader = new TextReader(documentStart + writeText(hocrCurrent, minpage, maxpage, opt.reflow, true) + documentEnd); + const textReader = new TextReader(documentStart + writeText({ + ocrCurrent: hocrCurrent, + minpage, + maxpage, + reflowText: opt.reflow, + docxMode: true, + }) + documentEnd); await zipWriter.add('word/document.xml', textReader); for (let i = 0; i < docxStrings.length; i++) { diff --git a/js/export/writeHocr.js b/js/export/writeHocr.js index 9ad518e..dd0e79f 100644 --- a/js/export/writeHocr.js +++ b/js/export/writeHocr.js @@ -1,6 +1,6 @@ import { opt } from '../containers/app.js'; import { - layoutDataTables, layoutRegions, pageMetricsArr, + layoutDataTables, layoutRegions, pageMetricsAll, } from '../containers/dataContainer.js'; import { FontCont } from '../containers/fontContainer.js'; import ocr from '../objects/ocrObjects.js'; @@ -8,11 +8,12 @@ import { round6 } from '../utils/miscUtils.js'; /** * - * @param {Array} ocrData - * @param {number} [minValue] - * @param {number} [maxValue] + * @param {Object} params + * @param {Array} params.ocrData + * @param {number} [params.minValue] + * @param {number} [params.maxValue] */ -export function writeHocr(ocrData, minValue, maxValue) { +export function writeHocr({ ocrData, minValue, maxValue }) { if (minValue === null || minValue === undefined) minValue = 0; if (maxValue === null || maxValue === undefined || maxValue < 0) maxValue = ocrData.length - 1; @@ -51,7 +52,7 @@ export function writeHocr(ocrData, minValue, maxValue) { // Handle case where ocrPage object does not exist. if (!pageObj) { - hocrOut += `\n\t
`; + hocrOut += `\n\t
`; hocrOut += '\n\t
'; continue; } diff --git a/js/export/writeHtml.js b/js/export/writeHtml.js index 2067fd1..9231600 100644 --- a/js/export/writeHtml.js +++ b/js/export/writeHtml.js @@ -2,7 +2,7 @@ import { FontCont } from '../containers/fontContainer.js'; import { opt } from '../containers/app.js'; import { calcWordMetrics } from '../utils/fontUtils.js'; import { assignParagraphs } from '../utils/reflowPars.js'; -import { pageMetricsArr } from '../containers/dataContainer.js'; +import { pageMetricsAll } from '../containers/dataContainer.js'; import ocr from '../objects/ocrObjects.js'; const formatNum = (num) => (num.toFixed(5).replace(/\.?0+$/, '')); @@ -132,14 +132,14 @@ export function writeHtml({ } if (removeMargins) { - top += Math.min((maxBottom - minTop) + 200, pageMetricsArr[g].dims.height + 10); + top += Math.min((maxBottom - minTop) + 200, pageMetricsAll[g].dims.height + 10); } else { - top += pageMetricsArr[g].dims.height + 10; + top += pageMetricsAll[g].dims.height + 10; } // Do not overwrite paragraphs from Abbyy or Textract. if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) { - const angle = pageMetricsArr[g].angle || 0; + const angle = pageMetricsAll[g].angle || 0; assignParagraphs(pageObj, angle); } diff --git a/js/export/writeTabular.js b/js/export/writeTabular.js index 6ff5e26..a4d57bb 100644 --- a/js/export/writeTabular.js +++ b/js/export/writeTabular.js @@ -4,19 +4,24 @@ import { inputData, opt } from '../containers/app.js'; import { extractTableContent } from '../extractTables.js'; /** - * @param {ReturnType} tableWordObj - * @param {Array} extraCols - * @param {number} startRow - * @param {boolean} xlsxMode - * @param {boolean} htmlMode + * @param {Object} params + * @param {ReturnType} params.tableWordObj + * @param {Array} [params.extraCols=[]] + * @param {number} [params.startRow=0] + * @param {boolean} [params.xlsxMode=true] + * @param {boolean} [params.htmlMode=false] */ -export function createCells(tableWordObj, extraCols = [], startRow = 0, xlsxMode = true, htmlMode = false) { +export function createCells({ + tableWordObj, extraCols = [], startRow = 0, xlsxMode = true, htmlMode = false, +}) { let textStr = ''; let rowIndex = startRow; let rowCount = 0; for (const [key, value] of Object.entries(tableWordObj)) { - const cellsSingle = createCellsSingle(value.rowWordArr, extraCols, rowIndex, xlsxMode, htmlMode); + const cellsSingle = createCellsSingle({ + ocrTableWords: value.rowWordArr, extraCols, startRow: rowIndex, xlsxMode, htmlMode, + }); textStr += cellsSingle.content; rowIndex += cellsSingle.rows; rowCount += cellsSingle.rows; @@ -27,14 +32,17 @@ export function createCells(tableWordObj, extraCols = [], startRow = 0, xlsxMode /** * Convert a single table into HTML or Excel XML rows - * @param {ReturnType['rowWordArr']} ocrTableWords - * @param {Array} extraCols - * @param {number} startRow - * @param {boolean} xlsxMode - * @param {boolean} htmlMode - * @param {boolean} previewMode + * @param {Object} params + * @param {ReturnType['rowWordArr']} params.ocrTableWords + * @param {Array} [params.extraCols=[]] + * @param {number} [params.startRow=0] + * @param {boolean} [params.xlsxMode=true] + * @param {boolean} [params.htmlMode=false] + * @param {boolean} [params.previewMode=true] */ -function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode = true, htmlMode = false, previewMode = true) { +function createCellsSingle({ + ocrTableWords, extraCols = [], startRow = 0, xlsxMode = true, htmlMode = false, previewMode = true, +}) { const letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']; let textStr = htmlMode ? '' : ''; @@ -140,12 +148,15 @@ function createCellsSingle(ocrTableWords, extraCols = [], startRow = 0, xlsxMode /** * - * @param {Array} ocrPageArr - * @param {Array} layoutPageArr - * @param {number} minpage - * @param {number} maxpage + * @param {Object} params + * @param {Array} params.ocrPageArr + * @param {Array} params.layoutPageArr + * @param {number} [params.minpage=0] + * @param {number} [params.maxpage=-1] */ -export async function writeXlsx(ocrPageArr, layoutPageArr, minpage = 0, maxpage = -1) { +export async function writeXlsx({ + ocrPageArr, layoutPageArr, minpage = 0, maxpage = -1, +}) { const { xlsxStrings, sheetStart, sheetEnd } = await import('./resources/xlsxFiles.js'); const { Uint8ArrayWriter, TextReader, ZipWriter } = await import('../../lib/zip.js/index.js'); @@ -169,7 +180,7 @@ export async function writeXlsx(ocrPageArr, layoutPageArr, minpage = 0, maxpage if (opt.xlsxPageNumberColumn) extraCols.push(String(i + 1)); const tableWordObj = extractTableContent(ocrPageArr[i], layoutPageArr[i]); - const cellsObj = createCells(tableWordObj, extraCols, rowCount); + const cellsObj = createCells({ tableWordObj, extraCols, startRow: rowCount }); rowCount += cellsObj.rows; sheetContent += cellsObj.content; opt.progressHandler({ n: i, type: 'export', info: { } }); diff --git a/js/export/writeText.js b/js/export/writeText.js index 73052f2..2900ac0 100644 --- a/js/export/writeText.js +++ b/js/export/writeText.js @@ -1,20 +1,21 @@ import { opt } from '../containers/app.js'; -import { pageMetricsArr } from '../containers/dataContainer.js'; +import { pageMetricsAll } from '../containers/dataContainer.js'; import ocr from '../objects/ocrObjects.js'; import { assignParagraphs } from '../utils/reflowPars.js'; /** * Convert an array of ocrPage objects to plain text, or XML for a Word document. * - * @param {Array} ocrCurrent - - * @param {number} minpage - The first page to include in the document. - * @param {number} maxpage - The last page to include in the document. - * @param {boolean} reflowText - Remove line breaks within what appears to be the same paragraph. - * @param {boolean} docxMode - Create XML for a word document rather than plain text. - * @param {?Array} wordIds - An array of word IDs to include in the document. + * @param {Object} params + * @param {Array} params.ocrCurrent - + * @param {number} [params.minpage=0] - The first page to include in the document. + * @param {number} [params.maxpage=-1] - The last page to include in the document. + * @param {boolean} [params.reflowText=false] - Remove line breaks within what appears to be the same paragraph. + * @param {boolean} [params.docxMode=false] - Create XML for a word document rather than plain text. + * @param {?Array} [params.wordIds=null] - An array of word IDs to include in the document. * If omitted, all words are included. */ -export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null) { +export function writeText({ ocrCurrent, minpage = 0, maxpage = -1, reflowText = false, docxMode = false, wordIds = null }) { let textStr = ''; if (maxpage === -1) maxpage = ocrCurrent.length - 1; @@ -28,7 +29,7 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa // Do not overwrite paragraphs from Abbyy or Textract. if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) { - const angle = pageMetricsArr[g].angle || 0; + const angle = pageMetricsAll[g].angle || 0; assignParagraphs(pageObj, angle); } diff --git a/js/fontEval.js b/js/fontEval.js index 1c116d7..269b503 100644 --- a/js/fontEval.js +++ b/js/fontEval.js @@ -1,4 +1,4 @@ -import { pageMetricsArr } from './containers/dataContainer.js'; +import { pageMetricsAll } from './containers/dataContainer.js'; import { FontCont } from './containers/fontContainer.js'; import { ImageCache } from './containers/imageContainer.js'; import { @@ -28,7 +28,7 @@ export async function evalPagesFont(font, pageArr, opt, n = 500) { font, page: pageArr[i], binaryImage: imageI, - pageMetricsObj: pageMetricsArr[i], + pageMetricsObj: pageMetricsAll[i], opt, }); diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js index 03d4352..3c5d410 100644 --- a/js/generalWorkerMain.js +++ b/js/generalWorkerMain.js @@ -96,6 +96,7 @@ export async function initGeneralWorker() { obj.convertPageAbbyy = wrap('convertPageAbbyy'); obj.convertPageStext = wrap('convertPageStext'); obj.convertDocTextract = wrap('convertDocTextract'); + obj.convertPageGoogleVision = wrap('convertPageGoogleVision'); obj.convertPageText = wrap('convertPageText'); obj.optimizeFont = wrap('optimizeFont'); @@ -168,37 +169,64 @@ export class gs { * @param {Parameters[0]} args * @returns {ReturnType} */ - static convertPageHocr = async (args) => (await gs.schedulerInner.addJob('convertPageHocr', args)); + static convertPageHocr = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertPageHocr', args); + }; /** * @param {Parameters[0]} args * @returns {ReturnType} */ - static convertPageAbbyy = async (args) => (await gs.schedulerInner.addJob('convertPageAbbyy', args)); + static convertPageAbbyy = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertPageAbbyy', args); + }; /** * @param {Parameters[0]} args * @returns {ReturnType} */ - static convertDocTextract = async (args) => (await gs.schedulerInner.addJob('convertDocTextract', args)); + static convertDocTextract = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertDocTextract', args); + }; + + /** + * @param {Parameters[0]} args + * @returns {ReturnType} + */ + static convertPageGoogleVision = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertPageGoogleVision', args); + }; /** * @param {Parameters[0]} args * @returns {ReturnType} */ - static convertPageStext = async (args) => (await gs.schedulerInner.addJob('convertPageStext', args)); + static convertPageStext = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertPageStext', args); + }; /** * @param {Parameters[0]} args * @returns {ReturnType} */ - static convertPageText = async (args) => (await gs.schedulerInner.addJob('convertPageText', args)); + static convertPageText = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('convertPageText', args); + }; /** * @param {Parameters[0]} args * @returns {ReturnType} */ - static optimizeFont = async (args) => (await gs.schedulerInner.addJob('optimizeFont', args)); + static optimizeFont = async (args) => { + await gs.getGeneralScheduler(); + return gs.schedulerInner.addJob('optimizeFont', args); + }; /** * @template {Partial} TO diff --git a/js/global.d.ts b/js/global.d.ts index 89c2dd1..ef03c94 100644 --- a/js/global.d.ts +++ b/js/global.d.ts @@ -13,7 +13,7 @@ declare global { // Strings representing supported sources of text. // `stext` indicates the text was extracted directly from a PDF using mupdf. - type TextSource = null | 'tesseract' | 'textract' | 'abbyy' | 'stext' | 'hocr' | 'text'; + type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text'; type FontState = { enableOpt: boolean; @@ -96,6 +96,26 @@ declare global { // Image objects type ImageWrapper = import("./objects/imageObjects.js").ImageWrapper; + /** + * Information from the IHDR chunk of a PNG file. + */ + type PngIHDRInfo = { + /** Image width in pixels. */ + width: number; + /** Image height in pixels. */ + height: number; + /** Bits per sample or per palette index. */ + bitDepth: number; + /** Color type (e.g., grayscale, RGB, palette). */ + colorType: number; + /** Compression method (always 0 for PNG). */ + compressionMethod: number; + /** Filter method (always 0 for PNG). */ + filterMethod: number; + /** Interlace method (0 for none, 1 for Adam7). */ + interlaceMethod: number; + }; + type dims = { height: number; width: number; @@ -204,6 +224,22 @@ declare global { Y: number; } + type PdfFontInfo = { + type: number; + index: number; + name: string; + objN: number; + opentype: opentypeFont; + }; + + type PdfFontFamily = { + normal?: PdfFontInfo; + italic?: PdfFontInfo; + bold?: PdfFontInfo; + boldItalic?: PdfFontInfo; + [style: string]: PdfFontInfo | undefined; + }; + interface TextractGeometry { BoundingBox: TextractBoundingBox; Polygon: TextractPoint[]; @@ -226,6 +262,73 @@ declare global { Relationships?: Relationship[]; } + // Google Vision types + interface GoogleVisionVertex { + x: number; + y: number; + } + + interface GoogleVisionBoundingPoly { + vertices: GoogleVisionVertex[]; + normalizedVertices: GoogleVisionVertex[]; + } + + interface GoogleVisionDetectedLanguage { + languageCode: string; + confidence: number; + } + + interface GoogleVisionDetectedBreak { + type: 'UNKNOWN' | 'SPACE' | 'SURE_SPACE' | 'EOL_SURE_SPACE' | 'HYPHEN' | 'LINE_BREAK'; + isPrefix: boolean; + } + + interface GoogleVisionTextProperty { + detectedLanguages: GoogleVisionDetectedLanguage[]; + detectedBreak?: GoogleVisionDetectedBreak; + } + + interface GoogleVisionSymbol { + property?: GoogleVisionTextProperty; + boundingBox: GoogleVisionBoundingPoly; + text: string; + confidence: number; + } + + interface GoogleVisionWord { + property?: GoogleVisionTextProperty; + boundingBox: GoogleVisionBoundingPoly; + symbols: GoogleVisionSymbol[]; + confidence: number; + } + + interface GoogleVisionParagraph { + property?: GoogleVisionTextProperty; + boundingBox: GoogleVisionBoundingPoly; + words: GoogleVisionWord[]; + confidence: number; + } + + interface GoogleVisionBlock { + property?: GoogleVisionTextProperty; + boundingBox: GoogleVisionBoundingPoly; + paragraphs: GoogleVisionParagraph[]; + blockType: 'UNKNOWN' | 'TEXT' | 'TABLE' | 'PICTURE' | 'RULER' | 'BARCODE'; + confidence: number; + } + + interface GoogleVisionPage { + property?: GoogleVisionTextProperty; + width: number; + height: number; + blocks: GoogleVisionBlock[]; + confidence: number; + } + + interface GoogleVisionFullTextAnnotation { + pages: GoogleVisionPage[]; + text: string; + } } diff --git a/js/import/convertDocTextract.js b/js/import/convertDocTextract.js index a6b2536..d651a22 100644 --- a/js/import/convertDocTextract.js +++ b/js/import/convertDocTextract.js @@ -52,7 +52,6 @@ export async function convertDocTextract({ ocrStr, pageDims }) { const pageObj = new ocr.OcrPage(n, pageDimsN); - // Check if we have any text content const lineBlocks = blocks.filter((block) => block.BlockType === 'LINE' && (!block.Page && n === 0 || block.Page === n + 1)); if (lineBlocks.length === 0) { const warn = { char: 'char_error' }; @@ -175,7 +174,6 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page const baseline = [baselineSlope, 0]; const lineObj = new ocr.OcrLine(pageObj, bboxLine, baseline); - const wordPolyArr = /** @type {Polygon[]} */ ([]); const childIds = relationshipMap.get(lineBlock.Id) || []; @@ -187,11 +185,12 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page const bboxWord = convertBoundingBox(wordBlock.Geometry.BoundingBox, pageDims); const id = `word_${pageNum + 1}_${lineIndex + 1}_${wordIndex + 1}`; - const wordObj = new ocr.OcrWord(lineObj, wordBlock.Text, bboxWord, id); + const poly = convertPolygon(wordBlock.Geometry.Polygon, pageDims); + + const wordObj = new ocr.OcrWord(lineObj, id, wordBlock.Text, bboxWord, poly); wordObj.conf = wordBlock.Confidence || 100; lineObj.words.push(wordObj); - wordPolyArr.push(convertPolygon(wordBlock.Geometry.Polygon, pageDims)); } }); @@ -207,37 +206,32 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page const ascOnlyWords = /** @type {OcrWord[]} */([]); const ascOnlyWordsPoly = /** @type {Polygon[]} */([]); const descOnlyWords = /** @type {OcrWord[]} */([]); - const ascDescWords = /** @type {OcrWord[]} */([]); for (let i = 0; i < lineObj.words.length; i++) { const word = lineObj.words[i]; - const polyWord = wordPolyArr[i]; - if (word.text && descCharRegex.test(word.text)) { + if (descCharRegex.test(word.text)) { descWords.push(word); } - if (word.text && (xCharRegex.test(word.text) || ascCharRegex.test(word.text))) { + if (!descCharRegex.test(word.text) && (xCharRegex.test(word.text) || ascCharRegex.test(word.text))) { nonDescWords.push(word); - nonDescWordsPoly.push(polyWord); + nonDescWordsPoly.push(word.poly); } // The `ascCharRegex` array purposefully does not contain `f`, as it varies wildly in height, // and this array was primarily created for formats where we have character-level data. // Therefore, additional characters are added here as appropriate. - if (word.text && xCharRegex.test(word.text) && !ascCharRegex.test(word.text) + if (xCharRegex.test(word.text) && !ascCharRegex.test(word.text) && !descCharRegex.test(word.text) && !/[fi]/.test(word.text)) { xOnlyWords.push(word); - xOnlyWordsPoly.push(polyWord); + xOnlyWordsPoly.push(word.poly); } - if (word.text && ascCharRegex.test(word.text) && !descCharRegex.test(word.text)) { + if (ascCharRegex.test(word.text) && !descCharRegex.test(word.text)) { ascOnlyWords.push(word); - ascOnlyWordsPoly.push(polyWord); + ascOnlyWordsPoly.push(word.poly); } - if (word.text && descCharRegex.test(word.text) && !ascCharRegex.test(word.text)) { + if (descCharRegex.test(word.text) && !ascCharRegex.test(word.text)) { descOnlyWords.push(word); } - if (word.text && ascCharRegex.test(word.text) && descCharRegex.test(word.text)) { - ascDescWords.push(word); - } // Replace unicode superscript characters with regular text. // TODO: This should be updated to properly handle superscripts rather than removing them. @@ -267,7 +261,7 @@ function convertLineTextract(lineBlock, blockMap, relationshipMap, pageObj, page if (Number.isFinite(nonDescBottomDelta) && nonDescBottomDelta < lineObj.bbox.bottom && nonDescBottomDelta > (lineHeight / 2)) { lineObj.baseline[1] = nonDescBottomDelta - (lineObj.bbox.bottom - polyLine.bl.y); } else if (descWords.length > 0) { - lineObj.baseline[1] = -lineHeight / 3 - (lineObj.bbox.bottom - polyLine.bl.y); + lineObj.baseline[1] = lineHeight / 3 - (lineObj.bbox.bottom - polyLine.bl.y); } // TODO: Properly process metrics when these are negative. @@ -346,7 +340,6 @@ function createParagraphsFromLayout(pageObj, layoutBlocks, relationshipMap, bloc } }); - // Create paragraph if we have lines if (paragraphLines.length > 0) { // Calculate paragraph bounding box from line bounding boxes const parBbox = calcBboxUnion(paragraphLines.map((line) => line.bbox)); @@ -355,7 +348,6 @@ function createParagraphsFromLayout(pageObj, layoutBlocks, relationshipMap, bloc // Set the layout block type as a reason for debugging parObj.reason = layoutBlock.BlockType || 'LAYOUT_UNKNOWN'; - // Assign lines to paragraph paragraphLines.forEach((lineObj) => { lineObj.par = parObj; }); diff --git a/js/import/convertPageAbbyy.js b/js/import/convertPageAbbyy.js index c07f06a..75c9933 100644 --- a/js/import/convertPageAbbyy.js +++ b/js/import/convertPageAbbyy.js @@ -95,8 +95,6 @@ export async function convertPageAbbyy({ ocrStr, n }) { const parLineArr = []; function convertLineAbbyy(xmlLine) { - const stylesLine = {}; - // Unlike Tesseract HOCR, Abbyy XML does not provide accurate metrics for determining font size, so they are calculated here. // Strangely, while Abbyy XML does provide a "baseline" attribute, it is often wildly incorrect (sometimes falling outside of the bounding box entirely). // One guess as to why is that coordinates calculated pre-dewarping are used along with a baseline calculated post-dewarping. @@ -388,7 +386,7 @@ export async function convertPageAbbyy({ ocrStr, n }) { const id = `word_${n + 1}_${pageObj.lines.length + 1}_${i + 1}`; - const wordObj = new ocr.OcrWord(lineObj, text[i], bboxWord, id); + const wordObj = new ocr.OcrWord(lineObj, id, text[i], bboxWord); wordObj.chars = charObjArrLine[i]; wordObj.conf = wordSusp[i] ? 0 : 100; diff --git a/js/import/convertPageBlocks.js b/js/import/convertPageBlocks.js index 472d7af..0d5a24a 100644 --- a/js/import/convertPageBlocks.js +++ b/js/import/convertPageBlocks.js @@ -71,8 +71,13 @@ export async function convertPageBlocks({ baseline[1] = 0; } - const ascHeight = line.rowAttributes.row_height - line.rowAttributes.descenders; - const xHeight = line.rowAttributes.row_height - line.rowAttributes.descenders - line.rowAttributes.ascenders; + // Tesseract.js lists `row_height` instead of `rowHeight` in the types file, which is wrong. + // This has been fixed in the upstream, so the `ts-ignore` comments + // can be removed once Tesseract.js is updated in our dependencies. + // @ts-ignore + const ascHeight = line.rowAttributes.rowHeight - line.rowAttributes.descenders; + // @ts-ignore + const xHeight = line.rowAttributes.rowHeight - line.rowAttributes.descenders - line.rowAttributes.ascenders; const lineObj = new ocr.OcrLine(pageObj, linebox, baseline, ascHeight, xHeight); lineObj.par = parObj; @@ -115,7 +120,7 @@ export async function convertPageBlocks({ left: symbol.bbox.x0, top: symbol.bbox.y0, right: symbol.bbox.x1, bottom: symbol.bbox.y1, }; - const wordObj = new ocr.OcrWord(lineObj, symbol.text, symbolbox, `${id}_${m}`); + const wordObj = new ocr.OcrWord(lineObj, `${id}_${m}`, symbol.text, symbolbox); wordObj.conf = symbol.confidence; wordObj.lang = wordLang; @@ -124,7 +129,7 @@ export async function convertPageBlocks({ continue; } - const wordObj = new ocr.OcrWord(lineObj, word.text.trim(), wordbox, id); + const wordObj = new ocr.OcrWord(lineObj, id, word.text.trim(), wordbox); wordObj.lang = word.language; wordObj.conf = word.confidence; diff --git a/js/import/convertPageGoogleVision.js b/js/import/convertPageGoogleVision.js new file mode 100644 index 0000000..8655435 --- /dev/null +++ b/js/import/convertPageGoogleVision.js @@ -0,0 +1,204 @@ +import ocr from '../objects/ocrObjects.js'; + +import { + calcBboxUnion, + mean50, + descCharArr, + ascCharArr, + xCharArr, + removeSuperscript, +} from '../utils/miscUtils.js'; + +import { + LayoutDataTablePage, +} from '../objects/layoutObjects.js'; +import { pass3 } from './convertPageShared.js'; + +/** + * @param {Object} params + * @param {string} params.ocrStr - String or array of strings containing Google Vision JSON data. + * @param {number} params.n + */ +export async function convertPageGoogleVision({ ocrStr, n }) { + const ocrJson = JSON.parse(ocrStr); + const visionResult = ocrJson.fullTextAnnotation ? ocrJson : ocrJson?.[0]; + if (!visionResult || !visionResult.fullTextAnnotation) { + throw new Error('Failed to parse Google Vision OCR data.'); + } + + const pageVision = /** @type {GoogleVisionPage} */ (visionResult.fullTextAnnotation?.pages?.[0]); + const pageWidth = pageVision.width; + const pageHeight = pageVision.height; + if (!pageWidth || !pageHeight) { + throw new Error('Failed to parse page dimensions.'); + } + + const pageDims = { width: pageWidth, height: pageHeight }; + + const pageObj = new ocr.OcrPage(n, pageDims); + + if (!pageVision.blocks || pageVision.blocks.length === 0) { + const warn = { char: 'char_error' }; + return { + pageObj, + charMetricsObj: {}, + dataTables: new LayoutDataTablePage(n), + warn, + }; + } + + const tablesPage = new LayoutDataTablePage(n); + + /** @type {Array} */ + const angleRisePage = []; + + pageVision.blocks.forEach((block, blockIndex) => { + if (!block.paragraphs) return; + + block.paragraphs.forEach((paragraph, paragraphIndex) => { + const wordsVision = paragraph.words; + if (!wordsVision || wordsVision.length === 0) return; + + const xsPar = paragraph.boundingBox.vertices.map((v) => v.x || 0); + const ysPar = paragraph.boundingBox.vertices.map((v) => v.y || 0); + + const bboxPar = { + left: Math.min(...xsPar), + top: Math.min(...ysPar), + right: Math.max(...xsPar), + bottom: Math.max(...ysPar), + }; + + const parObj = new ocr.OcrPar(pageObj, bboxPar); + parObj.reason = String(block.blockType || 'TEXT'); + + let lineObj = new ocr.OcrLine(pageObj, null, [0, 0]); + let lineIndex = 0; + + wordsVision.forEach((word, wordIndex) => { + if (!word.symbols || word.symbols.length === 0) return; + + const xs = word.boundingBox.vertices.map((v) => v.x || 0); + const ys = word.boundingBox.vertices.map((v) => v.y || 0); + + const bboxWord = { + left: Math.min(...xs), + top: Math.min(...ys), + right: Math.max(...xs), + bottom: Math.max(...ys), + }; + + const id = `word_${n + 1}_${blockIndex + 1}_${paragraphIndex + 1}_${lineIndex + 1}_${wordIndex + 1}`; + + const wordText = word.symbols.map((symbol) => symbol.text || '').join(''); + + const incChars = false; + let charObjs = /** @type {?OcrChar[]} */ (null); + if (incChars) { + charObjs = []; + if (word.symbols) { + word.symbols.forEach((symbol) => { + const charXs = symbol.boundingBox.vertices.map((v) => v.x || 0); + const charYs = symbol.boundingBox.vertices.map((v) => v.y || 0); + const charBbox = { + left: Math.min(...charXs), + top: Math.min(...charYs), + right: Math.max(...charXs), + bottom: Math.max(...charYs), + }; + const charObj = new ocr.OcrChar(symbol.text || '', charBbox); + charObjs.push(charObj); + }); + } + } + + const wordObj = new ocr.OcrWord(lineObj, id, wordText, bboxWord); + wordObj.conf = (word.confidence || 0) * 100; + wordObj.chars = charObjs; + lineObj.words.push(wordObj); + + const hasLineBreak = word.symbols.some((symbol) => { + const breakType = symbol.property?.detectedBreak?.type; + return breakType === 'LINE_BREAK' || breakType === 'EOL_SURE_SPACE'; + }); + + if (hasLineBreak || wordIndex === wordsVision.length - 1) { + if (lineObj.words.length > 0) { + const wordBboxes = lineObj.words.map((w) => w.bbox); + lineObj.bbox = calcBboxUnion(wordBboxes); + + calculateTextMetrics(lineObj); + + pageObj.lines.push(lineObj); + parObj.lines.push(lineObj); + lineObj.par = parObj; + lineIndex++; + } + + if (wordIndex !== wordsVision.length - 1) { + lineObj = new ocr.OcrLine(pageObj, null, [0, 0]); + } + } + }); + + if (parObj.lines.length > 0) { + pageObj.pars.push(parObj); + } + }); + }); + + pageObj.lines.forEach((line) => { + const wordBoxArr = line.words.map((x) => x.bbox); + line.bbox = calcBboxUnion(wordBoxArr); + }); + + const angleRiseMedian = mean50(angleRisePage) || 0; + const angleOut = Math.asin(angleRiseMedian) * (180 / Math.PI); + pageObj.angle = angleOut; + pageObj.textSource = 'google_vision'; + + const langSet = pass3(pageObj); + + return { pageObj, dataTables: tablesPage, langSet }; +} + +/** + * Calculate text metrics for the line based on character types + * @param {OcrLine} lineObj - The line object to update + */ +function calculateTextMetrics(lineObj) { + const descCharRegex = new RegExp(`[${descCharArr.join('')}]`); + const ascCharRegex = new RegExp(`[${ascCharArr.join('')}]`); + const xCharRegex = new RegExp(`[${xCharArr.join('')}]`); + + const xOnlyWords = /** @type {OcrWord[]} */([]); + const ascOnlyWords = /** @type {OcrWord[]} */([]); + + lineObj.words.forEach((word) => { + if (word.text && xCharRegex.test(word.text) && !ascCharRegex.test(word.text) + && !descCharRegex.test(word.text) && !/[fi]/.test(word.text)) { + xOnlyWords.push(word); + } + if (word.text && ascCharRegex.test(word.text) && !descCharRegex.test(word.text)) { + ascOnlyWords.push(word); + } + }); + + let xHeight = null; + if (xOnlyWords.length > 0) { + xHeight = mean50(xOnlyWords.map((word) => word.bbox.bottom - word.bbox.top)); + } + + const ascHeight = ascOnlyWords.length > 0 + ? mean50(ascOnlyWords.map((word) => word.bbox.bottom - word.bbox.top)) + : null; + + if (xHeight && ascHeight && xHeight > ascHeight * 0.8) { + if (ascOnlyWords.length > xOnlyWords.length) { + xHeight = null; + } + } + + if (xHeight && xHeight > 0) lineObj.xHeight = xHeight; + if (ascHeight && ascHeight > 0) lineObj.ascHeight = ascHeight; +} diff --git a/js/import/convertPageHocr.js b/js/import/convertPageHocr.js index 86cb75d..dbb7f1a 100644 --- a/js/import/convertPageHocr.js +++ b/js/import/convertPageHocr.js @@ -211,7 +211,7 @@ export async function convertPageHocr({ // Tesseract LSTM already does this, however Tesseract Legacy combines entire lines into the same "word", // which makes good alignment impossible. if (wordLang === 'chi_sim') { - const wordObj = new ocr.OcrWord(lineObj, contentStrLetter, bbox, `${wordID}_${j}`); + const wordObj = new ocr.OcrWord(lineObj, `${wordID}_${j}`, contentStrLetter, bbox); wordObj.conf = wordConf; wordObj.lang = wordLang; @@ -240,7 +240,7 @@ export async function convertPageHocr({ bottom: Math.max(...bboxesCore.map((x) => x[3])), }; - const wordObj = new ocr.OcrWord(lineObj, text, wordBoxCore, wordID); + const wordObj = new ocr.OcrWord(lineObj, wordID, text, wordBoxCore); wordObj.lang = wordLang; wordObj.chars = charObjArr; @@ -305,7 +305,7 @@ export async function convertPageHocr({ const confMatch = titleStrWord.match(/(?:;|\s)x_wconf\s+(\d+)/)?.[1] || '0'; const wordConf = parseInt(confMatch) || 0; - const wordObj = new ocr.OcrWord(lineObj, wordText, wordBox, wordID); + const wordObj = new ocr.OcrWord(lineObj, wordID, wordText, wordBox); wordObj.lang = wordLang; // Font size is only respected if this is a re-import, as if an ocrWord object has `size` set, it will be used over line metrics. diff --git a/js/import/convertPageStext.js b/js/import/convertPageStext.js index 4151a63..3cc235d 100644 --- a/js/import/convertPageStext.js +++ b/js/import/convertPageStext.js @@ -534,7 +534,7 @@ export async function convertPageStext({ ocrStr, n }) { // Tesseract LSTM already does this, however Tesseract Legacy combines entire lines into the same "word", // which makes good alignment impossible. if (wordLang === 'chi_sim') { - const wordObj = new ocr.OcrWord(lineObj, letter, bbox, `${wordID}_${j}`); + const wordObj = new ocr.OcrWord(lineObj, `${wordID}_${j}`, letter, bbox); wordObj.conf = 100; wordObj.lang = wordLang; wordObj.visualCoords = false; @@ -560,7 +560,7 @@ export async function convertPageStext({ ocrStr, n }) { if (bbox.left < 0 && bbox.right < 0) continue; - const wordObj = new ocr.OcrWord(lineObj, wordText, bbox, wordID); + const wordObj = new ocr.OcrWord(lineObj, wordID, wordText, bbox); wordObj.style.size = fontSizeArr[i]; wordObj.lang = wordLang; diff --git a/js/import/convertPageText.js b/js/import/convertPageText.js index bc3f753..3d735ec 100644 --- a/js/import/convertPageText.js +++ b/js/import/convertPageText.js @@ -216,7 +216,7 @@ export async function convertPageText({ textStr, pageDims = null }) { bottom: lineBottom, }; const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`; - const wordObj = new ocr.OcrWord(lineObj, tok.text, wordBbox, wordId); + const wordObj = new ocr.OcrWord(lineObj, wordId, tok.text, wordBbox); wordObj.conf = 100; wordObj.style.font = FONT_FAMILY; lineObj.words.push(wordObj); @@ -239,7 +239,7 @@ export async function convertPageText({ textStr, pageDims = null }) { bottom: lineBottom, }; const wordId = `word_${pageIndex + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`; - const wordObj = new ocr.OcrWord(lineObj, nextTok.text, wordBbox, wordId); + const wordObj = new ocr.OcrWord(lineObj, wordId, nextTok.text, wordBbox); wordObj.conf = 100; wordObj.style.font = FONT_FAMILY; lineObj.words.push(wordObj); diff --git a/js/import/import.js b/js/import/import.js index 74ee989..e9c6696 100644 --- a/js/import/import.js +++ b/js/import/import.js @@ -6,10 +6,10 @@ import { layoutRegions, ocrAll, ocrAllRaw, - pageMetricsArr, + pageMetricsAll, } from '../containers/dataContainer.js'; import { FontCont } from '../containers/fontContainer.js'; -import { ImageCache, ImageWrapper } from '../containers/imageContainer.js'; +import { ImageCache } from '../containers/imageContainer.js'; import { extractInternalPDFText } from '../extractPDFText.js'; import { enableFontOpt, @@ -20,7 +20,7 @@ import { runFontOptimization } from '../fontEval.js'; import { calcCharMetricsFromPages } from '../fontStatistics.js'; import { calcSuppFontInfo } from '../fontSupp.js'; import { gs } from '../generalWorkerMain.js'; -import { imageUtils } from '../objects/imageObjects.js'; +import { imageUtils, ImageWrapper } from '../objects/imageObjects.js'; import { addCircularRefsDataTables, LayoutDataTablePage, LayoutPage } from '../objects/layoutObjects.js'; import { addCircularRefsOcr } from '../objects/ocrObjects.js'; import { PageMetrics } from '../objects/pageMetricsObjects.js'; @@ -285,9 +285,9 @@ export async function importFiles(files) { for (let i = 0; i < ocrAll[oemName].length; i++) { inputData.xmlMode[i] = true; if (ocrAll[oemName][i].dims.height && ocrAll[oemName][i].dims.width) { - pageMetricsArr[i] = new PageMetrics(ocrAll[oemName][i].dims); + pageMetricsAll[i] = new PageMetrics(ocrAll[oemName][i].dims); } - pageMetricsArr[i].angle = ocrAll[oemName][i].angle; + pageMetricsAll[i].angle = ocrAll[oemName][i].angle; } } @@ -295,10 +295,9 @@ export async function importFiles(files) { let pageCount; let pageCountImage; - let abbyyMode = false; - let textractMode = false; + /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ + let format; let reimportHocrMode = false; - let textMode = false; if (inputData.pdfMode) { const pdfFile = pdfFiles[0]; @@ -319,7 +318,6 @@ export async function importFiles(files) { let existingOpt = false; const oemName = 'User Upload'; - let stextMode; if (xmlModeImport) { // Initialize a new array on `ocrAll` if one does not already exist if (!ocrAll[oemName]) ocrAll[oemName] = Array(inputData.pageCount); @@ -327,6 +325,8 @@ export async function importFiles(files) { const ocrData = await importOCRFiles(Array.from(ocrFiles)); + format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format); + ocrAllRaw.active = ocrData.hocrRaw; // Subset OCR data to avoid uncaught error that occurs when there are more pages of OCR data than image data. // While this should be rare, it appears to be fairly common with Archive.org documents. @@ -379,19 +379,15 @@ export async function importFiles(files) { existingLayoutDataTable = true; } - abbyyMode = ocrData.abbyyMode; + format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format); reimportHocrMode = ocrData.reimportHocrMode; - - stextMode = ocrData.stextMode; - textractMode = ocrData.textractMode; - textMode = ocrData.textMode; } let pageCountOcr = ocrAllRaw.active?.length || ocrAll.active?.length || 0; // For Textract, `ocrAllRaw.active[0]` is a string containing the Textract JSON data for all pages. // This ad-hoc solution counts the number of "PAGE" blocks in the Textract JSON data. - if (textractMode && ocrAllRaw.active?.length) { + if (format === 'textract' && ocrAllRaw.active?.length) { pageCountOcr = ocrAllRaw.active[0].match(/"BLOCKTYPE":\s*"PAGE"/ig)?.length || pageCountOcr; } @@ -430,7 +426,7 @@ export async function importFiles(files) { ImageCache.nativeSrc[i] = await importImageFileToBase64(imageFiles[i]).then(async (imgStr) => { const imgWrapper = new ImageWrapper(i, imgStr, 'native', false, false); const imageDims = await imageUtils.getDims(imgWrapper); - pageMetricsArr[i] = new PageMetrics(imageDims); + pageMetricsAll[i] = new PageMetrics(imageDims); return imgWrapper; }); ImageCache.loadCount++; @@ -439,17 +435,11 @@ export async function importFiles(files) { } if (xmlModeImport) { - /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ - let format = 'hocr'; - if (abbyyMode) format = 'abbyy'; - if (stextMode) format = 'stext'; - if (textractMode) format = 'textract'; - if (textMode) format = 'text'; - - // Process HOCR using web worker, reading from file first if that has not been done already - await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode, pageMetricsArr).then(async () => { - // Skip this step if optimization info was already restored from a previous session, or if using stext (which is character-level but not visually accurate). - if (!existingOpt && !stextMode) { + // Process OCR using web worker, reading from file first if that has not been done already + await convertOCR(ocrAllRaw.active, true, format, oemName, reimportHocrMode, pageMetricsAll).then(async () => { + // Skip this step if optimization info was already restored from a previous session, + // or if using stext/textract (which are character-level but not visually accurate). + if (!existingOpt && !['stext', 'textract'].includes(format)) { await checkCharWarn(convertPageWarn); const charMetrics = calcCharMetricsFromPages(ocrAll.active); @@ -498,12 +488,7 @@ export async function importFilesSupp(files, ocrName) { opt.warningHandler(warningHTML); } - /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ - let format = 'hocr'; - if (ocrData.abbyyMode) format = 'abbyy'; - if (ocrData.stextMode) format = 'stext'; - if (ocrData.textractMode) format = 'textract'; - if (ocrData.textMode) format = 'text'; + const format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format); await convertOCR(ocrData.hocrRaw, false, format, ocrName, ocrData.reimportHocrMode); } diff --git a/js/import/importOCR.js b/js/import/importOCR.js index 4c9272b..f83881d 100644 --- a/js/import/importOCR.js +++ b/js/import/importOCR.js @@ -8,6 +8,48 @@ export const splitHOCRStr = (hocrStrAll) => hocrStrAll.replace(/[\s\S]*?/, .trim() .split(/(?=
{ + if (ext) { + ext = ext.replace(/^\./, '').toLowerCase(); + if (ext === 'hocr') { + return 'hocr'; + } if (ext === 'stext') { + return 'stext'; + } + } + + // Check whether input is Abbyy XML + // TODO: The auto-detection of formats needs to be more robust. + // At present, any string that contains ">" and "abbyy" is considered Abbyy XML. + const node2 = ocrStr.match(/>([^>]+)/)?.[1]; + + if (!!node2 && !!/abbyy/i.test(node2)) { + return 'abbyy'; + } if (!!node2 && !!/" and "abbyy" is considered Abbyy XML. - const node2 = hocrStrAll.match(/>([^>]+)/)?.[1]; - abbyyMode = !!node2 && !!/abbyy/i.test(node2); - stextMode = !!node2 && !!//)?.[0]; hocrRaw = splitHOCRStr(hocrStrAll); - } else { - console.error(ocrFilesAll[0]); - throw new Error('No supported OCR format detected.'); } pageCountHOCR = hocrRaw.length; @@ -78,9 +110,13 @@ export async function importOCRFiles(ocrFilesAll) { // Check whether input is Abbyy XML using the first file const hocrStrFirst = await readOcrFile(ocrFilesAll[0]); - const node2 = hocrStrFirst.match(/>([^>]+)/)?.[1]; - abbyyMode = !!node2 && !!/abbyy/i.test(node2); - textractMode = !abbyyMode && !!/"AnalyzeDocumentModelVersion"/i.test(hocrStrFirst); + + format = detectOcrFormat(hocrStrFirst); + + if (!format) { + console.error(ocrFilesAll[0]); + throw new Error('No supported OCR format detected.'); + } for (let i = 0; i < pageCountHOCR; i++) { const hocrFile = ocrFilesAll[i]; @@ -88,7 +124,7 @@ export async function importOCRFiles(ocrFilesAll) { } } - if (hocrMode && hocrStrStart) { + if (format === 'hocr' && hocrStrStart) { const getMeta = (name) => { const regex = new RegExp(` { ocrAll.active[i] = res.data.page; improveCt += res.data.improveCt; diff --git a/js/objects/imageObjects.js b/js/objects/imageObjects.js index 4b782fc..44f6846 100644 --- a/js/objects/imageObjects.js +++ b/js/objects/imageObjects.js @@ -14,8 +14,9 @@ export class ImageWrapper { constructor(n, imageStr, colorMode, rotated = false, upscaled = false) { this.n = n; this.src = imageStr; - const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1]; + const format0 = /** @type {'png'|'jpeg'|undefined} */ (imageStr.match(/^data:image\/(png|jpeg)/)?.[1]); if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`); + /** @type {'png'|'jpeg'} */ this.format = format0; this._dims = null; this.rotated = rotated; @@ -28,7 +29,7 @@ export class ImageWrapper { /** * - * @param {import('../containers/imageContainer.js').ImageWrapper} img + * @param {ImageWrapper} img * @returns {dims} */ const getDims = (img) => { diff --git a/js/objects/ocrObjects.js b/js/objects/ocrObjects.js index 2102c7f..7363aaa 100644 --- a/js/objects/ocrObjects.js +++ b/js/objects/ocrObjects.js @@ -92,11 +92,12 @@ export function OcrLine(page, bbox, baseline, ascHeight = null, xHeight = null) /** * @param {OcrLine} line + * @param {string} id * @param {string} text * @param {bbox} bbox - * @param {string} id + * @param {Polygon} [poly] */ -export function OcrWord(line, text, bbox, id) { +export function OcrWord(line, id, text, bbox, poly) { /** @type {string} */ this.text = text; /** @type {?string} */ @@ -118,6 +119,8 @@ export function OcrWord(line, text, bbox, id) { this.conf = 0; /** @type {bbox} */ this.bbox = bbox; + /** @type {?Polygon} */ + this.poly = poly || null; /** @type {boolean} */ this.compTruth = false; /** @type {boolean} */ @@ -618,7 +621,8 @@ function cloneLine(line) { * @param {OcrWord} word */ function cloneWord(word) { - const wordNew = new OcrWord(word.line, word.text, { ...word.bbox }, word.id); + const wordNew = new OcrWord(word.line, word.id, word.text, { ...word.bbox }); + if (word.poly) wordNew.poly = { ...word.poly }; wordNew.conf = word.conf; wordNew.style = { ...word.style }; wordNew.lang = word.lang; diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js index 9232e98..d0d0fda 100644 --- a/js/recognizeConvert.js +++ b/js/recognizeConvert.js @@ -3,14 +3,15 @@ import { inputData, opt } from './containers/app.js'; import { convertPageWarn, DebugData, - layoutDataTables, layoutRegions, ocrAll, pageMetricsArr, visInstructions, + layoutDataTables, layoutRegions, ocrAll, pageMetricsAll, visInstructions, } from './containers/dataContainer.js'; import { FontCont } from './containers/fontContainer.js'; -import { ImageCache, ImageWrapper } from './containers/imageContainer.js'; +import { ImageCache } from './containers/imageContainer.js'; import { loadBuiltInFontsRaw, loadChiSimFont } from './fontContainerMain.js'; import { runFontOptimization } from './fontEval.js'; import { calcCharMetricsFromPages } from './fontStatistics.js'; import { gs } from './generalWorkerMain.js'; +import { ImageWrapper } from './objects/imageObjects.js'; import { LayoutDataTablePage, LayoutPage } from './objects/layoutObjects.js'; import { PageMetrics } from './objects/pageMetricsObjects.js'; import { clearObjectProperties } from './utils/miscUtils.js'; @@ -38,7 +39,7 @@ export const compareOCRPage = async (pageA, pageB, options) => { const binaryImage = skipImage ? null : await ImageCache.getBinary(pageA.n); - const pageMetricsObj = pageMetricsArr[pageA.n]; + const pageMetricsObj = pageMetricsAll[pageA.n]; return gs.compareOCRPageImp({ pageA, pageB, binaryImage, pageMetricsObj, options, }); @@ -53,7 +54,7 @@ export const compareOCRPage = async (pageA, pageB, options) => { export const evalOCRPage = async (params) => { const n = 'page' in params.page ? params.page.page.n : params.page.n; const binaryImage = await ImageCache.getBinary(n); - const pageMetricsObj = pageMetricsArr[n]; + const pageMetricsObj = pageMetricsAll[n]; return gs.evalPageBase({ page: params.page, binaryImage, pageMetricsObj, func: params.func, view: params.view, }); @@ -119,7 +120,7 @@ export const calcRecognizeRotateArgs = async (n, areaMode) => { // Threshold (in radians) under which page angle is considered to be effectively 0. const angleThresh = 0.0008726646; - const angle = pageMetricsArr[n]?.angle; + const angle = pageMetricsAll[n]?.angle; // Whether the page angle is already known (or needs to be detected) const angleKnown = typeof (angle) === 'number'; @@ -178,7 +179,7 @@ export const recognizePageImp = async (n, legacy, lstm, areaMode, tessOptions = ...tessOptions, }; - const pageDims = pageMetricsArr[n].dims; + const pageDims = pageMetricsAll[n].dims; // If `legacy` and `lstm` are both `false`, recognition is not run, but layout analysis is. // This combination of options would be set for debug mode, where the point of running Tesseract @@ -201,7 +202,7 @@ export const recognizePageImp = async (n, legacy, lstm, areaMode, tessOptions = debugVis, }, n, - knownAngle: pageMetricsArr[n].angle, + knownAngle: pageMetricsAll[n].angle, pageDims, }); @@ -220,7 +221,7 @@ export const recognizePageImp = async (n, legacy, lstm, areaMode, tessOptions = // parseDebugInfo(res0.recognize.debug); - if (!angleKnown) pageMetricsArr[n].angle = (res0.recognize.rotateRadians || 0) * (180 / Math.PI) * -1; + if (!angleKnown) pageMetricsAll[n].angle = (res0.recognize.rotateRadians || 0) * (180 / Math.PI) * -1; // An image is rotated if either the source was rotated or rotation was applied by Tesseract. const isRotated = Boolean(res0.recognize.rotateRadians || 0) || nativeN.rotated; @@ -290,7 +291,7 @@ export function checkCharWarn(warnArr) { * @param {boolean} mainData - Whether this is the "main" data that document metrics are calculated from. * For imports of user-provided data, the first data provided should be flagged as the "main" data. * For Tesseract.js recognition, the Tesseract Legacy results should be flagged as the "main" data. - * @param {("hocr"|"abbyy"|"stext"|"textract"|"text")} format - Format of raw data. + * @param {("hocr"|"abbyy"|"stext"|"textract"|"google_vision"|"text")} format - Format of raw data. * @param {string} engineName - Name of OCR engine. * @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program. */ @@ -301,8 +302,10 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode }); } else if (format === 'abbyy') { res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n }); - // } else if (format === 'textract') { - // res = await gs.convertPageTextract({ ocrStr: ocrRaw, n }); + } else if (format === 'textract') { + // res = await gs.convertPageTextract({ ocrStr: ocrRaw, n }); + } else if (format === 'google_vision') { + res = await gs.convertPageGoogleVision({ ocrStr: ocrRaw, n }); } else if (format === 'stext') { res = await gs.convertPageStext({ ocrStr: ocrRaw, n }); } else if (format === 'text') { @@ -347,9 +350,9 @@ export async function convertPageCallback({ // The main OCR data is always preferred for setting page metrics. // This matters when the user uploads their own data, as the images are expected to be rendered at the same resolution as the OCR data. - if (pageObj.dims.height && pageObj.dims.width) pageMetricsArr[n] = new PageMetrics(pageObj.dims); + if (pageObj.dims.height && pageObj.dims.width) pageMetricsAll[n] = new PageMetrics(pageObj.dims); - pageMetricsArr[n].angle = pageObj.angle; + pageMetricsAll[n].angle = pageObj.angle; } inputData.xmlMode[n] = true; @@ -368,7 +371,7 @@ export async function convertPageCallback({ * @param {boolean} mainData - Whether this is the "main" data that document metrics are calculated from. * For imports of user-provided data, the first data provided should be flagged as the "main" data. * For Tesseract.js recognition, the Tesseract Legacy results should be flagged as the "main" data. - * @param {("hocr"|"abbyy"|"stext"|"textract"|"text")} format - Format of raw data. + * @param {("hocr"|"abbyy"|"stext"|"textract"|"google_vision"|"text")} format - Format of raw data. * @param {string} engineName - Name of OCR engine. * @param {boolean} [scribeMode=false] - Whether this is HOCR data from this program. * @param {?PageMetrics[]} [pageMetrics=null] - Page metrics to use for the pages (Textract only). @@ -376,7 +379,7 @@ export async function convertPageCallback({ export async function convertOCR(ocrRawArr, mainData, format, engineName, scribeMode, pageMetrics = null) { const promiseArr = []; if (format === 'textract') { - if (!pageMetrics) throw new Error('Page metrics must be provided for Textract data.'); + if (!pageMetrics || !pageMetrics[0]?.dims) throw new Error('Page metrics must be provided for Textract data.'); const pageDims = pageMetrics.map((metrics) => (metrics.dims)); const res = await gs.convertDocTextract({ ocrStr: ocrRawArr, pageDims }); for (let n = 0; n < res.length; n++) { diff --git a/js/utils/fontUtils.js b/js/utils/fontUtils.js index 98e07ec..90517ea 100644 --- a/js/utils/fontUtils.js +++ b/js/utils/fontUtils.js @@ -8,6 +8,37 @@ import { FontProps, quantile } from './miscUtils.js'; import opentype from '../../lib/opentype.module.js'; import { opt } from '../containers/app.js'; +/** + * Return an array of all characters used in the provided OCR data. + * Used for subsetting fonts to only the necessary characters. + * @param {Array} ocrPageArr + * @param {string} [family] - Font family to filter by. If empty, all fonts are included. + * @param {string} [style] - Font style to filter by. If empty, all styles are included. + * + */ +export const getDistinctCharsFont = (ocrPageArr, family, style) => { + const charsAll = {}; + for (const ocrPage of ocrPageArr) { + for (const ocrLine of ocrPage.lines) { + for (const ocrWord of ocrLine.words) { + if (family || style) { + const wordFont = FontCont.getWordFont(ocrWord); + if (!wordFont) continue; + if (family && wordFont.family !== family) continue; + // Sometimes the font is 'normal' even when the requested style is 'bold' or 'italic'. + // For example, this currently happens for the Chinese font, which has no bold or italic variants. + // Therefore, as a quick fix for now, only filter by style if the current style is not 'normal'. + if (style && wordFont.style !== style && (wordFont.style !== 'normal')) continue; + } + ocrWord.text.split('').forEach((x) => { + charsAll[x] = true; + }); + } + } + } + return Object.keys(charsAll); +}; + /** * * @param {import('opentype.js').Font} font diff --git a/js/utils/imageUtils.js b/js/utils/imageUtils.js index ded3418..4f6fdb7 100644 --- a/js/utils/imageUtils.js +++ b/js/utils/imageUtils.js @@ -97,6 +97,10 @@ export const importImageFileToBase64 = async (file) => new Promise((resolve, rej * @returns {Uint8Array} The byte array representation of the image data. */ export function base64ToBytes(base64) { + const commaIndex = base64.slice(0, 100).indexOf(','); + if (commaIndex > 0) { + base64 = base64.slice(commaIndex + 1); + } const binaryString = atob(base64); const len = binaryString.length; const bytes = new Uint8Array(len); @@ -106,6 +110,46 @@ export function base64ToBytes(base64) { return bytes; } +/** + * Extracts complete IHDR information from a PNG image encoded in base64. + * + * @param {Uint8Array} bytes - The base64 encoded string of the PNG image. + * @returns {PngIHDRInfo} + */ +export function getPngIHDRInfo(bytes) { + // The IHDR chunk data starts at byte 16 (after PNG signature and IHDR chunk header) + // Width: bytes 16-19 (4 bytes, big-endian) + const width = (bytes[16] << 24) | (bytes[17] << 16) | (bytes[18] << 8) | bytes[19]; + + // Height: bytes 20-23 (4 bytes, big-endian) + const height = (bytes[20] << 24) | (bytes[21] << 16) | (bytes[22] << 8) | bytes[23]; + + // Bit depth: byte 24 (1 byte) + const bitDepth = bytes[24]; + + // Color type: byte 25 (1 byte) + const colorType = bytes[25]; + + // Compression method: byte 26 (1 byte, always 0 for PNG) + const compressionMethod = bytes[26]; + + // Filter method: byte 27 (1 byte, always 0 for PNG) + const filterMethod = bytes[27]; + + // Interlace method: byte 28 (1 byte, 0=none, 1=Adam7) + const interlaceMethod = bytes[28]; + + return { + width, + height, + bitDepth, + colorType, + compressionMethod, + filterMethod, + interlaceMethod, + }; +} + /** * Extracts the width and height from the IHDR chunk of a PNG image encoded in base64. * diff --git a/js/worker/compareOCRModule.js b/js/worker/compareOCRModule.js index 84c5525..998c6dc 100644 --- a/js/worker/compareOCRModule.js +++ b/js/worker/compareOCRModule.js @@ -503,7 +503,7 @@ async function penalizeWord(wordObjs) { * @param {object} params * @param {OcrPage} params.pageA * @param {OcrPage} params.pageB - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {object} params.options * @param {("stats"|"comb")} [params.options.mode='stats'] - If `mode = 'stats'` stats quantifying the number of matches/mismatches are returned. @@ -1216,7 +1216,7 @@ export async function checkWords(wordsA, binaryImage, imageRotated, pageMetricsO /** * @param {Object} params * @param {OcrPage|OcrLine} params.page - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {?function} [params.func=null] * @param {boolean} [params.view=false] - Draw results on debugging canvases @@ -1277,7 +1277,7 @@ export async function evalPageBase({ /** * @param {Object} params * @param {OcrPage} params.page - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {string} params.font * @param {boolean} [params.opt=false] - Whether to use the optimized font set @@ -1338,7 +1338,7 @@ export async function evalPageFont({ /** * @param {Object} params * @param {OcrPage} params.page - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {function} params.func * @param {boolean} params.view @@ -1410,7 +1410,7 @@ export async function nudgePageBase({ /** * @param {Object} params * @param {OcrPage} params.page - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {boolean} params.view * @returns @@ -1432,7 +1432,7 @@ export async function nudgePageFontSize({ /** * @param {Object} params * @param {OcrPage} params.page - * @param {import('../containers/imageContainer.js').ImageWrapper} params.binaryImage + * @param {ImageWrapper} params.binaryImage * @param {PageMetrics} params.pageMetricsObj * @param {boolean} params.view * @returns @@ -1454,7 +1454,7 @@ export async function nudgePageBaseline({ * This function is a WIP and not all options are implemented. * @param {Object} args * @param {OcrPage} args.page - Page to render. - * @param {import('../containers/imageContainer.js').ImageWrapper} [args.image] + * @param {ImageWrapper} [args.image] * @param {dims} [args.pageDims] - Dimensions of page. * @param {?number} [args.angle=0] - Angle of page. * @param {("proof" | "invis" | "ebook" | "eval")} [args.displayMode='proof'] - Display mode. diff --git a/js/worker/generalWorker.js b/js/worker/generalWorker.js index 1fb4dbf..9ee67c7 100644 --- a/js/worker/generalWorker.js +++ b/js/worker/generalWorker.js @@ -3,6 +3,7 @@ import { convertPageBlocks } from '../import/convertPageBlocks.js'; import { convertPageHocr } from '../import/convertPageHocr.js'; import { convertPageStext } from '../import/convertPageStext.js'; import { convertDocTextract } from '../import/convertDocTextract.js'; +import { convertPageGoogleVision } from '../import/convertPageGoogleVision.js'; import { convertPageText } from '../import/convertPageText.js'; import { FontCont, loadFontsFromSource } from '../containers/fontContainer.js'; @@ -401,6 +402,7 @@ const handleMessage = async (data) => { convertPageHocr, convertPageStext, convertDocTextract, + convertPageGoogleVision, convertPageBlocks, convertPageText, diff --git a/scribe.js b/scribe.js index 925c86b..943346c 100644 --- a/scribe.js +++ b/scribe.js @@ -4,7 +4,7 @@ import { DebugData, layoutDataTables, layoutRegions, - ocrAll, pageMetricsArr, visInstructions, + ocrAll, pageMetricsAll, visInstructions, } from './js/containers/dataContainer.js'; import { FontCont } from './js/containers/fontContainer.js'; import { ImageCache } from './js/containers/imageContainer.js'; @@ -12,7 +12,7 @@ import coords from './js/coordinates.js'; import { drawDebugImages, renderPageStatic } from './js/debug.js'; import { download, exportData } from './js/export/export.js'; import { convertToCsv, writeDebugCsv } from './js/export/exportDebugCsv.js'; -import { writePdf } from './js/export/writePdf.js'; +import { writePdf } from './js/export/pdf/writePdf.js'; import { writeHocr } from './js/export/writeHocr.js'; import { writeText } from './js/export/writeText.js'; import { extractInternalPDFText } from './js/extractPDFText.js'; @@ -181,7 +181,7 @@ class data { static ocr = ocrAll; - static pageMetrics = pageMetricsArr; + static pageMetrics = pageMetricsAll; static vis = visInstructions; } diff --git a/tests/assets/ascenders_descenders_test.png b/tests/assets/ascenders_descenders_test.png new file mode 100644 index 0000000..2dcb3a0 Binary files /dev/null and b/tests/assets/ascenders_descenders_test.png differ diff --git a/tests/assets/ascenders_descenders_test_Abbyy.xml b/tests/assets/ascenders_descenders_test_Abbyy.xml new file mode 100644 index 0000000..ed39f86 --- /dev/null +++ b/tests/assets/ascenders_descenders_test_Abbyy.xml @@ -0,0 +1,40 @@ + + + + + + + +A +s +c +e +n +d +e +r +s + +O +n + +q +u +e +r +y + +p +n +g + +w +e + +c +a +n + + + + \ No newline at end of file diff --git a/tests/assets/ascenders_descenders_test_AwsTextract.json b/tests/assets/ascenders_descenders_test_AwsTextract.json new file mode 100644 index 0000000..ec659fc --- /dev/null +++ b/tests/assets/ascenders_descenders_test_AwsTextract.json @@ -0,0 +1,375 @@ +{ + "$metadata": { + "httpStatusCode": 200, + "requestId": "d29c88a3-16a4-4b98-9362-1662b33bd854", + "attempts": 1, + "totalRetryDelay": 0 + }, + "Blocks": [ + { + "BlockType": "PAGE", + "Geometry": { + "BoundingBox": { + "Height": 1, + "Left": 0, + "Top": 0, + "Width": 1 + }, + "Polygon": [ + { + "X": 0, + "Y": 3.490975757358683e-7 + }, + { + "X": 1, + "Y": 0 + }, + { + "X": 1, + "Y": 1 + }, + { + "X": 0.0000017131741287812474, + "Y": 1 + } + ] + }, + "Id": "58ba5c76-286d-4d94-a49e-efe83684f00a", + "Relationships": [ + { + "Ids": [ + "0ca4049f-ad02-4033-98f9-cf2a6fd500ca", + "d2d459e3-2ae1-48cf-b428-c8854cc592eb", + "c4f16fde-2387-4d5a-a50a-871c88c748e3" + ], + "Type": "CHILD" + } + ] + }, + { + "BlockType": "LINE", + "Confidence": 99.9402084350586, + "Geometry": { + "BoundingBox": { + "Height": 0.15146490931510925, + "Left": 0.09749367088079453, + "Top": 0.17868299782276154, + "Width": 0.6611259579658508 + }, + "Polygon": [ + { + "X": 0.09749367088079453, + "Y": 0.1792251169681549 + }, + { + "X": 0.7585582137107849, + "Y": 0.17868299782276154 + }, + { + "X": 0.7586196064949036, + "Y": 0.32960548996925354 + }, + { + "X": 0.0975649505853653, + "Y": 0.3301478922367096 + } + ] + }, + "Id": "0ca4049f-ad02-4033-98f9-cf2a6fd500ca", + "Relationships": [ + { + "Ids": [ + "8b70e58e-d61c-426b-8792-3345a907082f", + "a4d9dbdc-4c38-4316-817d-1268e7cb4df8" + ], + "Type": "CHILD" + } + ], + "Text": "Ascenders On" + }, + { + "BlockType": "LINE", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14423586428165436, + "Left": 0.09872987866401672, + "Top": 0.47797122597694397, + "Width": 0.4705139994621277 + }, + "Polygon": [ + { + "X": 0.09872987866401672, + "Y": 0.4783574342727661 + }, + { + "X": 0.5691826939582825, + "Y": 0.47797122597694397 + }, + { + "X": 0.5692439079284668, + "Y": 0.6218206882476807 + }, + { + "X": 0.09879780560731888, + "Y": 0.6222071051597595 + } + ] + }, + "Id": "d2d459e3-2ae1-48cf-b428-c8854cc592eb", + "Relationships": [ + { + "Ids": [ + "94a97c49-3b20-47c8-a425-3c8c92fa8f90", + "237b5c3c-9561-4a2d-a870-cfa8a4ae7c90" + ], + "Type": "CHILD" + } + ], + "Text": "query png" + }, + { + "BlockType": "LINE", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.10038164258003235, + "Left": 0.09615112841129303, + "Top": 0.7339208722114563, + "Width": 0.33016639947891235 + }, + "Polygon": [ + { + "X": 0.09615112841129303, + "Y": 0.7341921329498291 + }, + { + "X": 0.42627352476119995, + "Y": 0.7339208722114563 + }, + { + "X": 0.4263175427913666, + "Y": 0.8340312242507935 + }, + { + "X": 0.0961984246969223, + "Y": 0.834302544593811 + } + ] + }, + "Id": "c4f16fde-2387-4d5a-a50a-871c88c748e3", + "Relationships": [ + { + "Ids": [ + "d17465ee-b2a6-4fe6-9932-95be7b0fad79", + "ba62d2ca-9a1f-424f-92b3-a22e7880ae74" + ], + "Type": "CHILD" + } + ], + "Text": "we can" + }, + { + "BlockType": "WORD", + "Confidence": 99.88042449951172, + "Geometry": { + "BoundingBox": { + "Height": 0.14488966763019562, + "Left": 0.097494937479496, + "Top": 0.1815125048160553, + "Width": 0.4830120801925659 + }, + "Polygon": [ + { + "X": 0.097494937479496, + "Y": 0.18190857768058777 + }, + { + "X": 0.5804457068443298, + "Y": 0.1815125048160553 + }, + { + "X": 0.5805070400238037, + "Y": 0.3260059356689453 + }, + { + "X": 0.09756318479776382, + "Y": 0.3264021873474121 + } + ] + }, + "Id": "8b70e58e-d61c-426b-8792-3345a907082f", + "Text": "Ascenders", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.15104113519191742, + "Left": 0.6139811277389526, + "Top": 0.17868299782276154, + "Width": 0.14463846385478973 + }, + "Polygon": [ + { + "X": 0.6139811277389526, + "Y": 0.1788015514612198 + }, + { + "X": 0.7585582137107849, + "Y": 0.17868299782276154 + }, + { + "X": 0.7586196064949036, + "Y": 0.32960548996925354 + }, + { + "X": 0.6140446662902832, + "Y": 0.32972413301467896 + } + ] + }, + "Id": "a4d9dbdc-4c38-4316-817d-1268e7cb4df8", + "Text": "On", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14040860533714294, + "Left": 0.09873093664646149, + "Top": 0.48037731647491455, + "Width": 0.26501893997192383 + }, + "Polygon": [ + { + "X": 0.09873093664646149, + "Y": 0.48059481382369995 + }, + { + "X": 0.3636873662471771, + "Y": 0.48037731647491455 + }, + { + "X": 0.3637498915195465, + "Y": 0.6205683350563049 + }, + { + "X": 0.09879713505506516, + "Y": 0.6207859516143799 + } + ] + }, + "Id": "94a97c49-3b20-47c8-a425-3c8c92fa8f90", + "Text": "query", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14399538934230804, + "Left": 0.3915230929851532, + "Top": 0.47797122597694397, + "Width": 0.1777207851409912 + }, + "Polygon": [ + { + "X": 0.3915230929851532, + "Y": 0.47811707854270935 + }, + { + "X": 0.5691826939582825, + "Y": 0.47797122597694397 + }, + { + "X": 0.5692439079284668, + "Y": 0.6218206882476807 + }, + { + "X": 0.3915868401527405, + "Y": 0.6219666004180908 + } + ] + }, + "Id": "237b5c3c-9561-4a2d-a870-cfa8a4ae7c90", + "Text": "png", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.09894753247499466, + "Left": 0.09615166485309601, + "Top": 0.7352153062820435, + "Width": 0.13559171557426453 + }, + "Polygon": [ + { + "X": 0.09615166485309601, + "Y": 0.7353266477584839 + }, + { + "X": 0.23169800639152527, + "Y": 0.7352153062820435 + }, + { + "X": 0.23174338042736053, + "Y": 0.8340514302253723 + }, + { + "X": 0.09619835764169693, + "Y": 0.8341628313064575 + } + ] + }, + "Id": "d17465ee-b2a6-4fe6-9932-95be7b0fad79", + "Text": "we", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.10024342685937881, + "Left": 0.2643112242221832, + "Top": 0.7339208722114563, + "Width": 0.16200631856918335 + }, + "Polygon": [ + { + "X": 0.2643112242221832, + "Y": 0.7340539693832397 + }, + { + "X": 0.42627352476119995, + "Y": 0.7339208722114563 + }, + { + "X": 0.4263175427913666, + "Y": 0.8340312242507935 + }, + { + "X": 0.2643568515777588, + "Y": 0.8341643214225769 + } + ] + }, + "Id": "ba62d2ca-9a1f-424f-92b3-a22e7880ae74", + "Text": "can", + "TextType": "PRINTED" + } + ], + "DetectDocumentTextModelVersion": "1.0", + "DocumentMetadata": { + "Pages": 1 + } +} diff --git a/tests/assets/ascenders_descenders_test_AwsTextractLayout.json b/tests/assets/ascenders_descenders_test_AwsTextractLayout.json new file mode 100644 index 0000000..eb54a99 --- /dev/null +++ b/tests/assets/ascenders_descenders_test_AwsTextractLayout.json @@ -0,0 +1,501 @@ +{ + "$metadata": { + "httpStatusCode": 200, + "requestId": "50a39cf1-0deb-4cb8-9c03-842efb33df5c", + "attempts": 1, + "totalRetryDelay": 0 + }, + "AnalyzeDocumentModelVersion": "1.0", + "Blocks": [ + { + "BlockType": "PAGE", + "Geometry": { + "BoundingBox": { + "Height": 1, + "Left": 0, + "Top": 0, + "Width": 1 + }, + "Polygon": [ + { + "X": 0, + "Y": 3.490975757358683e-7 + }, + { + "X": 1, + "Y": 0 + }, + { + "X": 1, + "Y": 1 + }, + { + "X": 0.0000017131864069597214, + "Y": 1 + } + ] + }, + "Id": "afbb8375-f764-412b-a398-e716e6e98b90", + "Relationships": [ + { + "Ids": [ + "11aebad3-6e4a-4b4f-a1ab-4472c590624b", + "4941b103-31e3-45b6-b88e-ea5ec0d940b3", + "e153e383-bef8-498f-84ae-0933f7f7dfd5", + "a262c636-1c75-4dbb-84ad-6b12b8dc203a", + "e3b1bf3e-4410-4b78-a45e-f47527445f86", + "741cbd67-06da-4525-b466-b0ddca3ddda8" + ], + "Type": "CHILD" + } + ] + }, + { + "BlockType": "LINE", + "Confidence": 99.9402084350586, + "Geometry": { + "BoundingBox": { + "Height": 0.15146490931510925, + "Left": 0.09749366343021393, + "Top": 0.17868299782276154, + "Width": 0.661125898361206 + }, + "Polygon": [ + { + "X": 0.09749366343021393, + "Y": 0.1792251169681549 + }, + { + "X": 0.7585582137107849, + "Y": 0.17868299782276154 + }, + { + "X": 0.7586195468902588, + "Y": 0.32960548996925354 + }, + { + "X": 0.0975649505853653, + "Y": 0.3301478922367096 + } + ] + }, + "Id": "11aebad3-6e4a-4b4f-a1ab-4472c590624b", + "Relationships": [ + { + "Ids": [ + "68bd9f9f-ea85-4fa1-b2c1-5046353175af", + "d20c4d45-dbd2-4c21-a330-94da8871896e" + ], + "Type": "CHILD" + } + ], + "Text": "Ascenders On" + }, + { + "BlockType": "LINE", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14423586428165436, + "Left": 0.09872987866401672, + "Top": 0.47797122597694397, + "Width": 0.4705139994621277 + }, + "Polygon": [ + { + "X": 0.09872987866401672, + "Y": 0.4783574342727661 + }, + { + "X": 0.5691826343536377, + "Y": 0.47797122597694397 + }, + { + "X": 0.569243848323822, + "Y": 0.6218206882476807 + }, + { + "X": 0.09879779815673828, + "Y": 0.6222071051597595 + } + ] + }, + "Id": "4941b103-31e3-45b6-b88e-ea5ec0d940b3", + "Relationships": [ + { + "Ids": [ + "10b52be6-b001-4929-baad-d955700ed904", + "b12a0cc3-6dbd-45ed-8a07-62bc8b658cc7" + ], + "Type": "CHILD" + } + ], + "Text": "query png" + }, + { + "BlockType": "LINE", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.10038164258003235, + "Left": 0.09615112096071243, + "Top": 0.7339208722114563, + "Width": 0.33016639947891235 + }, + "Polygon": [ + { + "X": 0.09615112096071243, + "Y": 0.7341921329498291 + }, + { + "X": 0.42627349495887756, + "Y": 0.7339208722114563 + }, + { + "X": 0.4263175129890442, + "Y": 0.8340312242507935 + }, + { + "X": 0.0961984172463417, + "Y": 0.834302544593811 + } + ] + }, + "Id": "e153e383-bef8-498f-84ae-0933f7f7dfd5", + "Relationships": [ + { + "Ids": [ + "0ebd097c-5e76-46ea-8907-52f82f361619", + "38782fb5-1184-4c64-89c0-64f43e14f401" + ], + "Type": "CHILD" + } + ], + "Text": "we can" + }, + { + "BlockType": "WORD", + "Confidence": 99.88042449951172, + "Geometry": { + "BoundingBox": { + "Height": 0.14488966763019562, + "Left": 0.097494937479496, + "Top": 0.1815125048160553, + "Width": 0.48301205039024353 + }, + "Polygon": [ + { + "X": 0.097494937479496, + "Y": 0.18190857768058777 + }, + { + "X": 0.5804456472396851, + "Y": 0.1815125048160553 + }, + { + "X": 0.5805069804191589, + "Y": 0.3260059356689453 + }, + { + "X": 0.09756317734718323, + "Y": 0.3264021873474121 + } + ], + "RotationAngle": 0 + }, + "Id": "68bd9f9f-ea85-4fa1-b2c1-5046353175af", + "Text": "Ascenders", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.15104113519191742, + "Left": 0.6139811277389526, + "Top": 0.17868299782276154, + "Width": 0.14463846385478973 + }, + "Polygon": [ + { + "X": 0.6139811277389526, + "Y": 0.1788015514612198 + }, + { + "X": 0.7585582137107849, + "Y": 0.17868299782276154 + }, + { + "X": 0.7586195468902588, + "Y": 0.32960548996925354 + }, + { + "X": 0.6140446662902832, + "Y": 0.32972413301467896 + } + ], + "RotationAngle": 0 + }, + "Id": "d20c4d45-dbd2-4c21-a330-94da8871896e", + "Text": "On", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14040860533714294, + "Left": 0.09873093664646149, + "Top": 0.48037731647491455, + "Width": 0.26501893997192383 + }, + "Polygon": [ + { + "X": 0.09873093664646149, + "Y": 0.48059481382369995 + }, + { + "X": 0.36368733644485474, + "Y": 0.48037731647491455 + }, + { + "X": 0.3637498617172241, + "Y": 0.6205683350563049 + }, + { + "X": 0.09879712760448456, + "Y": 0.6207859516143799 + } + ], + "RotationAngle": 0 + }, + "Id": "10b52be6-b001-4929-baad-d955700ed904", + "Text": "query", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.14399538934230804, + "Left": 0.3915230929851532, + "Top": 0.47797122597694397, + "Width": 0.1777207851409912 + }, + "Polygon": [ + { + "X": 0.3915230929851532, + "Y": 0.47811707854270935 + }, + { + "X": 0.5691826343536377, + "Y": 0.47797122597694397 + }, + { + "X": 0.569243848323822, + "Y": 0.6218206882476807 + }, + { + "X": 0.3915868401527405, + "Y": 0.6219666004180908 + } + ], + "RotationAngle": 0 + }, + "Id": "b12a0cc3-6dbd-45ed-8a07-62bc8b658cc7", + "Text": "png", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.09894753247499466, + "Left": 0.09615165740251541, + "Top": 0.7352153062820435, + "Width": 0.13559171557426453 + }, + "Polygon": [ + { + "X": 0.09615165740251541, + "Y": 0.7353266477584839 + }, + { + "X": 0.23169800639152527, + "Y": 0.7352153062820435 + }, + { + "X": 0.23174336552619934, + "Y": 0.8340514302253723 + }, + { + "X": 0.09619835764169693, + "Y": 0.8341628313064575 + } + ], + "RotationAngle": 0 + }, + "Id": "0ebd097c-5e76-46ea-8907-52f82f361619", + "Text": "we", + "TextType": "PRINTED" + }, + { + "BlockType": "WORD", + "Confidence": 100, + "Geometry": { + "BoundingBox": { + "Height": 0.10024342685937881, + "Left": 0.26431119441986084, + "Top": 0.7339208722114563, + "Width": 0.16200630366802216 + }, + "Polygon": [ + { + "X": 0.26431119441986084, + "Y": 0.7340539693832397 + }, + { + "X": 0.42627349495887756, + "Y": 0.7339208722114563 + }, + { + "X": 0.4263175129890442, + "Y": 0.8340312242507935 + }, + { + "X": 0.2643568217754364, + "Y": 0.8341643214225769 + } + ], + "RotationAngle": 0 + }, + "Id": "38782fb5-1184-4c64-89c0-64f43e14f401", + "Text": "can", + "TextType": "PRINTED" + }, + { + "BlockType": "LAYOUT_TEXT", + "Confidence": 50, + "Geometry": { + "BoundingBox": { + "Height": 0.15146490931510925, + "Left": 0.09749366343021393, + "Top": 0.17868299782276154, + "Width": 0.661125898361206 + }, + "Polygon": [ + { + "X": 0.09749366343021393, + "Y": 0.1792251169681549 + }, + { + "X": 0.7585582137107849, + "Y": 0.17868299782276154 + }, + { + "X": 0.7586195468902588, + "Y": 0.32960548996925354 + }, + { + "X": 0.0975649505853653, + "Y": 0.3301478922367096 + } + ] + }, + "Id": "a262c636-1c75-4dbb-84ad-6b12b8dc203a", + "Relationships": [ + { + "Ids": [ + "11aebad3-6e4a-4b4f-a1ab-4472c590624b" + ], + "Type": "CHILD" + } + ] + }, + { + "BlockType": "LAYOUT_TEXT", + "Confidence": 64.16015625, + "Geometry": { + "BoundingBox": { + "Height": 0.14423586428165436, + "Left": 0.09872987866401672, + "Top": 0.47797122597694397, + "Width": 0.4705139994621277 + }, + "Polygon": [ + { + "X": 0.09872987866401672, + "Y": 0.4783574342727661 + }, + { + "X": 0.5691826343536377, + "Y": 0.47797122597694397 + }, + { + "X": 0.569243848323822, + "Y": 0.6218206882476807 + }, + { + "X": 0.09879779815673828, + "Y": 0.6222071051597595 + } + ] + }, + "Id": "e3b1bf3e-4410-4b78-a45e-f47527445f86", + "Relationships": [ + { + "Ids": [ + "4941b103-31e3-45b6-b88e-ea5ec0d940b3" + ], + "Type": "CHILD" + } + ] + }, + { + "BlockType": "LAYOUT_TEXT", + "Confidence": 57.958984375, + "Geometry": { + "BoundingBox": { + "Height": 0.10038164258003235, + "Left": 0.09615112096071243, + "Top": 0.7339208722114563, + "Width": 0.33016639947891235 + }, + "Polygon": [ + { + "X": 0.09615112096071243, + "Y": 0.7341921329498291 + }, + { + "X": 0.42627349495887756, + "Y": 0.7339208722114563 + }, + { + "X": 0.4263175129890442, + "Y": 0.8340312242507935 + }, + { + "X": 0.0961984172463417, + "Y": 0.834302544593811 + } + ] + }, + "Id": "741cbd67-06da-4525-b466-b0ddca3ddda8", + "Relationships": [ + { + "Ids": [ + "e153e383-bef8-498f-84ae-0933f7f7dfd5" + ], + "Type": "CHILD" + } + ] + } + ], + "DocumentMetadata": { + "Pages": 1 + } +} diff --git a/tests/assets/ascenders_descenders_test_GoogleVision.json b/tests/assets/ascenders_descenders_test_GoogleVision.json new file mode 100644 index 0000000..d580a41 --- /dev/null +++ b/tests/assets/ascenders_descenders_test_GoogleVision.json @@ -0,0 +1,1289 @@ +{ + "faceAnnotations": [], + "landmarkAnnotations": [], + "logoAnnotations": [], + "labelAnnotations": [], + "textAnnotations": [ + { + "locations": [], + "properties": [], + "mid": "", + "locale": "en", + "description": "Ascenders On\nquery png\nwe can", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 320 + }, + { + "x": 61, + "y": 320 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "Ascenders", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 376, + "y": 70 + }, + { + "x": 376, + "y": 123 + }, + { + "x": 61, + "y": 123 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "On", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 398, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 123 + }, + { + "x": 398, + "y": 123 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "query", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 61, + "y": 186 + }, + { + "x": 229, + "y": 186 + }, + { + "x": 229, + "y": 232 + }, + { + "x": 61, + "y": 232 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "png", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 254, + "y": 186 + }, + { + "x": 364, + "y": 186 + }, + { + "x": 364, + "y": 232 + }, + { + "x": 254, + "y": 232 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "we", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 63, + "y": 282 + }, + { + "x": 151, + "y": 280 + }, + { + "x": 152, + "y": 318 + }, + { + "x": 64, + "y": 320 + } + ], + "normalizedVertices": [] + } + }, + { + "locations": [], + "properties": [], + "mid": "", + "locale": "", + "description": "can", + "score": 0, + "confidence": 0, + "topicality": 0, + "boundingPoly": { + "vertices": [ + { + "x": 168, + "y": 279 + }, + { + "x": 275, + "y": 277 + }, + { + "x": 276, + "y": 316 + }, + { + "x": 169, + "y": 318 + } + ], + "normalizedVertices": [] + } + } + ], + "localizedObjectAnnotations": [], + "safeSearchAnnotation": null, + "imagePropertiesAnnotation": null, + "error": null, + "cropHintsAnnotation": null, + "fullTextAnnotation": { + "pages": [ + { + "blocks": [ + { + "paragraphs": [ + { + "words": [ + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 113, + "y": 70 + }, + { + "x": 113, + "y": 123 + }, + { + "x": 61, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "A", + "confidence": 0.9880886077880859 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 116, + "y": 70 + }, + { + "x": 145, + "y": 70 + }, + { + "x": 145, + "y": 123 + }, + { + "x": 116, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "s", + "confidence": 0.9907979369163513 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 147, + "y": 70 + }, + { + "x": 177, + "y": 70 + }, + { + "x": 177, + "y": 123 + }, + { + "x": 147, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "c", + "confidence": 0.990847647190094 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 180, + "y": 70 + }, + { + "x": 211, + "y": 70 + }, + { + "x": 211, + "y": 123 + }, + { + "x": 180, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "e", + "confidence": 0.9925042986869812 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 212, + "y": 70 + }, + { + "x": 250, + "y": 70 + }, + { + "x": 250, + "y": 123 + }, + { + "x": 212, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "n", + "confidence": 0.9937740564346313 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 251, + "y": 70 + }, + { + "x": 287, + "y": 70 + }, + { + "x": 287, + "y": 123 + }, + { + "x": 251, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "d", + "confidence": 0.9944121241569519 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 290, + "y": 70 + }, + { + "x": 321, + "y": 70 + }, + { + "x": 321, + "y": 123 + }, + { + "x": 290, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "e", + "confidence": 0.9942989349365234 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 321, + "y": 70 + }, + { + "x": 348, + "y": 70 + }, + { + "x": 348, + "y": 123 + }, + { + "x": 321, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "r", + "confidence": 0.9914131164550781 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "SPACE", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 350, + "y": 70 + }, + { + "x": 376, + "y": 70 + }, + { + "x": 376, + "y": 123 + }, + { + "x": 350, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "s", + "confidence": 0.985426664352417 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 376, + "y": 70 + }, + { + "x": 376, + "y": 123 + }, + { + "x": 61, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9912848472595215 + }, + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 398, + "y": 70 + }, + { + "x": 447, + "y": 70 + }, + { + "x": 447, + "y": 123 + }, + { + "x": 398, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "O", + "confidence": 0.9466485977172852 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "LINE_BREAK", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 449, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 123 + }, + { + "x": 449, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "text": "n", + "confidence": 0.8815884590148926 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 398, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 123 + }, + { + "x": 398, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9141185283660889 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 123 + }, + { + "x": 61, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9772545695304871 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 70 + }, + { + "x": 493, + "y": 70 + }, + { + "x": 493, + "y": 123 + }, + { + "x": 61, + "y": 123 + } + ], + "normalizedVertices": [] + }, + "blockType": "TEXT", + "confidence": 0.9772545695304871 + }, + { + "paragraphs": [ + { + "words": [ + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 186 + }, + { + "x": 93, + "y": 186 + }, + { + "x": 93, + "y": 232 + }, + { + "x": 61, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "q", + "confidence": 0.9628907442092896 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 101, + "y": 186 + }, + { + "x": 132, + "y": 186 + }, + { + "x": 132, + "y": 232 + }, + { + "x": 101, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "u", + "confidence": 0.9331845045089722 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 137, + "y": 186 + }, + { + "x": 165, + "y": 186 + }, + { + "x": 165, + "y": 232 + }, + { + "x": 137, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "e", + "confidence": 0.951951265335083 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 167, + "y": 186 + }, + { + "x": 193, + "y": 186 + }, + { + "x": 193, + "y": 232 + }, + { + "x": 167, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "r", + "confidence": 0.8825578689575195 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "SPACE", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 196, + "y": 186 + }, + { + "x": 229, + "y": 186 + }, + { + "x": 229, + "y": 232 + }, + { + "x": 196, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "y", + "confidence": 0.9319852590560913 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 186 + }, + { + "x": 229, + "y": 186 + }, + { + "x": 229, + "y": 232 + }, + { + "x": 61, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.932513952255249 + }, + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 254, + "y": 186 + }, + { + "x": 288, + "y": 186 + }, + { + "x": 288, + "y": 232 + }, + { + "x": 254, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "p", + "confidence": 0.9672189354896545 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 292, + "y": 186 + }, + { + "x": 323, + "y": 186 + }, + { + "x": 323, + "y": 232 + }, + { + "x": 292, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "n", + "confidence": 0.9569672346115112 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "LINE_BREAK", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 333, + "y": 186 + }, + { + "x": 364, + "y": 186 + }, + { + "x": 364, + "y": 232 + }, + { + "x": 333, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "text": "g", + "confidence": 0.9840278625488281 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 254, + "y": 186 + }, + { + "x": 364, + "y": 186 + }, + { + "x": 364, + "y": 232 + }, + { + "x": 254, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9694046974182129 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 186 + }, + { + "x": 364, + "y": 186 + }, + { + "x": 364, + "y": 232 + }, + { + "x": 61, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9463479518890381 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 61, + "y": 186 + }, + { + "x": 364, + "y": 186 + }, + { + "x": 364, + "y": 232 + }, + { + "x": 61, + "y": 232 + } + ], + "normalizedVertices": [] + }, + "blockType": "TEXT", + "confidence": 0.9463479518890381 + }, + { + "paragraphs": [ + { + "words": [ + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 63, + "y": 282 + }, + { + "x": 116, + "y": 281 + }, + { + "x": 117, + "y": 319 + }, + { + "x": 64, + "y": 320 + } + ], + "normalizedVertices": [] + }, + "text": "w", + "confidence": 0.9548997282981873 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "SPACE", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 114, + "y": 281 + }, + { + "x": 151, + "y": 280 + }, + { + "x": 152, + "y": 318 + }, + { + "x": 115, + "y": 319 + } + ], + "normalizedVertices": [] + }, + "text": "e", + "confidence": 0.9906131029129028 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 63, + "y": 282 + }, + { + "x": 151, + "y": 280 + }, + { + "x": 152, + "y": 318 + }, + { + "x": 64, + "y": 320 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9727563858032227 + }, + { + "symbols": [ + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 168, + "y": 280 + }, + { + "x": 200, + "y": 279 + }, + { + "x": 201, + "y": 317 + }, + { + "x": 169, + "y": 318 + } + ], + "normalizedVertices": [] + }, + "text": "c", + "confidence": 0.9940080046653748 + }, + { + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 202, + "y": 279 + }, + { + "x": 237, + "y": 278 + }, + { + "x": 238, + "y": 316 + }, + { + "x": 203, + "y": 317 + } + ], + "normalizedVertices": [] + }, + "text": "a", + "confidence": 0.994207501411438 + }, + { + "property": { + "detectedLanguages": [], + "detectedBreak": { + "type": "LINE_BREAK", + "isPrefix": false + } + }, + "boundingBox": { + "vertices": [ + { + "x": 235, + "y": 278 + }, + { + "x": 275, + "y": 277 + }, + { + "x": 276, + "y": 315 + }, + { + "x": 236, + "y": 316 + } + ], + "normalizedVertices": [] + }, + "text": "n", + "confidence": 0.9928907752037048 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "boundingBox": { + "vertices": [ + { + "x": 168, + "y": 279 + }, + { + "x": 275, + "y": 277 + }, + { + "x": 276, + "y": 316 + }, + { + "x": 169, + "y": 318 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9937021136283875 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 63, + "y": 281 + }, + { + "x": 275, + "y": 277 + }, + { + "x": 276, + "y": 316 + }, + { + "x": 64, + "y": 320 + } + ], + "normalizedVertices": [] + }, + "confidence": 0.9853238463401794 + } + ], + "property": null, + "boundingBox": { + "vertices": [ + { + "x": 63, + "y": 281 + }, + { + "x": 275, + "y": 277 + }, + { + "x": 276, + "y": 316 + }, + { + "x": 64, + "y": 320 + } + ], + "normalizedVertices": [] + }, + "blockType": "TEXT", + "confidence": 0.9853238463401794 + } + ], + "property": { + "detectedLanguages": [ + { + "languageCode": "en", + "confidence": 1 + } + ], + "detectedBreak": null + }, + "width": 648, + "height": 382, + "confidence": 0.969642162322998 + } + ], + "text": "Ascenders On\nquery png\nwe can" + }, + "webDetection": null, + "productSearchResults": null, + "context": null +} diff --git a/tests/assets/border_patrol_tables_Abbyy.xml b/tests/assets/border_patrol_tables_Abbyy.xml new file mode 100644 index 0000000..383a3e5 --- /dev/null +++ b/tests/assets/border_patrol_tables_Abbyy.xml @@ -0,0 +1,8544 @@ + + + + + + + + + +B +O +R +D + +P +A +T +R + + + + + + +U +n +i +t +e +d + +S +t +a +t +e +s + +B +o +r +d +e +r + +P +a +t +r +o +l + + +S +e +c +t +o +r + +P +r +o +f +i +l +e + +- + +F +i +s +c +a +l + +Y +e +a +r + +2 +0 +1 +9 + +( +o +c +t +. + +1 +s +t + +t +h +r +o +u +g +h + +s +e +p +t +. + +3 +0 +t +h +) + + + + + + + + +S +E +C +T +O +R + + + + + +A +g +e +n +t + +S +t +a +f +f +i +n +g +* + + + + + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + +O +t +h +e +r + +T +h +a +n + +M +e +x +i +c +a +n + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + +M +a +r +i +j +u +a +n +a + +( +p +o +u +n +d +s +) + + + + + +C +o +c +a +i +n +e + +( +p +o +u +n +d +s +) + + + + + +A +c +c +e +p +t +e +d + +P +r +o +s +e +c +u +t +i +o +n +s + + + + + +A +s +s +a +u +l +t +s + + + + + +R +e +s +c +u +e +s + + + + + +B +o +r +d +e +r + + +D +e +a +t +h +s + + + + + + +M +i +a +m +i + + + + + +1 +2 +7 + + + + + +1 +, +8 +9 +1 + + + + + +1 +, +3 +5 +8 + + + + + +5 +9 +4 + + + + + +6 +5 +0 + + + + + +2 +4 +1 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +N +e +w + +O +r +l +e +a +n +s + + + + + +6 +9 + + + + + +1 +, +1 +3 +2 + + + + + +6 +8 +9 + + + + + +9 + + + + + +2 +2 + + + + + +8 +9 + + + + + +1 + + + + + +0 + + + + + +0 + + + + + + +R +a +m +e +y + + + + + +5 +9 + + + + + +5 +6 +2 + + + + + +5 +6 +1 + + + + + +2 +5 +6 + + + + + +6 +, +1 +7 +1 + + + + + +1 +5 +2 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +5 +5 + + + + + +3 +, +5 +8 +5 + + + + + +2 +, +6 +0 +8 + + + + + +8 +5 +9 + + + + + +6 +, +8 +4 +3 + + + + + +4 +8 +2 + + + + + +1 + + + + + +0 + + + + + +0 + + + + + + +B +l +a +i +n +e + + + + + +2 +9 +2 + + + + + +5 +2 +4 + + + + + +3 +8 +0 + + + + + +0 + + + + + +0 + + + + + +2 +5 + + + + + +2 + + + + + +0 + + + + + +0 + + + + + + +B +u +f +f +a +l +o + + + + + +2 +7 +3 + + + + + +5 +3 +7 + + + + + +4 +2 +5 + + + + + +6 +1 +1 + + + + + +5 + + + + + +2 +8 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +D +e +t +r +o +i +t + + + + + +4 +0 +4 + + + + + +1 +, +3 +2 +2 + + + + + +4 +9 +1 + + + + + +6 + + + + + +6 + + + + + +1 +1 +2 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +G +r +a +n +d + +F +o +r +k +s + + + + + +1 +8 +5 + + + + + +4 +1 +2 + + + + + +1 +3 +6 + + + + + +0 + + + + + +0 + + + + + +5 +2 + + + + + +1 + + + + + +0 + + + + + +0 + + + + + + +H +a +v +r +e + + + + + +1 +7 +2 + + + + + +7 +7 + + + + + +3 +8 + + + + + +2 + + + + + +1 + + + + + +1 +2 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +H +o +u +l +t +o +n + + + + + +1 +9 +3 + + + + + +5 +2 + + + + + +2 +5 + + + + + +4 +3 + + + + + +0 + + + + + +1 +6 + + + + + +1 + + + + + +0 + + + + + +0 + + + + + + +S +p +o +k +a +n +e + + + + + +2 +5 +3 + + + + + +4 +2 +8 + + + + + +9 +2 + + + + + +6 + + + + + +2 + + + + + +5 +9 + + + + + +0 + + + + + +0 + + + + + +0 + + + + + + +S +w +a +n +t +o +n + + + + + +3 +0 +1 + + + + + +1 +, +0 +5 +6 + + + + + +7 +2 +0 + + + + + +2 +, +4 +5 +2 + + + + + +1 + + + + + +5 +3 +3 + + + + + +2 + + + + + +0 + + + + + +0 + + + + + + +N +o +r +t +h +e +r +n + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +, +0 +7 +3 + + + + + +4 +, +4 +0 +8 + + + + + +2 +, +3 +0 +7 + + + + + +3 +, +1 +2 +0 + + + + + +1 +5 + + + + + +8 +3 +7 + + + + + +6 + + + + + +0 + + + + + +0 + + + + + + +B +i +g + +B +e +n +d + +( +f +o +r +m +e +r +l +y + +m +a +r +f +a +) + + + + + +5 +3 +5 + + + + + +9 +, +6 +3 +7 + + + + + +6 +, +2 +0 +6 + + + + + +2 +6 +, +1 +7 +6 + + + + + +5 +3 + + + + + +4 +, +5 +3 +8 + + + + + +3 + + + + + +4 +5 + + + + + +3 + + + + + + +D +e +l + +R +i +o + + + + + +1 +, +4 +3 +6 + + + + + +5 +7 +, +2 +6 +9 + + + + + +4 +8 +, +0 +2 +8 + + + + + +4 +1 + + + + + +1 +5 + + + + + +1 +1 +, +8 +9 +2 + + + + + +4 +3 + + + + + +4 +7 +6 + + + + + +3 +8 + + + + + + +E +l + +C +e +n +t +r +o + + + + + +8 +6 +3 + + + + + +3 +5 +, +1 +3 +8 + + + + + +1 +5 +, +8 +6 +9 + + + + + +1 +9 +9 + + + + + +6 +2 + + + + + +3 +, +3 +7 +1 + + + + + +1 +5 +6 + + + + + +7 +8 + + + + + +1 +7 + + + + + + +E +l + +P +a +s +o + + + + + +2 +, +1 +6 +0 + + + + + +1 +8 +2 +, +1 +4 +3 + + + + + +1 +6 +6 +, +9 +6 +0 + + + + + +1 +1 +, +1 +4 +7 + + + + + +1 +3 +8 + + + + + +1 +5 +, +1 +5 +5 + + + + + +3 +6 + + + + + +4 +0 + + + + + +2 +0 + + + + + + +L +a +r +e +d +o + + + + + +1 +, +7 +8 +8 + + + + + +3 +8 +, +3 +7 +8 + + + + + +1 +2 +, +1 +6 +9 + + + + + +3 +6 +, +7 +3 +8 + + + + + +5 +2 +8 + + + + + +1 +5 +, +5 +4 +7 + + + + + +7 +2 + + + + + +2 +, +4 +5 +3 + + + + + +7 +8 + + + + + + +R +i +o + +G +r +a +n +d +e + +V +a +l +l +e +y + +( +f +o +r +m +e +r +l +y + +m +c +a +l +l +e +n +) + + + + + +3 +, +1 +0 +5 + + + + + +3 +3 +9 +, +1 +3 +5 + + + + + +3 +0 +9 +, +2 +9 +5 + + + + + +1 +2 +2 +, +7 +6 +9 + + + + + +2 +, +5 +9 +4 + + + + + +2 +6 +, +8 +0 +1 + + + + + +1 +2 +8 + + + + + +7 +9 +4 + + + + + +6 +9 + + + + + + +S +a +n + +D +i +e +g +o + + + + + +2 +, +2 +1 +4 + + + + + +5 +8 +, +0 +4 +9 + + + + + +2 +7 +, +2 +5 +3 + + + + + +3 +, +3 +0 +2 + + + + + +1 +, +2 +8 +5 + + + + + +8 +, +0 +6 +6 + + + + + +1 +5 +7 + + + + + +1 +9 + + + + + +7 + + + + + + +T +u +c +s +o +n + + + + + +3 +, +6 +9 +5 + + + + + +6 +3 +, +4 +9 +0 + + + + + +3 +4 +, +2 +9 +6 + + + + + +5 +9 +, +4 +8 +0 + + + + + +1 +5 +1 + + + + + +3 +1 +, +7 +6 +9 + + + + + +9 +6 + + + + + +9 +2 +4 + + + + + +6 +1 + + + + + + +Y +u +m +a + + + + + +8 +0 +9 + + + + + +6 +8 +, +2 +6 +9 + + + + + +6 +4 +, +9 +7 +4 + + + + + +3 +, +0 +5 +1 + + + + + +0 + + + + + +3 +, +8 +6 +7 + + + + + +2 +7 + + + + + +8 +2 + + + + + +7 + + + + + + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l +* +* + + + + + +1 +6 +, +7 +3 +1 + + + + + +8 +5 +1 +, +5 +0 +8 + + + + + +6 +8 +5 +, +0 +5 +0 + + + + + +2 +6 +2 +, +9 +0 +3 + + + + + +4 +, +8 +2 +6 + + + + + +1 +2 +1 +, +0 +0 +6 + + + + + +7 +1 +8 + + + + + +4 +, +9 +1 +1 + + + + + +3 +0 +0 + + + + + + +N +a +t +i +o +n +w +i +d +e + +T +o +t +a +l +* +* +* + + + + + +1 +9 +, +6 +4 +8 + + + + + +8 +5 +9 +, +5 +0 +1 + + + + + +6 +8 +9 +, +9 +6 +5 + + + + + +2 +6 +6 +, +8 +8 +2 + + + + + +1 +1 +, +6 +8 +4 + + + + + +1 +2 +2 +, +3 +2 +5 + + + + + +7 +2 +5 + + + + + +4 +, +9 +1 +1 + + + + + +3 +0 +0 + + + + + + +* + +A +g +e +n +t + +s +t +a +f +f +i +n +g + +s +t +a +t +i +s +t +i +c +s + +d +e +p +i +c +t + +F +Y +1 +9 + +o +n +- +b +o +a +r +d + +p +e +r +s +o +n +n +e +l + +d +a +t +a + +a +s + +o +f + +0 +9 +/ +3 +0 +/ +2 +0 +1 +9 + + + + + + +* +* + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +s +t +a +f +f +i +n +g + +s +t +a +t +i +s +t +i +c +s + +i +n +c +l +u +d +e +: + +B +i +g + +B +e +n +d +, + +D +e +l + +R +i +o +, + +E +l + +C +e +n +t +r +o +, + +E +l + +P +a +s +o +, + +L +a +r +e +d +o +, + +R +i +o + +G +r +a +n +d +e + +V +a +l +l +e +y +, + +S +a +n + +D +i +e +g +o +, + +T +u +c +s +o +n +, + +Y +u +m +a +, + +a +n +d + +t +h +e + +S +p +e +c +i +a +l + +O +p +e +r +a +t +i +o +n +s + +G +r +o +u +p +. + + + + + + +* +* +* + +N +a +t +i +o +n +w +i +d +e + +s +t +a +f +f +i +n +g + +s +t +a +t +i +s +t +i +c +s + +i +n +c +l +u +d +e +: + +a +l +l + +o +n +- +b +o +a +r +d + +b +o +r +d +e +r + +p +a +t +r +o +l + +a +g +e +n +t +s + +i +n + +c +b +p + + + + + + +* +* +* +* + +R +e +s +c +u +e + +a +n +d + +D +e +a +t +h + +s +t +a +t +i +s +t +i +c +s + +a +r +e + +n +o +t + +t +r +a +c +k +e +d + +f +o +r + +N +o +r +t +h +e +r +n + +a +n +d + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s +. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +B +O +R +D + + +- +P +A +T +R + + + + + + +U +n +i +t +e +d + +S +t +a +t +e +s + +B +o +r +d +e +r + +P +a +t +r +o +l + + +J +u +v +e +n +i +l +e + +( +0 +- +1 +7 + +Y +e +a +r +s + +O +l +d +) + +a +n +d + +A +d +u +l +t + +A +p +p +r +e +h +e +n +s +i +o +n +s + +- + +F +i +s +c +a +l + +Y +e +a +r + +2 +0 +1 +9 + +( +O +c +t +. + +1 +s +t + +t +h +r +o +u +g +h + +S +e +p +t +. + +3 +0 +t +h +) + + + + + + + + +S +E +C +T +O +R + + + + + +A +c +c +o +m +p +a +n +i +e +d + +J +u +v +e +n +i +l +e +s + + + + + +U +n +a +c +c +o +m +p +a +n +i +e +d + +J +u +v +e +n +i +l +e +s + + + + + +T +o +t +a +l + +J +u +v +e +n +i +l +e +s + + + + + +T +o +t +a +l + +A +d +u +l +t +s + + + + + +T +o +t +a +l + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + + +M +i +a +m +i + + + + + +1 +0 + + + + + +2 +3 + + + + + +3 +3 + + + + + +1 +, +8 +5 +8 + + + + + +1 +, +8 +9 +1 + + + + + + +N +e +w + +O +r +l +e +a +n +s + + + + + +1 +0 + + + + + +3 +5 + + + + + +4 +5 + + + + + +1 +, +0 +8 +7 + + + + + +1 +, +1 +3 +2 + + + + + + +R +a +m +e +y + + + + + +1 + + + + + +9 + + + + + +1 +0 + + + + + +5 +5 +2 + + + + + +5 +6 +2 + + + + + + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +1 + + + + + +6 +7 + + + + + +8 +8 + + + + + +3 +, +4 +9 +7 + + + + + +3 +, +5 +8 +5 + + + + + + +B +l +a +i +n +e + + + + + +4 +8 + + + + + +6 + + + + + +5 +4 + + + + + +4 +7 +0 + + + + + +5 +2 +4 + + + + + + +B +u +f +f +a +l +o + + + + + +1 +4 + + + + + +5 + + + + + +1 +9 + + + + + +5 +1 +8 + + + + + +5 +3 +7 + + + + + + +D +e +t +r +o +i +t + + + + + +5 + + + + + +1 +1 + + + + + +1 +6 + + + + + +1 +, +3 +0 +6 + + + + + +1 +, +3 +2 +2 + + + + + + +G +r +a +n +d + +F +o +r +k +s + + + + + +3 + + + + + +6 + + + + + +9 + + + + + +4 +0 +3 + + + + + +4 +1 +2 + + + + + + +H +a +v +r +e + + + + + +0 + + + + + +0 + + + + + +0 + + + + + +7 +7 + + + + + +7 +7 + + + + + + +H +o +u +l +t +o +n + + + + + +0 + + + + + +0 + + + + + +0 + + + + + +5 +2 + + + + + +5 +2 + + + + + + +S +p +o +k +a +n +e + + + + + +3 + + + + + +3 + + + + + +6 + + + + + +4 +2 +2 + + + + + +4 +2 +8 + + + + + + +S +w +a +n +t +o +n + + + + + +1 +4 +3 + + + + + +1 +8 + + + + + +1 +6 +1 + + + + + +8 +9 +5 + + + + + +1 +, +0 +5 +6 + + + + + + +N +o +r +t +h +e +r +n + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +1 +6 + + + + + +4 +9 + + + + + +2 +6 +5 + + + + + +4 +, +1 +4 +3 + + + + + +4 +, +4 +0 +8 + + + + + + +B +i +g + +B +e +n +d + +( +f +o +r +m +e +r +l +y + +M +a +r +f +a +) + + + + + +1 +, +5 +2 +9 + + + + + +7 +7 +9 + + + + + +2 +, +3 +0 +8 + + + + + +7 +3 +2 +9 + + + + + +9 +, +6 +3 +7 + + + + + + +D +e +l + +R +i +o + + + + + +1 +7 +, +0 +8 +4 + + + + + +3 +, +6 +2 +1 + + + + + +2 +0 +, +7 +0 +5 + + + + + +3 +6 +5 +6 +4 + + + + + +5 +7 +, +2 +6 +9 + + + + + + +E +l + +C +e +n +t +r +o + + + + + +4 +, +1 +2 +8 + + + + + +2 +, +6 +8 +8 + + + + + +6 +, +8 +1 +6 + + + + + +2 +8 +3 +2 +2 + + + + + +3 +5 +, +1 +3 +8 + + + + + + +E +l + +P +a +s +o + + + + + +6 +7 +, +9 +2 +2 + + + + + +1 +6 +, +1 +5 +9 + + + + + +8 +4 +, +0 +8 +1 + + + + + +9 +8 +0 +6 +2 + + + + + +1 +8 +2 +, +1 +4 +3 + + + + + + +L +a +r +e +d +o + + + + + +6 +0 +2 + + + + + +2 +, +5 +2 +1 + + + + + +3 +, +1 +2 +3 + + + + + +3 +5 +2 +5 +5 + + + + + +3 +8 +, +3 +7 +8 + + + + + + +R +i +o + +G +r +a +n +d +e + +V +a +l +l +e +y + +( +f +o +r +m +e +r +l +y + +M +c +A +l +l +e +n +) + + + + + +1 +0 +9 +, +8 +9 +5 + + + + + +3 +4 +, +5 +2 +3 + + + + + +1 +4 +4 +, +4 +1 +8 + + + + + +1 +9 +4 +7 +1 +7 + + + + + +3 +3 +9 +, +1 +3 +5 + + + + + + +S +a +n + +D +i +e +g +o + + + + + +8 +, +5 +8 +4 + + + + + +3 +, +3 +3 +5 + + + + + +1 +1 +, +9 +1 +9 + + + + + +4 +6 +1 +3 +0 + + + + + +5 +8 +, +0 +4 +9 + + + + + + +T +u +c +s +o +n + + + + + +8 +, +4 +1 +3 + + + + + +5 +, +1 +0 +5 + + + + + +1 +3 +, +5 +1 +8 + + + + + +4 +9 +9 +7 +2 + + + + + +6 +3 +, +4 +9 +0 + + + + + + +Y +u +m +a + + + + + +2 +7 +, +1 +7 +8 + + + + + +7 +, +2 +8 +9 + + + + + +3 +4 +, +4 +6 +7 + + + + + +3 +3 +8 +0 +2 + + + + + +6 +8 +, +2 +6 +9 + + + + + + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +4 +5 +, +3 +3 +5 + + + + + +7 +6 +, +0 +2 +0 + + + + + +3 +2 +1 +, +3 +5 +5 + + + + + +5 +3 +0 +, +1 +5 +3 + + + + + +8 +5 +1 +, +5 +0 +8 + + + + + + +N +a +t +i +o +n +w +i +d +e + +T +o +t +a +l + + + + + +2 +4 +5 +, +5 +7 +2 + + + + + +7 +6 +, +1 +3 +6 + + + + + +3 +2 +1 +, +7 +0 +8 + + + + + +5 +3 +7 +, +7 +9 +3 + + + + + +8 +5 +9 +, +5 +0 +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +B +O +R +D + +P +A +T +R + + + + + + +U +n +i +t +e +d + +S +t +a +t +e +s + +B +o +r +d +e +r + +P +a +t +r +o +l + + +A +p +p +r +e +h +e +n +s +i +o +n +s + +b +y + +G +e +n +d +e +r + +- + +F +i +s +c +a +l + +Y +e +a +r + +2 +0 +1 +9 + +( +O +c +t +. + +1 +s +t + +t +h +r +o +u +g +h + +S +e +p +t +. + +3 +0 +t +h +) + + + + + + + + +S +E +C +T +O +R + + + + + +F +e +m +a +l +e + + + + + +M +a +l +e + + + + + +T +o +t +a +l + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + + +M +i +a +m +i + + + + + +2 +2 +0 + + + + + +1 +, +6 +7 +1 + + + + + +1 +, +8 +9 +1 + + + + + + +N +e +w + +O +r +l +e +a +n +s + + + + + +7 +3 + + + + + +1 +, +0 +5 +9 + + + + + +1 +, +1 +3 +2 + + + + + + +R +a +m +e +y + + + + + +7 +6 + + + + + +4 +8 +6 + + + + + +5 +6 +2 + + + + + + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +3 +6 +9 + + + + + +3 +, +2 +1 +6 + + + + + +3 +, +5 +8 +5 + + + + + + +B +l +a +i +n +e + + + + + +1 +6 +2 + + + + + +3 +6 +2 + + + + + +5 +2 +4 + + + + + + +B +u +f +f +a +l +o + + + + + +1 +6 +4 + + + + + +3 +7 +3 + + + + + +5 +3 +7 + + + + + + +D +e +t +r +o +i +t + + + + + +1 +2 +9 + + + + + +1 +, +1 +9 +3 + + + + + +1 +, +3 +2 +2 + + + + + + +G +r +a +n +d + +F +o +r +k +s + + + + + +3 +9 + + + + + +3 +7 +3 + + + + + +4 +1 +2 + + + + + + +H +a +v +r +e + + + + + +1 +4 + + + + + +6 +3 + + + + + +7 +7 + + + + + + +H +o +u +l +t +o +n + + + + + +5 + + + + + +4 +7 + + + + + +5 +2 + + + + + + +S +p +o +k +a +n +e + + + + + +4 +7 + + + + + +3 +8 +1 + + + + + +4 +2 +8 + + + + + + +S +w +a +n +t +o +n + + + + + +3 +0 +7 + + + + + +7 +4 +9 + + + + + +1 +, +0 +5 +6 + + + + + + +N +o +r +t +h +e +r +n + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +8 +6 +7 + + + + + +3 +, +5 +4 +1 + + + + + +4 +, +4 +0 +8 + + + + + + +B +i +g + +B +e +n +d + +( +f +o +r +m +e +r +l +y + +M +a +r +f +a +) + + + + + +2 +, +1 +7 +1 + + + + + +7 +, +4 +6 +6 + + + + + +9 +, +6 +3 +7 + + + + + + +D +e +l + +R +i +o + + + + + +2 +1 +, +3 +9 +9 + + + + + +3 +5 +, +8 +7 +0 + + + + + +5 +7 +, +2 +6 +9 + + + + + + +E +l + +C +e +n +t +r +o + + + + + +6 +, +9 +1 +5 + + + + + +2 +8 +, +2 +2 +3 + + + + + +3 +5 +, +1 +3 +8 + + + + + + +E +l + +P +a +s +o + + + + + +7 +3 +, +1 +9 +2 + + + + + +1 +0 +8 +, +9 +5 +1 + + + + + +1 +8 +2 +, +1 +4 +3 + + + + + + +L +a +r +e +d +o + + + + + +4 +, +9 +0 +7 + + + + + +3 +3 +, +4 +7 +1 + + + + + +3 +8 +, +3 +7 +8 + + + + + + +R +i +o + +G +r +a +n +d +e + +V +a +l +l +e +y + +( +f +o +r +m +e +r +l +y + +M +c +A +l +l +e +n +) + + + + + +1 +3 +7 +, +4 +6 +4 + + + + + +2 +0 +1 +, +6 +7 +1 + + + + + +3 +3 +9 +, +1 +3 +5 + + + + + + +S +a +n + +D +i +e +g +o + + + + + +1 +2 +, +6 +6 +1 + + + + + +4 +5 +, +3 +8 +8 + + + + + +5 +8 +, +0 +4 +9 + + + + + + +T +u +c +s +o +n + + + + + +1 +2 +, +5 +0 +9 + + + + + +5 +0 +, +9 +8 +1 + + + + + +6 +3 +, +4 +9 +0 + + + + + + +Y +u +m +a + + + + + +2 +7 +, +2 +7 +1 + + + + + +4 +0 +, +9 +9 +8 + + + + + +6 +8 +, +2 +6 +9 + + + + + + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +T +o +t +a +l + + + + + +2 +9 +8 +, +4 +8 +9 + + + + + +5 +5 +3 +, +0 +1 +9 + + + + + +8 +5 +1 +, +5 +0 +8 + + + + + + +N +a +t +i +o +n +w +i +d +e + +T +o +t +a +l + + + + + +2 +9 +9 +, +7 +2 +5 + + + + + +5 +5 +9 +, +7 +7 +6 + + + + + +8 +5 +9 +, +5 +0 +1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +B +O +R +D + +P +A +T +R + + + + + + +U +n +i +t +e +d + +S +t +a +t +e +s + +B +o +r +d +e +r + +P +a +t +r +o +l + + +A +p +p +r +e +h +e +n +s +i +o +n +s + +/ + +S +e +i +z +u +r +e + +S +t +a +t +i +s +t +i +c +s + +- + +F +i +s +c +a +l + +Y +e +a +r + +2 +0 +1 +9 + +( +o +c +t +. + +1 +s +t + +t +h +r +o +u +g +h + +s +e +p +t +. + +3 +0 +t +h +) + + + + + + + + +A +p +p +r +e +h +e +n +s +i +o +n +/ +S +e +i +z +u +r +e + +T +y +p +e + + + + + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s + + + + + +N +o +r +t +h +e +r +n + +B +o +r +d +e +r + +S +e +c +t +o +r +s + + + + + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + + + + + +N +a +t +i +o +n +w +i +d +e + +T +o +t +a +l + + + + + + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + +3 +, +5 +8 +5 + + + + + +4 +, +4 +0 +8 + + + + + +8 +5 +1 +, +5 +0 +8 + + + + + +8 +5 +9 +, +5 +0 +1 + + + + + + +O +t +h +e +r + +T +h +a +n + +M +e +x +i +c +a +n + +A +p +p +r +e +h +e +n +s +i +o +n +s + + + + + +2 +, +6 +0 +8 + + + + + +2 +, +3 +0 +7 + + + + + +6 +8 +5 +, +0 +5 +0 + + + + + +6 +8 +9 +, +9 +6 +5 + + + + + + + + + + + + + + + + + + + + + + + + + + + +M +a +r +i +j +u +a +n +a + +( +p +o +u +n +d +s +) + + + + + +8 +5 +9 + + + + + +3 +, +1 +2 +0 + + + + + +2 +6 +2 +, +9 +0 +3 + + + + + +2 +6 +6 +, +8 +8 +2 + + + + + + +C +o +c +a +i +n +e + +( +p +o +u +n +d +s +) + + + + + +6 +, +8 +4 +3 + + + + + +1 +3 + + + + + +4 +, +8 +2 +6 + + + + + +1 +1 +, +6 +8 +2 + + + + + + +H +e +r +o +i +n + +( +o +u +n +c +e +s +) + + + + + +2 +9 +2 + + + + + +2 +9 +9 + + + + + +1 +2 +, +3 +3 +6 + + + + + +1 +2 +, +9 +2 +7 + + + + + + +M +e +t +h +a +m +p +h +e +t +a +m +i +n +e + +( +p +o +u +n +d +s +) + + + + + +8 +9 +8 + + + + + +4 +0 + + + + + +6 + + + + + +9 +4 +4 + + + + + + +E +c +s +t +a +s +y + +( +p +o +u +n +d +s +) + + + + + +0 + + + + + +0 + + + + + +3 + + + + + +3 + + + + + + +O +t +h +e +r + +D +r +u +g +s +* + +( +p +o +u +n +d +s +) + + + + + +1 + + + + + +1 +5 +8 + + + + + +1 +, +3 +4 +5 + + + + + +1 +, +5 +0 +4 + + + + + + +M +a +r +i +j +u +a +n +a + +S +e +i +z +u +r +e +s + + + + + +1 +0 +7 + + + + + +4 +2 +7 + + + + + +5 +, +1 +7 +6 + + + + + +5 +, +7 +1 +0 + + + + + + +C +o +c +a +i +n +e + +S +e +i +z +u +r +e +s + + + + + +7 +2 + + + + + +3 +4 + + + + + +3 +7 +4 + + + + + +4 +8 +0 + + + + + + +H +e +r +o +i +n + +S +e +i +z +u +r +e +s + + + + + +7 + + + + + +2 +3 + + + + + +2 +1 +6 + + + + + +2 +4 +6 + + + + + + +M +e +t +h +a +m +p +h +e +t +a +m +i +n +e + +S +e +i +z +u +r +e +s + + + + + +6 + + + + + +4 +0 +0 + + + + + +8 +9 +8 + + + + + +1 +, +3 +0 +4 + + + + + + +E +c +s +t +a +s +y + +S +e +i +z +u +r +e +s + + + + + +3 + + + + + +2 + + + + + +4 +6 + + + + + +5 +1 + + + + + + +O +t +h +e +r + +D +r +u +g +s +* + +S +e +i +z +u +r +e +s + + + + + +3 +1 + + + + + +8 +6 + + + + + +6 +0 +3 + + + + + +7 +2 +0 + + + + + + +C +o +n +v +e +y +a +n +c +e +s + + + + + +8 +6 + + + + + +1 +1 +2 + + + + + +9 +, +0 +1 +7 + + + + + +9 +, +2 +1 +5 + + + + + + +F +i +r +e +a +r +m +s + + + + + +1 +4 + + + + + +2 +5 + + + + + +2 +9 +9 + + + + + +3 +3 +8 + + + + + + +A +m +m +u +n +i +t +i +o +n + +( +r +o +u +n +d +s +) + + + + + +4 +3 +1 + + + + + +4 +0 +8 + + + + + +8 +, +8 +8 +2 + + + + + +9 +, +7 +2 +1 + + + + + + +C +u +r +r +e +n +c +y + +( +v +a +l +u +e +) + + + + + +$ +5 +7 +0 +, +1 +4 +5 + + + + + +$ +3 +7 +3 +, +1 +5 +2 + + + + + +$ +5 +, +8 +6 +2 +, +3 +9 +9 + + + + + +$ +6 +, +8 +0 +5 +, +6 +9 +6 + + + + + + +* +O +t +h +e +r + +D +r +u +g +s + +i +n +c +l +u +d +e +: + +A +l +l + +U +S +B +P + +d +r +u +g + +s +e +i +z +u +r +e +s + +e +x +c +l +u +d +i +n +g + +m +a +r +i +j +u +a +n +a +, + +c +o +c +a +i +n +e +, + +h +e +r +o +i +n +, + +m +e +t +h +a +m +p +h +e +t +a +m +i +n +e +, + +a +n +d + +e +c +s +t +a +s +y + +( +M +D +M +A +) +. + + +C +o +a +s +t +a +l + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +i +n +c +l +u +d +e +: + +M +i +a +m +i +, + +N +e +w + +O +r +l +e +a +n +s +, + +a +n +d + +R +a +m +e +y +, + +P +u +e +r +t +o + +R +i +c +o +. + + +N +o +r +t +h +e +r +n + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +i +n +c +l +u +d +e +: + +B +l +a +i +n +e +, + +B +u +f +f +a +l +o +, + +D +e +t +r +o +i +t +, + +G +r +a +n +d + +F +o +r +k +s +, + +H +a +v +r +e +, + +H +o +u +l +t +o +n +, + +S +p +o +k +a +n +e + +a +n +d + +S +w +a +n +t +o +n +. + + +S +o +u +t +h +w +e +s +t + +B +o +r +d +e +r + +S +e +c +t +o +r +s + +i +n +c +l +u +d +e +: + +B +i +g + +B +e +n +d +, + +D +e +l + +R +i +o +, + +E +l + +C +e +n +t +r +o +, + +E +l + +P +a +s +o +, + +L +a +r +e +d +o +, + +R +i +o + +G +r +a +n +d +e + +V +a +l +l +e +y +, + +S +a +n + +D +i +e +g +o +, + +T +u +c +s +o +n +, + +a +n +d + +Y +u +m +a +. + +D +r +u +g + +q +u +a +n +t +i +t +i +e +s + +a +r +e + +r +o +u +n +d +e +d + +t +o + +t +h +e + +n +e +a +r +e +s +t + +w +h +o +l +e + +n +u +m +b +e +r + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/module/exportHocr.spec.js b/tests/module/exportHocr.spec.js index db29085..877de32 100644 --- a/tests/module/exportHocr.spec.js +++ b/tests/module/exportHocr.spec.js @@ -52,7 +52,7 @@ describe('Check .hocr export function.', function () { const ocrAllComp1 = standardizeOCRPages(scribe.data.ocr.active); - const hocrOutStrArr = splitHOCRStr(writeHocr(scribe.data.ocr.active)); + const hocrOutStrArr = splitHOCRStr(writeHocr({ ocrData: scribe.data.ocr.active })); const resArrPromises = hocrOutStrArr.map((x, i) => (gs.schedulerInner.addJob('convertPageHocr', { ocrStr: x, n: i, scribeMode: true }))); const resArr = await Promise.all(resArrPromises); @@ -70,7 +70,7 @@ describe('Check .hocr export function.', function () { const layoutTables1 = structuredClone(scribe.data.layoutDataTables.pages); - const hocrOutStr = writeHocr(scribe.data.ocr.active); + const hocrOutStr = writeHocr({ ocrData: scribe.data.ocr.active }); const encoder = new TextEncoder(); const encoded = encoder.encode(hocrOutStr); diff --git a/tests/module/importAbbyy.spec.js b/tests/module/importAbbyy.spec.js index 30fe3f2..7005a4c 100644 --- a/tests/module/importAbbyy.spec.js +++ b/tests/module/importAbbyy.spec.js @@ -13,7 +13,31 @@ config.truncateThreshold = 0; // Disable truncation for actual/expected values o describe('Check Abbyy XML import function.', function () { this.timeout(10000); - before(async () => { + + it('Should import Abbyy XML with PNG image', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test.png`, + `${ASSETS_PATH_KARMA}/ascenders_descenders_test_Abbyy.xml`]); + }); + + it('Should correctly import text content from Abbyy XML (default settings)', async () => { + const text1 = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text2 = scribe.data.ocr.active[0].lines[1].words.map((x) => x.text).join(' '); + const text3 = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'Ascenders On'); + assert.strictEqual(text2, 'query png'); + assert.strictEqual(text3, 'we can'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + +describe('Check Abbyy XML import function.', function () { + this.timeout(10000); + + it('Should import Abbyy XML without image/PDF inputs', async () => { await scribe.importFiles([`${ASSETS_PATH_KARMA}/econometrica_example_abbyy.xml`]); }); @@ -138,3 +162,29 @@ describe('Check that font style is detected for Abbyy xml imports.', function () await scribe.terminate(); }); }).timeout(120000); + +describe('Check Abbyy XML table import.', function () { + this.timeout(20000); + + it('Should import Abbyy XML with PDF document', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`, + `${ASSETS_PATH_KARMA}/border_patrol_tables_Abbyy.xml`]); + + assert.isTrue(scribe.data.ocr.active[0].lines.length > 0); + }).timeout(20000); + + it('Should correctly import table structures from Abbyy XML', async () => { + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables.length === 1); + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables[0].boxes.length === 10); + }).timeout(10000); + + it('Should correctly import table structures from Abbyy XML', async () => { + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables.length === 1); + + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables[0].boxes.length === 10); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); diff --git a/tests/module/importAwsTextract.spec.js b/tests/module/importAwsTextract.spec.js new file mode 100644 index 0000000..427fa0e --- /dev/null +++ b/tests/module/importAwsTextract.spec.js @@ -0,0 +1,83 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +// import mocha from '../../node_modules/mocha/mocha.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check AWS Textract JSON import function.', function () { + this.timeout(10000); + + it('Should import AWS Textract with PNG image', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test.png`, + `${ASSETS_PATH_KARMA}/ascenders_descenders_test_AwsTextract.json`]); + }); + + it('Should correctly import text content from AWS Textract (default settings)', async () => { + const text1 = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text2 = scribe.data.ocr.active[0].lines[1].words.map((x) => x.text).join(' '); + const text3 = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'Ascenders On'); + assert.strictEqual(text2, 'query png'); + assert.strictEqual(text3, 'we can'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + +describe('Check AWS Textract JSON import function (layout analysis enabled).', function () { + this.timeout(10000); + before(async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test.png`, + `${ASSETS_PATH_KARMA}/ascenders_descenders_test_AwsTextractLayout.json`]); + }); + + it('Should correctly import text content from AWS Textract (default settings)', async () => { + const text1 = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text2 = scribe.data.ocr.active[0].lines[1].words.map((x) => x.text).join(' '); + const text3 = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'Ascenders On'); + assert.strictEqual(text2, 'query png'); + assert.strictEqual(text3, 'we can'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); + +describe('Check AWS Textract table import.', function () { + this.timeout(10000); + + it('Should import AWS Textract with PDF document', async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/border_patrol_tables.pdf`, + `${ASSETS_PATH_KARMA}/border_patrol_tables_analyzeDocResponse.json`]); + + assert.isTrue(scribe.data.ocr.active[0].lines.length > 0); + }).timeout(10000); + + it('Should correctly import table structures from AWS Textract', async () => { + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables.length === 1); + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables[0].boxes.length === 10); + }).timeout(10000); + + it('Should correctly import table structures from AWS Textract', async () => { + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables.length === 1); + + assert.isTrue(scribe.data.layoutDataTables.pages[0].tables[0].boxes.length === 10); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000); diff --git a/tests/module/importGoogleVision.spec.js b/tests/module/importGoogleVision.spec.js new file mode 100644 index 0000000..bf3b75a --- /dev/null +++ b/tests/module/importGoogleVision.spec.js @@ -0,0 +1,33 @@ +// Relative imports are required to run in browser. +/* eslint-disable import/no-relative-packages */ +import { assert, config } from '../../node_modules/chai/chai.js'; +// import mocha from '../../node_modules/mocha/mocha.js'; +import scribe from '../../scribe.js'; +import { ASSETS_PATH_KARMA } from '../constants.js'; + +config.truncateThreshold = 0; // Disable truncation for actual/expected values on assertion failure. + +// Using arrow functions breaks references to `this`. +/* eslint-disable prefer-arrow-callback */ +/* eslint-disable func-names */ + +describe('Check Google Vision JSON import function.', function () { + this.timeout(10000); + before(async () => { + await scribe.importFiles([`${ASSETS_PATH_KARMA}/ascenders_descenders_test_GoogleVision.json`]); + }); + + it('Should correctly import text content from AWS Textract (default settings)', async () => { + const text1 = scribe.data.ocr.active[0].lines[0].words.map((x) => x.text).join(' '); + const text2 = scribe.data.ocr.active[0].lines[1].words.map((x) => x.text).join(' '); + const text3 = scribe.data.ocr.active[0].lines[2].words.map((x) => x.text).join(' '); + + assert.strictEqual(text1, 'Ascenders On'); + assert.strictEqual(text2, 'query png'); + assert.strictEqual(text3, 'we can'); + }).timeout(10000); + + after(async () => { + await scribe.terminate(); + }); +}).timeout(120000);