Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cli/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ async function main(func, params) {
const debugDir = `${outputDir}/${outputStem}_debug`;
fs.mkdirSync(debugDir, { recursive: true });
const outputPathCsv = `${debugDir}/_debug.csv`;
scribe.utils.writeDebugCsv(scribe.data.ocr.active, outputPathCsv);
scribe.utils.writeDebugCsv({ pages: scribe.data.ocr.active, fileName: outputPathCsv });

scribe.utils.dumpDebugImages(debugDir);
scribe.utils.dumpHOCR(debugDir);
Expand Down
4 changes: 2 additions & 2 deletions js/clear.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import {
layoutRegions,
ocrAll,
ocrAllRaw,
pageMetricsArr,
pageMetricsAll,
} from './containers/dataContainer.js';
import { FontCont } from './containers/fontContainer.js';
import { ImageCache } from './containers/imageContainer.js';
Expand All @@ -19,7 +19,7 @@ export function clearData() {
ocrAllRaw.active = [];
layoutRegions.pages.length = 0;
layoutDataTables.pages.length = 0;
pageMetricsArr.length = 0;
pageMetricsAll.length = 0;
convertPageWarn.length = 0;
ImageCache.clear();
FontCont.clear();
Expand Down
2 changes: 1 addition & 1 deletion js/containers/dataContainer.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ export const ocrAll = { active: [] };
export const ocrAllRaw = { active: [] };

/** @type {Array<PageMetrics>} */
export const pageMetricsArr = [];
export const pageMetricsAll = [];

/**
* Class that stores various debug data.
Expand Down
44 changes: 9 additions & 35 deletions js/containers/imageContainer.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ import {
import { initMuPDFWorker } from '../../mupdf/mupdf-async.js';

import { updateFontContWorkerMain } from '../fontContainerMain.js';
import { pageMetricsArr } from './dataContainer.js';
import { pageMetricsAll } from './dataContainer.js';
import {
FontCont,
FontContainerFont,
loadOpentype,
} from './fontContainer.js';

import { gs } from '../generalWorkerMain.js';
import { imageUtils } from '../objects/imageObjects.js';
import { imageUtils, ImageWrapper } from '../objects/imageObjects.js';
import { range } from '../utils/miscUtils.js';
import { opt } from './app.js';

Expand Down Expand Up @@ -42,32 +42,6 @@ export class MuPDFScheduler {
}
}

export class ImageWrapper {
/**
* @param {number} n - Page number
* @param {string} imageStr - Base-64 encoded image string. Should start with "data:image/png" or "data:image/jpeg".
* @param {string} colorMode - Color mode ("color", "gray", or "binary").
* @param {boolean} rotated - Whether image has been rotated.
* @param {boolean} upscaled - Whether image has been upscaled.
*
* All properties of this object must be serializable, as ImageWrapper objects are sent between threads.
* This means that no promises can be used.
*/
constructor(n, imageStr, colorMode, rotated = false, upscaled = false) {
this.n = n;
this.src = imageStr;
const format0 = imageStr.match(/^data:image\/(png|jpeg)/)?.[1];
if (!format0 || !['png', 'jpeg'].includes(format0)) throw new Error(`Invalid image format: ${format0}`);
this.format = format0;
this._dims = null;
this.rotated = rotated;
this.upscaled = upscaled;
this.colorMode = colorMode;
/** @type {?ImageBitmap} */
this.imageBitmap = null;
}
}

/**
* @typedef {Object} ImageProperties
* @property {boolean} [rotated]
Expand Down Expand Up @@ -126,7 +100,7 @@ export class ImageCache {
colorMode = color ? 'color' : 'gray';
}

let pageAngle = pageMetricsArr[n].angle || 0;
let pageAngle = pageMetricsAll[n].angle || 0;
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;

// If no preference is specified for rotation, default to true.
Expand Down Expand Up @@ -213,7 +187,7 @@ export class ImageCache {
if (ImageCache.inputModes.image) {
return ImageCache.nativeSrc[n];
} if (ImageCache.inputModes.pdf) {
const pageMetrics = pageMetricsArr[n];
const pageMetrics = pageMetricsAll[n];
const targetWidth = pageMetrics.dims.width;
const dpi = 300 * (targetWidth / ImageCache.pdfDims300[n].width);
const muPDFScheduler = await ImageCache.getMuPDFScheduler();
Expand All @@ -232,7 +206,7 @@ export class ImageCache {
* @param {boolean} [saveNativeImage=true] - Whether the native image should be saved.
*/
static transformImage = async (inputImage, n, props, saveNativeImage = true) => {
let pageAngle = pageMetricsArr[n].angle || 0;
let pageAngle = pageMetricsAll[n].angle || 0;
if (Math.abs(pageAngle) < 0.05) pageAngle = 0;

// If no preference is specified for rotation, default to true.
Expand All @@ -245,8 +219,8 @@ export class ImageCache {
await gs.getGeneralScheduler();

const resPromise = (async () => {
// Wait for non-rotated version before replacing with promise
if (typeof process === 'undefined') await gs.initTesseract({ anyOk: true });
// Wait for non-rotated version before replacing with promise
await gs.initTesseract({ anyOk: true });
return gs.recognize({
image: inputImage.src,
options: { rotateRadians: angleArg, upscale: upscaleArg },
Expand Down Expand Up @@ -280,7 +254,7 @@ export class ImageCache {
return { native: undefined, binary: undefined };
}

const significantRotation = Math.abs(pageMetricsArr[n].angle || 0) > 0.05;
const significantRotation = Math.abs(pageMetricsAll[n].angle || 0) > 0.05;

const newNative = !ImageCache.native[n] || !imageUtils.compatible(ImageCache.nativeProps[n], props, significantRotation);
const newBinary = !nativeOnly && (!ImageCache.binary[n] || !imageUtils.compatible(ImageCache.binaryProps[n], props, significantRotation));
Expand Down Expand Up @@ -426,7 +400,7 @@ export class ImageCache {

ImageCache.pdfDims300.forEach((x, i) => {
const pageDims = { width: Math.round(x.width * pageDPI[i] / 300), height: Math.round(x.height * pageDPI[i] / 300) };
pageMetricsArr[i] = new PageMetrics(pageDims);
pageMetricsAll[i] = new PageMetrics(pageDims);
});

// WIP: Extract fonts embedded in PDFs.
Expand Down
6 changes: 3 additions & 3 deletions js/coordinates.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// Image Coordinate Space: coordinate space of a particular image
// Canvas Coordinate Space: coordinate space of canvas, used for user interactions

import { pageMetricsArr } from './containers/dataContainer.js';
import { pageMetricsAll } from './containers/dataContainer.js';
import { ImageCache } from './containers/imageContainer.js';

/**
Expand All @@ -27,7 +27,7 @@ function rotateBoundingBox(boundingBox, rotateAngle, n) {
let angleAdjXRect = 0;
let angleAdjYRect = 0;

const pageDims = pageMetricsArr[n].dims;
const pageDims = pageMetricsAll[n].dims;

const sinAngle = Math.sin(rotateAngle * (Math.PI / 180));
const cosAngle = Math.cos(rotateAngle * (Math.PI / 180));
Expand Down Expand Up @@ -103,7 +103,7 @@ async function ocrToImage(ocrCoords, n, binary = false) {

if (imageN.rotated) {
// Otherwise, we must also account for rotation applied by the canvas
const rotateAngle = (pageMetricsArr[n].angle || 0) * -1;
const rotateAngle = (pageMetricsAll[n].angle || 0) * -1;

rotateBoundingBox(ocrCoords, rotateAngle, n);
}
Expand Down
4 changes: 2 additions & 2 deletions js/debug.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { opt } from './containers/app.js';
import { pageMetricsArr } from './containers/dataContainer.js';
import { pageMetricsAll } from './containers/dataContainer.js';
import { ImageCache } from './containers/imageContainer.js';
import { gs } from './generalWorkerMain.js';
import { loadImageElem } from './utils/imageUtils.js';
Expand Down Expand Up @@ -125,7 +125,7 @@ export async function renderPageStatic(page) {
const res = gs.renderPageStaticImp({
page,
image,
angle: pageMetricsArr[page.n].angle,
angle: pageMetricsAll[page.n].angle,
});

return res;
Expand Down
70 changes: 56 additions & 14 deletions js/export/export.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import { inputData, opt } from '../containers/app.js';
import {
layoutDataTables, layoutRegions, ocrAll, pageMetricsArr,
layoutDataTables, layoutRegions, ocrAll, pageMetricsAll,
} from '../containers/dataContainer.js';
import { ImageCache } from '../containers/imageContainer.js';
import { reorderOcrPage } from '../modifyOCR.js';
import { saveAs } from '../utils/miscUtils.js';
import { writePdf } from './writePdf.js';
import { writePdf } from './pdf/writePdf.js';
import { writeHocr } from './writeHocr.js';
import { writeText } from './writeText.js';
import { writeHtml } from './writeHtml.js';
Expand Down Expand Up @@ -45,8 +45,8 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
const dimsLimit = { width: -1, height: -1 };
if (opt.standardizePageSize) {
for (let i = minPage; i <= maxPage; i++) {
dimsLimit.height = Math.max(dimsLimit.height, pageMetricsArr[i].dims.height);
dimsLimit.width = Math.max(dimsLimit.width, pageMetricsArr[i].dims.width);
dimsLimit.height = Math.max(dimsLimit.height, pageMetricsAll[i].dims.height);
dimsLimit.width = Math.max(dimsLimit.width, pageMetricsAll[i].dims.width);
}
}

Expand All @@ -58,10 +58,30 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {

const rotateText = !rotateBackground;

const includeImages = false;
/** @type {ImageWrapper[]} */
let images = [];
if (includeImages) {
images = await Promise.all(ImageCache.nativeSrc);
}

// Page sizes should not be standardized at this step, as the overlayText/overlayTextImage functions will perform this,
// and assume that the overlay PDF is the same size as the input images.
const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, rotateText, rotateBackground,
{ width: -1, height: -1 }, opt.confThreshHigh, opt.confThreshMed, opt.overlayOpacity / 100);
const pdfStr = await writePdf({
ocrArr: ocrDownload,
pageMetricsArr: pageMetricsAll,
minpage: minPage,
maxpage: maxPage,
textMode: opt.displayMode,
rotateText,
rotateBackground,
dimsLimit: { width: -1, height: -1 },
confThreshHigh: opt.confThreshHigh,
confThreshMed: opt.confThreshMed,
proofOpacity: opt.overlayOpacity / 100,
images,
includeImages,
});

const enc = new TextEncoder();
const pdfEnc = enc.encode(pdfStr);
Expand Down Expand Up @@ -121,7 +141,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {

await w.convertImageStart({ humanReadable: opt.humanReadablePDF });
for (let i = minPage; i < maxPage + 1; i++) {
/** @type {import('../containers/imageContainer.js').ImageWrapper} */
/** @type {ImageWrapper} */
let image;
if (binary) {
image = await ImageCache.getBinary(i, props);
Expand All @@ -134,7 +154,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
// Angle the PDF viewer is instructed to rotated the image by.
// This method is currently only used when rotation is needed but the user's (unrotated) source images are being used.
// If the images are being rendered, then rotation is expected to be applied within the rendering process.
const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsArr[i].angle || 0) * -1 : 0;
const angleImagePdf = rotateBackground && !renderImage ? (pageMetricsAll[i].angle || 0) * -1 : 0;

await w.convertImageAddPage({
image: image.src, i, pagewidth: dimsLimit.width, pageheight: dimsLimit.height, angle: angleImagePdf,
Expand All @@ -157,8 +177,19 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {

w.freeDocument(pdfOverlay);
} else {
const pdfStr = await writePdf(ocrDownload, minPage, maxPage, opt.displayMode, false, true, dimsLimit, opt.confThreshHigh, opt.confThreshMed,
opt.overlayOpacity / 100);
const pdfStr = await writePdf({
ocrArr: ocrDownload,
pageMetricsArr: pageMetricsAll,
minpage: minPage,
maxpage: maxPage,
textMode: opt.displayMode,
rotateText: false,
rotateBackground: true,
dimsLimit,
confThreshHigh: opt.confThreshHigh,
confThreshMed: opt.confThreshMed,
proofOpacity: opt.overlayOpacity / 100,
});

// The PDF is still run through muPDF, even thought in eBook mode no background layer is added.
// This is because muPDF cleans up the PDF we made in the previous step, including:
Expand Down Expand Up @@ -186,7 +217,7 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
w.freeDocument(pdf);
}
} else if (format === 'hocr') {
content = writeHocr(ocrDownload, minPage, maxPage);
content = writeHocr({ ocrData: ocrDownload, minValue: minPage, maxValue: maxPage });
} else if (format === 'html') {
const images = /** @type {Array<ImageWrapper>} */ ([]);
if (opt.includeImages) {
Expand Down Expand Up @@ -218,18 +249,29 @@ export async function exportData(format = 'txt', minPage = 0, maxPage = -1) {
ocrPages: ocrDownload, images, minpage: minPage, maxpage: maxPage, reflowText: opt.reflow, removeMargins: opt.removeMargins,
});
} else if (format === 'txt') {
content = writeText(ocrDownload, minPage, maxPage, opt.reflow, false);
content = writeText({
ocrCurrent: ocrDownload,
minpage: minPage,
maxpage: maxPage,
reflowText: opt.reflow,
docxMode: false,
});
// Defining `DISABLE_DOCX_XLSX` disables docx/xlsx exports when using build tools.
// @ts-ignore
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'docx') {
// Less common export formats are loaded dynamically to reduce initial load time.
const writeDocx = (await import('./writeDocx.js')).writeDocx;
content = await writeDocx(ocrDownload, minPage, maxPage);
content = await writeDocx({ hocrCurrent: ocrDownload, minpage: minPage, maxpage: maxPage });
// @ts-ignore
} else if (typeof DISABLE_DOCX_XLSX === 'undefined' && format === 'xlsx') {
// Less common export formats are loaded dynamically to reduce initial load time.
const writeXlsx = (await import('./writeTabular.js')).writeXlsx;
content = await writeXlsx(ocrDownload, layoutDataTables.pages, minPage, maxPage);
content = await writeXlsx({
ocrPageArr: ocrDownload,
layoutPageArr: layoutDataTables.pages,
minpage: minPage,
maxpage: maxPage,
});
} else if (format === 'scribe') {
const data = {
ocr: removeCircularRefsOcr(ocrDownload),
Expand Down
7 changes: 4 additions & 3 deletions js/export/exportDebugCsv.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ export const convertToCsv = (data) => {

/**
*
* @param {Array<OcrPage>} pages
* @param {string} fileName
* @param {Object} params
* @param {Array<OcrPage>} params.pages
* @param {string} params.fileName
* @returns
*/
export const writeDebugCsv = (pages, fileName) => {
export const writeDebugCsv = ({ pages, fileName }) => {
let csvStr = '';

for (let i = 0; i < pages.length; i++) {
Expand Down
Loading
Loading