Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dev/createRotationTextPdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { imageUtils } from '../js/objects/imageObjects.js';

await scribe.init({ font: true });
const images = await importImageFilesP([
'./tests/assets/econometrica_example.png',
'./tests/assets/testocr.png',
]);

const pageMetricsImages = images.map((image) => {
Expand All @@ -33,10 +33,11 @@ const pdfStr = await writePdf({
pageMetricsArr: pageMetricsImages,
includeImages: true,
rotateBackground: true,
rotateOrientation: true,
});

const enc = new TextEncoder();
const pdfEnc = enc.encode(pdfStr);

await writeFile('rotation_text_test.pdf', pdfEnc);
await writeFile('./tests/assets/testocr_all_orientations.pdf', pdfEnc);
await scribe.terminate();
22 changes: 21 additions & 1 deletion js/export/pdf/writePdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ const createPdfFontRefs = async (ocrArr) => {
* @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] -
* @param {boolean} [params.rotateText=false] -
* @param {boolean} [params.rotateBackground=false] -
* @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation
* of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach.
* @param {dims} [params.dimsLimit] -
* @param {number} [params.confThreshHigh=85] -
* @param {number} [params.confThreshMed=75] -
Expand All @@ -131,6 +133,7 @@ export async function writePdf({
textMode = 'ebook',
rotateText = false,
rotateBackground = false,
rotateOrientation = false,
dimsLimit = { width: -1, height: -1 },
confThreshHigh = 85,
confThreshMed = 75,
Expand Down Expand Up @@ -209,6 +212,7 @@ export async function writePdf({
pdfFonts,
textMode,
angle,
rotateOrientation,
rotateText,
rotateBackground,
confThreshHigh,
Expand Down Expand Up @@ -336,6 +340,8 @@ ${xrefOffset}
* @param {Object<string, PdfFontFamily>} params.pdfFonts
* @param {("ebook"|"eval"|"proof"|"invis")} params.textMode -
* @param {number} params.angle
* @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation
* of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach.
* @param {boolean} [params.rotateText=false]
* @param {boolean} [params.rotateBackground=false]
* @param {number} [params.confThreshHigh=85]
Expand All @@ -354,6 +360,7 @@ async function ocrPageToPDF({
pdfFonts,
textMode,
angle,
rotateOrientation = false,
rotateText = false,
rotateBackground = false,
confThreshHigh = 85,
Expand All @@ -370,6 +377,11 @@ async function ocrPageToPDF({

const pageIndex = firstObjIndex;
let pageObjStr = `${String(pageIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(outputDims.width)} ${String(outputDims.height)}]`;

if (rotateOrientation && (angle > 45 && angle < 135 || angle > 225 && angle < 315)) {
pageObjStr = `${String(pageIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(outputDims.height)} ${String(outputDims.width)}]`;
}

pageObjStr += `/Parent ${parentIndex} 0 R`;

if (noTextContent && noImageContent) {
Expand All @@ -393,7 +405,15 @@ async function ocrPageToPDF({
if (rotateBackground && Math.abs(angle ?? 0) > 0.05) {
rotation = angle;
}
imageContentObjStr += drawImageCommands(imageName, 0, 0, outputDims.width, outputDims.height, rotation);

let x = 0;
let y = 0;
if (rotateOrientation && (rotation > 45 && rotation < 135 || rotation > 225 && rotation < 315)) {
x = (outputDims.height - outputDims.width) / 2;
y = (outputDims.width - outputDims.height) / 2;
}

imageContentObjStr += drawImageCommands(imageName, x, y, outputDims.width, outputDims.height, rotation);
}

if (noTextContent) {
Expand Down
10 changes: 10 additions & 0 deletions js/generalWorkerMain.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ export async function initGeneralWorker() {
obj.convertPageAbbyy = wrap('convertPageAbbyy');
obj.convertPageStext = wrap('convertPageStext');
obj.convertDocTextract = wrap('convertDocTextract');
obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel');
obj.convertPageGoogleVision = wrap('convertPageGoogleVision');
obj.convertPageText = wrap('convertPageText');

Expand Down Expand Up @@ -192,6 +193,15 @@ export class gs {
return gs.schedulerInner.addJob('convertDocTextract', args);
};

/**
* @param {Parameters<typeof import('./import/convertDocAzureDocIntel.js').convertDocAzureDocIntel>[0]} args
* @returns {ReturnType<typeof import('./import/convertDocAzureDocIntel.js').convertDocAzureDocIntel>}
*/
static convertDocAzureDocIntel = async (args) => {
await gs.getGeneralScheduler();
return gs.schedulerInner.addJob('convertDocAzureDocIntel', args);
};

/**
* @param {Parameters<typeof import('./import/convertPageGoogleVision.js').convertPageGoogleVision>[0]} args
* @returns {ReturnType<typeof import('./import/convertPageGoogleVision.js').convertPageGoogleVision>}
Expand Down
58 changes: 57 additions & 1 deletion js/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ declare global {

// Strings representing supported sources of text.
// `stext` indicates the text was extracted directly from a PDF using mupdf.
type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text';
type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';

type FontState = {
enableOpt: boolean;
Expand Down Expand Up @@ -330,6 +330,62 @@ declare global {
text: string;
}

// Azure Document Intelligence types
// A 2-D point with numeric `x` and `y` coordinates.
// NOTE(review): `convertDocAzureDocIntel` reads polygons as flat number lists
// (even index = x, odd index = y) rather than as objects of this shape;
// confirm which representation the service actually returns.
interface AzureDocIntelPoint {
x: number;
y: number;
}

// A contiguous range described by a start `offset` and a `length`.
// `convertDocAzureDocIntel` assigns a word to a line when the word's span lies
// entirely inside one of the line's spans.
// Presumably offsets index into the document-level `content` string — TODO confirm.
interface AzureDocIntelSpan {
offset: number;
length: number;
}

// A single recognized word on a page.
interface AzureDocIntelWord {
  // Text of the word.
  content: string;
  // Flat coordinate list `[x0, y0, x1, y1, ...]`: x values sit at even indices and
  // y values at odd indices. This matches how `convertDocAzureDocIntel` reads the field
  // when it scales coordinates and derives bounding boxes.
  polygon: number[];
  // Range used to match the word to the line(s) whose spans contain it.
  span: AzureDocIntelSpan;
  // Recognition confidence; the importer multiplies it by 100 and rounds it,
  // so it is presumably a 0-1 fraction — TODO confirm.
  confidence: number;
}

// A single recognized text line on a page.
interface AzureDocIntelLine {
  // Text of the line.
  content: string;
  // Flat coordinate list `[x0, y0, x1, y1, ...]`: x values sit at even indices and
  // y values at odd indices. This matches how `convertDocAzureDocIntel` rescales the field.
  polygon: number[];
  // Ranges covered by the line; a word belongs to the line when its span falls
  // entirely inside one of these.
  spans: AzureDocIntelSpan[];
}

// Style annotation that applies to one or more spans.
interface AzureDocIntelStyle {
// Presumably set when the covered text was detected as handwriting — TODO confirm.
isHandwritten?: boolean;
// Ranges the style applies to.
spans: AzureDocIntelSpan[];
confidence: number;
}

// One page of an analyze result.
interface AzureDocIntelPage {
pageNumber: number;
// Copied to the page object's `angle` by `convertDocAzureDocIntel`;
// presumably expressed in degrees — TODO confirm.
angle: number;
// Page size. For non-pixel units the importer divides the caller-supplied page
// dimensions by these values to obtain the coordinate scale factors.
width: number;
height: number;
// When this is not 'pixel', `convertDocAzureDocIntel` rescales word and line polygons
// to the caller-supplied page dimensions.
unit: 'pixel' | 'inch';
words: AzureDocIntelWord[];
lines: AzureDocIntelLine[];
spans: AzureDocIntelSpan[];
}

// Payload of an analysis; `convertDocAzureDocIntel` reads only `pages` from it.
interface AzureDocIntelAnalyzeResult {
apiVersion: string;
modelId: string;
// Presumably the full document text that span offsets refer to — TODO confirm.
content: string;
// One entry per page; the importer requires at least one.
pages: AzureDocIntelPage[];
styles: AzureDocIntelStyle[];
}

// Top-level JSON document accepted by `convertDocAzureDocIntel`.
// NOTE(review): `analyzeResult` is declared as required even though `status` may be
// 'failed' or 'running' — confirm whether it should be optional.
interface AzureDocIntelResponse {
status: 'succeeded' | 'failed' | 'running';
createdDateTime: string;
lastUpdatedDateTime: string;
analyzeResult: AzureDocIntelAnalyzeResult;
}

}

export { };
Expand Down
140 changes: 140 additions & 0 deletions js/import/convertDocAzureDocIntel.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import ocr from '../objects/ocrObjects.js';

import { LayoutDataTablePage } from '../objects/layoutObjects.js';
import { pass2, pass3 } from './convertPageShared.js';

// NOTE: This is a WIP and incomplete.
// The Azure Document Intelligence format is not yet supported.

const debugMode = false;

/**
 * Compute the axis-aligned bounding box that encloses a set of flat polygons.
 *
 * @param {number[][]} polygons - Polygons as flat `[x0, y0, x1, y1, ...]` lists
 *    (x at even indices, y at odd indices).
 * @returns {{left: number, top: number, right: number, bottom: number}}
 */
function calcPolygonsBbox(polygons) {
  const xs = [];
  const ys = [];
  for (const polygon of polygons) {
    for (let p = 0; p < polygon.length; p++) {
      if (p % 2 === 0) {
        xs.push(polygon[p]);
      } else {
        ys.push(polygon[p]);
      }
    }
  }

  return {
    left: Math.min(...xs),
    top: Math.min(...ys),
    right: Math.max(...xs),
    bottom: Math.max(...ys),
  };
}

/**
 * Convert an Azure Document Intelligence JSON result into one OCR page object per page.
 *
 * The returned array always contains exactly one entry per input page, in page order.
 * Pages without any words yield `{ pageObj, charMetricsObj, dataTables, warn }`;
 * all other pages yield `{ pageObj, dataTables, langSet }`.
 *
 * @param {Object} params
 * @param {string} params.ocrStr - JSON text of the Azure Document Intelligence response.
 * @param {dims[]} [params.pageDims] - Page metrics to use for the pages.
 *    Required for pages whose `unit` is not 'pixel' (coordinates are rescaled to these dimensions).
 *    For 'pixel' pages without an entry, the width/height reported in the JSON are used.
 * @throws {Error} If the JSON cannot be parsed, contains no pages, a page lacks width/height,
 *    or a non-pixel page has no usable entry in `pageDims`.
 */
export async function convertDocAzureDocIntel({ ocrStr, pageDims }) {
  let ocrData;
  try {
    ocrData = JSON.parse(ocrStr);
  } catch (error) {
    throw new Error('Failed to parse Azure Document Intelligence JSON data.', { cause: error });
  }

  // Optional chaining so that valid-but-unexpected JSON (e.g. `null`) produces the format error
  // instead of a raw TypeError.
  if (!ocrData?.analyzeResult?.pages?.[0]) {
    throw new Error('Invalid Azure Document Intelligence format: missing pages data.');
  }

  const analyzeResultPages = /** @type {AzureDocIntelPage[]} */ (ocrData.analyzeResult.pages);

  const resArr = [];

  for (let n = 0; n < analyzeResultPages.length; n++) {
    const pageData = analyzeResultPages[n];

    if (!pageData.width || !pageData.height) {
      throw new Error('Failed to parse page dimensions.');
    }

    const isPixelUnit = pageData.unit === 'pixel';
    const pageDimsInput = pageDims?.[n];

    // Non-pixel coordinates can only be converted when the target dimensions are known.
    if (!isPixelUnit && (!pageDimsInput || !pageDimsInput.width || !pageDimsInput.height)) {
      throw new Error('Page dimensions must be provided for non-pixel units.');
    }

    // For pixel units the dimensions reported by the service are already in the right space,
    // so they serve as the fallback when the caller supplied none.
    const pageDimsN = pageDimsInput ?? { width: pageData.width, height: pageData.height };

    const pageObj = new ocr.OcrPage(n, pageDimsN);
    pageObj.textSource = 'azure_doc_intel';

    const pageWords = pageData.words ?? [];
    const pageLines = pageData.lines ?? [];

    if (pageWords.length === 0) {
      const warn = { char: 'char_error' };
      resArr.push({
        pageObj, charMetricsObj: {}, dataTables: new LayoutDataTablePage(n), warn,
      });
      // Without this the page would fall through and be pushed a second time,
      // shifting every later page by one position in the result.
      continue;
    }

    if (!isPixelUnit) {
      const multX = pageDimsN.width / pageData.width;
      const multY = pageDimsN.height / pageData.height;
      const scalePolygon = (polygon) => polygon.map((val, idx) => (idx % 2 === 0 ? val * multX : val * multY));

      // `pageData` comes from the JSON parsed above, so updating it in place does not
      // touch any caller-owned object.
      pageLines.forEach((line) => {
        line.polygon = scalePolygon(line.polygon);
      });
      pageWords.forEach((word) => {
        word.polygon = scalePolygon(word.polygon);
      });
    }

    for (const lineData of pageLines) {
      // A word belongs to the line when its span lies entirely inside one of the line's spans.
      const lineWordsInput = /** @type {AzureDocIntelWord[]} */ ([]);
      for (const lineSpan of lineData.spans) {
        const lineSpanEnd = lineSpan.offset + lineSpan.length;
        for (const word of pageWords) {
          const wordSpan = word.span;
          if (wordSpan.offset >= lineSpan.offset && (wordSpan.offset + wordSpan.length) <= lineSpanEnd) {
            lineWordsInput.push(word);
          }
        }
      }

      if (lineWordsInput.length === 0) continue;

      // The line box is derived from its words rather than from the line polygon.
      const lineBbox = calcPolygonsBbox(lineWordsInput.map((word) => word.polygon));

      // Placeholder baseline; no baseline is derived from the input here.
      const baseline = [0, 0];

      const lineObj = new ocr.OcrLine(pageObj, lineBbox, baseline);
      if (debugMode) lineObj.raw = JSON.stringify(lineWordsInput);

      for (let j = 0; j < lineWordsInput.length; j++) {
        const wordData = lineWordsInput[j];

        if (!wordData.content || wordData.content.trim() === '') continue;

        const wordBbox = calcPolygonsBbox([wordData.polygon]);

        const wordId = `word_${n + 1}_${pageObj.lines.length + 1}_${j + 1}`;
        const wordObj = new ocr.OcrWord(lineObj, wordId, wordData.content, wordBbox);

        // The input confidence is multiplied by 100 and rounded to an integer.
        wordObj.conf = Math.round((wordData.confidence || 0) * 100);

        if (debugMode) wordObj.raw = JSON.stringify(wordData);

        lineObj.words.push(wordObj);
      }

      if (lineObj.words.length > 0) {
        pageObj.lines.push(lineObj);
      }
    }

    pageObj.angle = pageData.angle || 0;

    // NOTE(review): `pass2` is currently disabled for this importer.
    // pass2(pageObj, 0);
    const langSet = pass3(pageObj);

    const dataTables = new LayoutDataTablePage(n);

    resArr.push({ pageObj, dataTables, langSet });
  }

  return resArr;
}
Loading