scribeocr · Balearica · Nov 1, 2025 · Nov 1, 2025
diff --git a/cloud-adapters b/cloud-adapters
diff --git a/dev/createRotationTextPdf.js b/dev/createRotationTextPdf.js
@@ -7,7 +7,7 @@ import { imageUtils } from '../js/objects/imageObjects.js';
 
 await scribe.init({ font: true });
 const images = await importImageFilesP([
-  './tests/assets/econometrica_example.png',
+  './tests/assets/testocr.png',
 ]);
 
 const pageMetricsImages = images.map((image) => {
@@ -33,10 +33,11 @@ const pdfStr = await writePdf({
   pageMetricsArr: pageMetricsImages,
   includeImages: true,
   rotateBackground: true,
+  rotateOrientation: true,
 });
 
 const enc = new TextEncoder();
 const pdfEnc = enc.encode(pdfStr);
 
-await writeFile('rotation_text_test.pdf', pdfEnc);
+await writeFile('./tests/assets/testocr_all_orientations.pdf', pdfEnc);
 await scribe.terminate();
diff --git a/js/export/pdf/writePdf.js b/js/export/pdf/writePdf.js
@@ -114,6 +114,8 @@ const createPdfFontRefs = async (ocrArr) => {
  * @param {("ebook"|"eval"|"proof"|"invis")} [params.textMode="ebook"] -
  * @param {boolean} [params.rotateText=false] -
  * @param {boolean} [params.rotateBackground=false] -
+ * @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation
+ *    of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach.
  * @param {dims} [params.dimsLimit] -
  * @param {number} [params.confThreshHigh=85] -
  * @param {number} [params.confThreshMed=75] -
@@ -131,6 +133,7 @@ export async function writePdf({
   textMode = 'ebook',
   rotateText = false,
   rotateBackground = false,
+  rotateOrientation = false,
   dimsLimit = { width: -1, height: -1 },
   confThreshHigh = 85,
   confThreshMed = 75,
@@ -209,6 +212,7 @@ export async function writePdf({
       pdfFonts,
       textMode,
       angle,
+      rotateOrientation,
       rotateText,
       rotateBackground,
       confThreshHigh,
@@ -336,6 +340,8 @@ ${xrefOffset}
  * @param {Object<string, PdfFontFamily>} params.pdfFonts
  * @param {("ebook"|"eval"|"proof"|"invis")} params.textMode -
  * @param {number} params.angle
+ * @param {boolean} [params.rotateOrientation=false] - If true, canvas is adjusted to flip width/height to account for image rotation
+ *    of 90 or 270 degrees. This argument is currently only used in a dev script and may not be the best approach.
  * @param {boolean} [params.rotateText=false]
  * @param {boolean} [params.rotateBackground=false]
  * @param {number} [params.confThreshHigh=85]
@@ -354,6 +360,7 @@ async function ocrPageToPDF({
   pdfFonts,
   textMode,
   angle,
+  rotateOrientation = false,
   rotateText = false,
   rotateBackground = false,
   confThreshHigh = 85,
@@ -370,6 +377,11 @@ async function ocrPageToPDF({
 
   const pageIndex = firstObjIndex;
   let pageObjStr = `${String(pageIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(outputDims.width)} ${String(outputDims.height)}]`;
+
+  if (rotateOrientation && (angle > 45 && angle < 135 || angle > 225 && angle < 315)) {
+    pageObjStr = `${String(pageIndex)} 0 obj\n<</Type/Page/MediaBox[0 0 ${String(outputDims.height)} ${String(outputDims.width)}]`;
+  }
+
   pageObjStr += `/Parent ${parentIndex} 0 R`;
 
   if (noTextContent && noImageContent) {
@@ -393,7 +405,15 @@ async function ocrPageToPDF({
     if (rotateBackground && Math.abs(angle ?? 0) > 0.05) {
       rotation = angle;
     }
-    imageContentObjStr += drawImageCommands(imageName, 0, 0, outputDims.width, outputDims.height, rotation);
+
+    let x = 0;
+    let y = 0;
+    if (rotateOrientation && (rotation > 45 && rotation < 135 || rotation > 225 && rotation < 315)) {
+      x = (outputDims.height - outputDims.width) / 2;
+      y = (outputDims.width - outputDims.height) / 2;
+    }
+
+    imageContentObjStr += drawImageCommands(imageName, x, y, outputDims.width, outputDims.height, rotation);
   }
 
   if (noTextContent) {

diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js
@@ -96,6 +96,7 @@ export async function initGeneralWorker() {
     obj.convertPageAbbyy = wrap('convertPageAbbyy');
     obj.convertPageStext = wrap('convertPageStext');
     obj.convertDocTextract = wrap('convertDocTextract');
+    obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel');
     obj.convertPageGoogleVision = wrap('convertPageGoogleVision');
     obj.convertPageText = wrap('convertPageText');
 
@@ -192,6 +193,15 @@ export class gs {
     return gs.schedulerInner.addJob('convertDocTextract', args);
   };
 
+  /**
+   * Adds a `convertDocAzureDocIntel` job to the general scheduler.
+   * `gs.getGeneralScheduler()` is awaited first, and the job is then added through `gs.schedulerInner`.
+   * @param {Parameters<typeof import('./import/convertDocAzureDocIntel.js').convertDocAzureDocIntel>[0]} args
+   * @returns {ReturnType<typeof import('./import/convertDocAzureDocIntel.js').convertDocAzureDocIntel>}
+   */
+  static convertDocAzureDocIntel = async (args) => {
+    await gs.getGeneralScheduler();
+    const pendingJob = gs.schedulerInner.addJob('convertDocAzureDocIntel', args);
+    return pendingJob;
+  };
+
   /**
    * @param {Parameters<typeof import('./import/convertPageGoogleVision.js').convertPageGoogleVision>[0]} args
    * @returns {ReturnType<typeof import('./import/convertPageGoogleVision.js').convertPageGoogleVision>}

diff --git a/js/global.d.ts b/js/global.d.ts
@@ -13,7 +13,7 @@ declare global {
 
     // Strings representing supported sources of text.
     // `stext` indicates the text was extracted directly from a PDF using mupdf.
-    type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text';
+    type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';
 
     type FontState = {
         enableOpt: boolean;
@@ -330,6 +330,62 @@ declare global {
         text: string;
     }
 
+    // Azure Document Intelligence types.
+    // `convertDocAzureDocIntel` casts the parsed JSON (`analyzeResult.pages`) to these shapes.
+
+    // Coordinate pair in object form. The `polygon` fields below use flat number arrays instead.
+    interface AzureDocIntelPoint {
+        x: number;
+        y: number;
+    }
+
+    // Start offset and length. The converter assigns a word to a line when the word span lies inside a line span.
+    interface AzureDocIntelSpan {
+        offset: number;
+        length: number;
+    }
+
+    interface AzureDocIntelWord {
+        content: string;
+        // Flat coordinate list `[x0, y0, x1, y1, ...]`: the converter reads even indices as x and odd indices as y.
+        polygon: number[];
+        span: AzureDocIntelSpan;
+        // Multiplied by 100 and rounded by the converter to produce the word's `conf`.
+        confidence: number;
+    }
+
+    interface AzureDocIntelLine {
+        content: string;
+        // Flat coordinate list `[x0, y0, x1, y1, ...]`, same layout as `AzureDocIntelWord.polygon`.
+        polygon: number[];
+        spans: AzureDocIntelSpan[];
+    }
+
+    interface AzureDocIntelStyle {
+        isHandwritten?: boolean;
+        spans: AzureDocIntelSpan[];
+        confidence: number;
+    }
+
+    interface AzureDocIntelPage {
+        pageNumber: number;
+        angle: number;
+        width: number;
+        height: number;
+        // When this is not 'pixel', the converter rescales polygons using the caller-provided page dimensions.
+        unit: 'pixel' | 'inch';
+        words: AzureDocIntelWord[];
+        lines: AzureDocIntelLine[];
+        spans: AzureDocIntelSpan[];
+    }
+
+    interface AzureDocIntelAnalyzeResult {
+        apiVersion: string;
+        modelId: string;
+        content: string;
+        pages: AzureDocIntelPage[];
+        styles: AzureDocIntelStyle[];
+    }
+
+    interface AzureDocIntelResponse {
+        status: 'succeeded' | 'failed' | 'running';
+        createdDateTime: string;
+        lastUpdatedDateTime: string;
+        analyzeResult: AzureDocIntelAnalyzeResult;
+    }
+
+
 }
 
 export { };

diff --git a/js/import/convertDocAzureDocIntel.js b/js/import/convertDocAzureDocIntel.js
@@ -0,0 +1,140 @@
+import ocr from '../objects/ocrObjects.js';
+
+import { LayoutDataTablePage } from '../objects/layoutObjects.js';
+import { pass2, pass3 } from './convertPageShared.js';
+
+// NOTE: This is a WIP and incomplete.
+// The Azure Document Intelligence format is not yet supported.
+
+const debugMode = false;
+
+/**
+ * Converts a raw Azure Document Intelligence response into OCR page objects.
+ *
+ * Resolves to an array with exactly one entry per page in `analyzeResult.pages`.
+ * Pages without words get an entry carrying `warn: { char: 'char_error' }` and no lines.
+ * Throws an `Error` if `ocrStr` is not valid JSON, if `analyzeResult.pages` is missing or empty,
+ * if a page lacks `width`/`height`, or if a non-pixel page has no usable entry in `pageDims`.
+ *
+ * NOTE: This importer is a WIP. Baselines are left as `[0, 0]` and `pass2` is not run.
+ *
+ * @param {Object} params
+ * @param {string} params.ocrStr - JSON text of the response; must contain `analyzeResult.pages`.
+ * @param {dims[]} params.pageDims - Page metrics to use for the pages
+ */
+export async function convertDocAzureDocIntel({ ocrStr, pageDims }) {
+  let ocrData;
+  try {
+    ocrData = JSON.parse(ocrStr);
+  } catch (error) {
+    throw new Error('Failed to parse Azure Document Intelligence JSON data.');
+  }
+
+  // `JSON.parse` can legally return `null` (or a primitive), so check `ocrData` itself before reading properties.
+  if (!ocrData || !ocrData.analyzeResult || !ocrData.analyzeResult.pages || !ocrData.analyzeResult.pages[0]) {
+    throw new Error('Invalid Azure Document Intelligence format: missing pages data.');
+  }
+
+  const analyzeResultPages = /** @type {AzureDocIntelPage[]} */ (ocrData.analyzeResult.pages);
+
+  const resArr = [];
+
+  for (let n = 0; n < analyzeResultPages.length; n++) {
+    const pageData = analyzeResultPages[n];
+    const pageDimsN = pageDims[n];
+
+    if (!pageData.width || !pageData.height) {
+      throw new Error('Failed to parse page dimensions.');
+    }
+
+    const pageObj = new ocr.OcrPage(n, pageDimsN);
+    pageObj.textSource = 'azure_doc_intel';
+
+    if (!pageData.words || pageData.words.length === 0) {
+      const warn = { char: 'char_error' };
+      resArr.push({
+        pageObj, charMetricsObj: {}, dataTables: new LayoutDataTablePage(n), warn,
+      });
+      // The page is finished. Without this, the code below would either throw on the missing `words`
+      // or push a second entry for the same page, leaving `resArr` out of step with the page indices.
+      continue;
+    }
+
+    // Polygons are flat `[x0, y0, x1, y1, ...]` lists. For non-pixel units they are rescaled
+    // in place to the coordinate space of the provided page dimensions.
+    if (pageData.unit !== 'pixel') {
+      if (!pageDimsN || !pageDimsN.width || !pageDimsN.height) {
+        throw new Error('Page dimensions must be provided for non-pixel units.');
+      }
+
+      const pageDimsMult = {
+        width: pageDimsN.width / pageData.width,
+        height: pageDimsN.height / pageData.height,
+      };
+
+      pageData.lines.forEach((line) => {
+        line.polygon = line.polygon.map((val, idx) => (idx % 2 === 0 ? val * pageDimsMult.width : val * pageDimsMult.height));
+      });
+
+      pageData.words.forEach((word) => {
+        word.polygon = word.polygon.map((val, idx) => (idx % 2 === 0 ? val * pageDimsMult.width : val * pageDimsMult.height));
+      });
+    }
+
+    for (let i = 0; i < pageData.lines.length; i++) {
+      // Collect the words whose span lies entirely within one of this line's spans.
+      const lineWordsInput = /** @type {AzureDocIntelWord[]} */ ([]);
+      for (let j = 0; j < pageData.lines[i].spans.length; j++) {
+        const span = pageData.lines[i].spans[j];
+        for (let k = 0; k < pageData.words.length; k++) {
+          const wordSpan = pageData.words[k].span;
+          if (wordSpan.offset >= span.offset && (wordSpan.offset + wordSpan.length) <= (span.offset + span.length)) {
+            lineWordsInput.push(pageData.words[k]);
+          }
+        }
+      }
+
+      if (lineWordsInput.length === 0) continue;
+
+      // The line box is the union of its word polygons; the line's own polygon is not used here.
+      const allX = lineWordsInput.flatMap((w) => w.polygon.filter((_, idx) => idx % 2 === 0));
+      const allY = lineWordsInput.flatMap((w) => w.polygon.filter((_, idx) => idx % 2 === 1));
+
+      const lineBbox = {
+        left: Math.min(...allX),
+        top: Math.min(...allY),
+        right: Math.max(...allX),
+        bottom: Math.max(...allY),
+      };
+
+      // Placeholder baseline: no baseline is derived from the Azure data yet.
+      const baseline = [0, 0];
+
+      const lineObj = new ocr.OcrLine(pageObj, lineBbox, baseline);
+      if (debugMode) lineObj.raw = JSON.stringify(lineWordsInput);
+
+      for (let j = 0; j < lineWordsInput.length; j++) {
+        const wordData = lineWordsInput[j];
+
+        if (!wordData.content || wordData.content.trim() === '') continue;
+
+        const wordX = wordData.polygon.filter((_, idx) => idx % 2 === 0);
+        const wordY = wordData.polygon.filter((_, idx) => idx % 2 === 1);
+
+        const wordBbox = {
+          left: Math.min(...wordX),
+          top: Math.min(...wordY),
+          right: Math.max(...wordX),
+          bottom: Math.max(...wordY),
+        };
+
+        const wordId = `word_${n + 1}_${pageObj.lines.length + 1}_${j + 1}`;
+        const wordObj = new ocr.OcrWord(lineObj, wordId, wordData.content, wordBbox);
+
+        // `confidence` is scaled by 100 and rounded; a missing value becomes 0.
+        wordObj.conf = Math.round((wordData.confidence || 0) * 100);
+
+        if (debugMode) wordObj.raw = JSON.stringify(wordData);
+
+        lineObj.words.push(wordObj);
+      }
+
+      if (lineObj.words.length > 0) {
+        pageObj.lines.push(lineObj);
+      }
+    }
+
+    const pageAngle = pageData.angle || 0;
+    pageObj.angle = pageAngle;
+
+    // pass2(pageObj, 0);
+    const langSet = pass3(pageObj);
+
+    const dataTables = new LayoutDataTablePage(n);
+
+    resArr.push({ pageObj, dataTables, langSet });
+  }
+
+  return resArr;
+}
+1 −0		.gitignore
+170,608 −0		dev/assets/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations-AwsTextractLayout.json
+211,209 −0		dev/assets/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations-GoogleVision.json
+ −		dev/assets/CSF_Proposed_Budget_Book_June_2024_r8_30_all_orientations.pdf
+254,676 −0		dev/assets/econometrica_example_all_orientations-AwsTextractLayout.json
+441,805 −0		dev/assets/econometrica_example_all_orientations-GoogleVision.json
+ −		dev/assets/econometrica_example_all_orientations.pdf
+ −		dev/assets/simple.png
+1,888 −0		dev/assets/simple_all_orientations-AwsTextractLayout.json
+ −		dev/assets/simple_all_orientations.pdf
+2,471 −0		dev/assets/testocr-AwsTextractLayoutSync.json
+ −		dev/assets/testocr.png
+31,384 −0		dev/assets/testocr_all_orientations-AwsTextractLayout.json
+ −		dev/assets/testocr_all_orientations.pdf
+45 −0		dev/runAwsTextractAsync.js
+32 −0		dev/runAwsTextractSync.js
+38 −0		dev/runGoogleVisionAsync.js
+27 −0		dev/runGoogleVisionSync.js
+358 −0		ocrEngineAwsTextract.js
+211 −0		ocrEngineGoogleVision.js
+3,452 −0		package-lock.json
+23 −0		package.json