scribeocr · Balearica · Dec 7, 2025 · Dec 7, 2025
diff --git a/js/generalWorkerMain.js b/js/generalWorkerMain.js
@@ -94,6 +94,7 @@ export async function initGeneralWorker() {
 
     obj.convertPageHocr = wrap('convertPageHocr');
     obj.convertPageAbbyy = wrap('convertPageAbbyy');
+    obj.convertPageAlto = wrap('convertPageAlto');
     obj.convertPageStext = wrap('convertPageStext');
     obj.convertDocTextract = wrap('convertDocTextract');
     obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel');
@@ -184,6 +185,15 @@ export class gs {
     return gs.schedulerInner.addJob('convertPageAbbyy', args);
   };
 
+  /**
+   * @param {Parameters<typeof import('./import/convertPageAlto.js').convertPageAlto>[0]} args - Passed unchanged as the arguments of the 'convertPageAlto' job.
+   * @returns {ReturnType<typeof import('./import/convertPageAlto.js').convertPageAlto>} - Whatever `gs.schedulerInner.addJob` returns for that job.
+   */
+  static convertPageAlto = async (args) => {
+    await gs.getGeneralScheduler(); // NOTE(review): presumably ensures `gs.schedulerInner` is ready before it is used below — confirm.
+    return gs.schedulerInner.addJob('convertPageAlto', args);
+  };
+
   /**
    * @param {Parameters<typeof import('./import/convertDocTextract.js').convertDocTextract>[0]} args
    * @returns {ReturnType<typeof import('./import/convertDocTextract.js').convertDocTextract>}

diff --git a/js/global.d.ts b/js/global.d.ts
@@ -13,7 +13,7 @@ declare global {
 
     // Strings representing supported sources of text.
     // `stext` indicates the text was extracted directly from a PDF using mupdf.
-    type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';
+    type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'alto' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';
 
     type FontState = {
         enableOpt: boolean;

diff --git a/js/import/convertPageAlto.js b/js/import/convertPageAlto.js
@@ -0,0 +1,180 @@
+import ocr from '../objects/ocrObjects.js';
+
+import {
+  unescapeXml,
+} from '../utils/miscUtils.js';
+
+import { LayoutDataTablePage } from '../objects/layoutObjects.js';
+import { pass2, pass3 } from './convertPageShared.js';
+
+// This import is a WIP and does not produce a text layer that closely overlays the source document.
+// While the result can likely be improved, this is caused in part by limitations in the ALTO format itself.
+// The ALTO format only includes the most basic positioning information (word-level bounding boxes).
+
+const debugMode = false;
+
+/**
+ * Convert one page of ALTO XML into an `OcrPage`.
+ * Parsing is regex-based: each `<TextLine>` found inside a `<TextBlock>` becomes an `OcrLine`, and each
+ * `<String>` inside it becomes an `OcrWord`. Any `<TextStyle>` tags present in `ocrStr` are used to
+ * resolve `STYLEREFS` into a font family and size.
+ *
+ * @param {Object} params
+ * @param {string} params.ocrStr - ALTO XML containing the `<Page>` element to convert.
+ * @param {number} params.n - Page index given to `OcrPage`/`LayoutDataTablePage`; also used to build word IDs.
+ * @returns The page object, a newly created `LayoutDataTablePage`, a warning object, and the result of `pass3`.
+ * @throws {Error} If no `<Page>` tag is found, or its HEIGHT/WIDTH cannot be parsed.
+ */
+export async function convertPageAlto({ ocrStr, n }) {
+  /**
+   * Extract attribute value from an XML element string.
+   * The name must follow whitespace so that e.g. `CONTENT` does not match inside `SUBS_CONTENT`, and the
+   * value ends at the same quote character that opened it so that e.g. `"don't"` is returned in full.
+   * @param {string} elemStr
+   * @param {string} attrName
+   * @returns {?string} The raw value, or `null` if the attribute is missing or empty.
+   */
+  function getAttr(elemStr, attrName) {
+    const regex = new RegExp(`\\s${attrName}\\s*=\\s*(["'])([\\s\\S]*?)\\1`, 'i');
+    return elemStr.match(regex)?.[2] || null;
+  }
+
+  const pageTag = ocrStr.match(/<Page[^>]+>/i)?.[0];
+  if (!pageTag) throw new Error('Failed to parse ALTO page element.');
+
+  // Dimensions may be written with a decimal part (e.g. "3508.0"); `parseInt` keeps the integer part.
+  const pageHeight = parseInt(getAttr(pageTag, 'HEIGHT') ?? '', 10);
+  const pageWidth = parseInt(getAttr(pageTag, 'WIDTH') ?? '', 10);
+  if (Number.isNaN(pageHeight) || Number.isNaN(pageWidth)) throw new Error('Failed to parse page dimensions.');
+
+  const pageObj = new ocr.OcrPage(n, { height: pageHeight, width: pageWidth });
+
+  // Index the `<TextStyle>` definitions by ID once, instead of searching the whole document for every word.
+  /** @type {Map<string, {font: ?string, size: ?string}>} */
+  const textStyles = new Map();
+  for (const [styleTag] of ocrStr.matchAll(/<TextStyle\s[^>]*>/gi)) {
+    const styleId = getAttr(styleTag, 'ID');
+    if (styleId) textStyles.set(styleId, { font: getAttr(styleTag, 'FONTFAMILY'), size: getAttr(styleTag, 'FONTSIZE') });
+  }
+
+  /**
+   * Return the first known `<TextStyle>` named in a `STYLEREFS` value (a whitespace-separated list of IDs).
+   * @param {?string} styleRefs
+   */
+  const findTextStyle = (styleRefs) => (styleRefs ?? '').split(/\s+/).map((id) => textStyles.get(id)).find(Boolean);
+
+  /**
+   * Convert one `<TextLine>` element and add it to `pageObj` if it yields at least one word.
+   * @param {string} match - The TextLine element match
+   * @param {?string} blockStyleRefs - STYLEREFS from parent TextBlock
+   */
+  function convertLine(match, blockStyleRefs = null) {
+    const textLineTag = match.match(/<TextLine[^>]+>/i)?.[0];
+    if (!textLineTag) return;
+
+    const lineVposStr = getAttr(textLineTag, 'VPOS');
+    const lineHposStr = getAttr(textLineTag, 'HPOS');
+    const lineHeightStr = getAttr(textLineTag, 'HEIGHT');
+    const lineWidthStr = getAttr(textLineTag, 'WIDTH');
+
+    if (!lineVposStr || !lineHposStr || !lineHeightStr || !lineWidthStr) {
+      console.warn('Missing required positional attributes in ALTO TextLine element, skipping line.');
+      return;
+    }
+
+    const height = parseInt(lineHeightStr, 10);
+    const linebox = {
+      left: parseInt(lineHposStr, 10),
+      top: parseInt(lineVposStr, 10),
+      right: parseInt(lineHposStr, 10) + parseInt(lineWidthStr, 10),
+      bottom: parseInt(lineVposStr, 10) + height,
+    };
+
+    // No baseline is read from the source; `[0, 0]` is passed to `OcrLine` for every line.
+    const baseline = [0, 0];
+
+    // Height used as rough estimate for ascender height
+    const lineAscHeightFinal = height * 0.75;
+    const lineXHeightFinal = height * 0.5;
+
+    const lineObj = new ocr.OcrLine(pageObj, linebox, baseline, lineAscHeightFinal, lineXHeightFinal);
+
+    if (debugMode) lineObj.raw = match;
+
+    const lineStyleRefs = getAttr(textLineTag, 'STYLEREFS');
+
+    for (const [contentMatch] of match.matchAll(/<String\s+[^>]+\/?>/gi)) {
+      const content = getAttr(contentMatch, 'CONTENT');
+      if (!content) continue;
+
+      const strHpos = getAttr(contentMatch, 'HPOS');
+      const strVpos = getAttr(contentMatch, 'VPOS');
+      const strHeight = getAttr(contentMatch, 'HEIGHT');
+      const strWidth = getAttr(contentMatch, 'WIDTH');
+
+      if (!strHpos || !strVpos || !strHeight || !strWidth) {
+        console.warn('Missing required positional attributes in ALTO String element, skipping element.');
+        continue;
+      }
+
+      const wordBox = {
+        left: parseInt(strHpos, 10),
+        top: parseInt(strVpos, 10),
+        right: parseInt(strHpos, 10) + parseInt(strWidth, 10),
+        bottom: parseInt(strVpos, 10) + parseInt(strHeight, 10),
+      };
+
+      const wordID = `word_${n + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`;
+      const wordObj = new ocr.OcrWord(lineObj, wordID, unescapeXml(content), wordBox);
+
+      // WC is scaled by 100 and rounded; a value that is not a number is ignored.
+      const wordConf = parseFloat(getAttr(contentMatch, 'WC') ?? '');
+      if (!Number.isNaN(wordConf)) wordObj.conf = Math.round(wordConf * 100);
+
+      // Parse style attributes
+      const styleAttr = getAttr(contentMatch, 'STYLE');
+      if (styleAttr) {
+        if (/bold/i.test(styleAttr)) wordObj.style.bold = true;
+        if (/italic/i.test(styleAttr)) wordObj.style.italic = true;
+        if (/underline/i.test(styleAttr)) wordObj.style.underline = true;
+        if (/superscript/i.test(styleAttr)) wordObj.style.sup = true;
+        if (/smallcaps/i.test(styleAttr)) wordObj.style.smallCaps = true;
+      }
+
+      // Font information: use the String's STYLEREFS first, then fall back to the TextLine's and the TextBlock's.
+      const textStyle = findTextStyle(getAttr(contentMatch, 'STYLEREFS'))
+        ?? findTextStyle(lineStyleRefs) ?? findTextStyle(blockStyleRefs);
+      if (textStyle?.font) wordObj.style.font = textStyle.font;
+      if (textStyle?.size) wordObj.style.size = parseInt(textStyle.size, 10);
+
+      if (debugMode) wordObj.raw = contentMatch;
+
+      lineObj.words.push(wordObj);
+    }
+
+    if (lineObj.words.length > 0) pageObj.lines.push(lineObj);
+  }
+
+  const textLineRegex = /<TextLine[^>]*>([\s\S]*?)<\/TextLine>/gi;
+  const textBlockRegex = /<TextBlock[^>]*>([\s\S]*?)<\/TextBlock>/gi;
+
+  for (const blockMatch of ocrStr.matchAll(textBlockRegex)) {
+    const blockTag = blockMatch[0].match(/<TextBlock[^>]+>/i)?.[0];
+    const blockStyleRefs = blockTag ? getAttr(blockTag, 'STYLEREFS') : null;
+
+    for (const lineMatch of blockMatch[1].matchAll(textLineRegex)) {
+      convertLine(lineMatch[0], blockStyleRefs);
+    }
+  }
+
+  const warn = { char: 'char_warning' };
+
+  pass2(pageObj, 0);
+  const langSet = pass3(pageObj);
+
+  const dataTablePage = new LayoutDataTablePage(n);
+
+  return {
+    pageObj, dataTables: dataTablePage, warn, langSet,
+  };
+}
diff --git a/js/import/import.js b/js/import/import.js
@@ -397,7 +397,7 @@ export async function importFiles(files) {
 
     const ocrData = await importOCRFiles(Array.from(ocrFiles));
 
-    format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
+    format = /** @type {("hocr" | "abbyy" | "alto" | "stext" | "textract" | "text")} */ (ocrData.format);
 
     // The text import function requires built-in fonts to be loaded.
     if (format === 'text') {
@@ -415,7 +415,7 @@ export async function importFiles(files) {
       ocrAllRaw.active = ocrAllRaw.active.slice(0, pageCountImage);
     }
 
-    format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
+    format = /** @type {("hocr" | "abbyy" | "alto" | "stext" | "textract" | "text")} */ (ocrData.format);
     reimportHocrMode = ocrData.reimportHocrMode;
 
     if (ocrData.reimportHocrMode) {

diff --git a/js/import/importOCR.js b/js/import/importOCR.js
@@ -24,6 +24,11 @@ const detectOcrFormat = (ocrStr, ext) => {
     }
   }
 
+  // Check whether input is ALTO XML
+  if (/<alto[\s>]/i.test(ocrStr) && /xmlns="http:\/\/www\.loc\.gov\/standards\/alto/i.test(ocrStr)) {
+    return 'alto';
+  }
+
   // Check whether input is Abbyy XML
   // TODO: The auto-detection of formats needs to be more robust.
   // At present, any string that contains ">" and "abbyy" is considered Abbyy XML.
@@ -114,6 +119,16 @@ export async function importOCRFiles(ocrFilesAll) {
       }
     } else if (format === 'azure_doc_intel') {
       hocrRaw = [hocrStrAll];
+    } else if (format === 'alto') {
+      // Extract the Styles section to prepend to each page
+      const stylesMatch = hocrStrAll.match(/<Styles>[\s\S]*?<\/Styles>/i);
+      const stylesSection = stylesMatch ? stylesMatch[0] : '';
+
+      // Split by Page elements
+      const pages = hocrStrAll.split(/(?=<Page\s)/).slice(1);
+
+      // Prepend Styles section to each page so font lookups work
+      hocrRaw = pages.map((page) => stylesSection + page);
     } else if (format === 'abbyy') {
       hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
     } else if (format === 'stext') {

diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js
@@ -302,6 +302,8 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc
     res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
   } else if (format === 'abbyy') {
     res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n });
+  } else if (format === 'alto') {
+    res = await gs.convertPageAlto({ ocrStr: ocrRaw, n });
   } else if (format === 'textract') {
     // res = await gs.convertPageTextract({ ocrStr: ocrRaw, n });
   } else if (format === 'azure_doc_intel') {

diff --git a/js/worker/generalWorker.js b/js/worker/generalWorker.js
@@ -1,4 +1,5 @@
 import { convertPageAbbyy } from '../import/convertPageAbbyy.js';
+import { convertPageAlto } from '../import/convertPageAlto.js';
 import { convertPageBlocks } from '../import/convertPageBlocks.js';
 import { convertPageHocr } from '../import/convertPageHocr.js';
 import { convertPageStext } from '../import/convertPageStext.js';
@@ -400,6 +401,7 @@ const handleMessage = async (data) => {
   ({
     // Convert page functions
     convertPageAbbyy,
+    convertPageAlto,
     convertPageHocr,
     convertPageStext,
     convertDocTextract,

diff --git a/tests/module/importAbbyy.spec.js b/tests/module/importAbbyy.spec.js
@@ -56,6 +56,30 @@ describe('Check Abbyy XML import function.', function () {
   });
 }).timeout(120000);
 
+describe('Check that text orientation is handled correctly in Abbyy imports (simple layout).', function () {
+  this.timeout(10000);
+
+  // Expected `line.orientation` of the first word for each entry of `scribe.data.ocr.active`, in index order.
+  const expectedOrientations = [
+    0, 0, 0,
+    3, 3, 3,
+    2, 2, 2,
+    1, 1, 1,
+  ];
+
+  it('Lines printed at exactly 90/180/270 degrees have orientation detected correctly', async () => {
+    await scribe.importFiles([`${ASSETS_PATH_KARMA}/testocr_all_orientations.abbyy.xml`]);
+    expectedOrientations.forEach((expected, pageIndex) => {
+      const firstWord = scribe.data.ocr.active[pageIndex].lines[0].words[0];
+      assert.strictEqual(firstWord.line.orientation, expected);
+    });
+  }).timeout(10000);
+
+  after(async () => {
+    await scribe.terminate();
+  });
+}).timeout(120000);
+
 describe('Check that text orientation is handled correctly in Abbyy imports.', function () {
   this.timeout(10000);