Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions js/generalWorkerMain.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ export async function initGeneralWorker() {

obj.convertPageHocr = wrap('convertPageHocr');
obj.convertPageAbbyy = wrap('convertPageAbbyy');
obj.convertPageAlto = wrap('convertPageAlto');
obj.convertPageStext = wrap('convertPageStext');
obj.convertDocTextract = wrap('convertDocTextract');
obj.convertDocAzureDocIntel = wrap('convertDocAzureDocIntel');
Expand Down Expand Up @@ -184,6 +185,15 @@ export class gs {
return gs.schedulerInner.addJob('convertPageAbbyy', args);
};

/**
* @param {Parameters<typeof import('./import/convertPageAlto.js').convertPageAlto>[0]} args
* @returns {ReturnType<typeof import('./import/convertPageAlto.js').convertPageAlto>}
*/
static convertPageAlto = async (args) => {
await gs.getGeneralScheduler();
return gs.schedulerInner.addJob('convertPageAlto', args);
};

/**
* @param {Parameters<typeof import('./import/convertDocTextract.js').convertDocTextract>[0]} args
* @returns {ReturnType<typeof import('./import/convertDocTextract.js').convertDocTextract>}
Expand Down
2 changes: 1 addition & 1 deletion js/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ declare global {

// Strings representing supported sources of text.
// `stext` indicates the text was extracted directly from a PDF using mupdf.
type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';
type TextSource = null | 'tesseract' | 'textract' | 'google_vision' | 'abbyy' | 'alto' | 'stext' | 'hocr' | 'text' | 'azure_doc_intel';

type FontState = {
enableOpt: boolean;
Expand Down
180 changes: 180 additions & 0 deletions js/import/convertPageAlto.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
import ocr from '../objects/ocrObjects.js';

import {
unescapeXml,
} from '../utils/miscUtils.js';

import { LayoutDataTablePage } from '../objects/layoutObjects.js';
import { pass2, pass3 } from './convertPageShared.js';

// This import is a WIP and does not produce a text layer that closely overlays the source document.
// While the result can likely be improved, this is caused in part by limitations in the ALTO format itself.
// The ALTO format only includes the most basic positioning information (word-level bounding boxes).

const debugMode = false;

/**
 * Convert a single page of ALTO XML into an `OcrPage`.
 *
 * Parsing is regex-based rather than using a real XML parser. Only `<String>` elements
 * found inside `<TextLine>` elements that are themselves inside `<TextBlock>` elements are imported.
 * Line baselines are always `[0, 0]`, and ascender/x-height are rough estimates derived from the line height.
 *
 * @param {Object} params
 * @param {string} params.ocrStr - ALTO XML containing one `<Page>` element.
 *    Any `<TextStyle>` definitions present in this string are used for font lookups.
 * @param {number} params.n - Page index passed to `OcrPage` and `LayoutDataTablePage`; `n + 1` is used in word IDs.
 * @returns An object with `pageObj` (the populated page), `dataTables` (an empty `LayoutDataTablePage`),
 *    `warn`, and `langSet` (the value returned by `pass3`).
 * @throws {Error} If no `<Page>` element is found, or its HEIGHT/WIDTH attributes cannot be parsed.
 */
export async function convertPageAlto({ ocrStr, n }) {
  /** @type {Map<string, RegExp>} */
  const attrRegexCache = new Map();

  /**
   * Extract attribute value from an XML element string.
   * The attribute name must be preceded by whitespace, so `CONTENT` does not match inside `SUBS_CONTENT`.
   * The value ends at the same quote character that opened it, so `"don't"` is returned intact.
   * @param {string} elemStr
   * @param {string} attrName
   * @returns {?string} The attribute value, or `null` if the attribute is missing or empty.
   */
  function getAttr(elemStr, attrName) {
    let regex = attrRegexCache.get(attrName);
    if (!regex) {
      regex = new RegExp(`(?:^|\\s)${attrName}\\s*=\\s*(["'])([\\s\\S]*?)\\1`, 'i');
      attrRegexCache.set(attrName, regex);
    }
    return elemStr.match(regex)?.[2] || null;
  }

  /**
   * Read an attribute as an integer. Decimal values are truncated.
   * @param {string} elemStr
   * @param {string} attrName
   * @returns {number} The parsed integer, or `NaN` if the attribute is missing or not numeric.
   */
  function getIntAttr(elemStr, attrName) {
    const raw = getAttr(elemStr, attrName);
    return raw === null ? NaN : Number.parseInt(raw, 10);
  }

  /**
   * Build a bounding box from the HPOS/VPOS/WIDTH/HEIGHT attributes of an element.
   * @param {string} elemStr
   * @returns {?{left: number, top: number, right: number, bottom: number}}
   *    `null` if any of the four attributes is missing or not numeric.
   */
  function parseBox(elemStr) {
    const hpos = getIntAttr(elemStr, 'HPOS');
    const vpos = getIntAttr(elemStr, 'VPOS');
    const width = getIntAttr(elemStr, 'WIDTH');
    const height = getIntAttr(elemStr, 'HEIGHT');

    if (![hpos, vpos, width, height].every((x) => Number.isFinite(x))) return null;

    return {
      left: hpos,
      top: vpos,
      right: hpos + width,
      bottom: vpos + height,
    };
  }

  const pageTag = ocrStr.match(/<Page[^>]+>/i)?.[0];
  if (!pageTag) throw new Error('Failed to parse ALTO page element.');

  // Dimensions may be written as decimals (e.g. "800.0"); they are truncated to integers.
  const pageDims = { height: getIntAttr(pageTag, 'HEIGHT'), width: getIntAttr(pageTag, 'WIDTH') };
  const isValidDim = (/** @type {number} */ x) => Number.isFinite(x) && x >= 0;
  if (!isValidDim(pageDims.height) || !isValidDim(pageDims.width)) throw new Error('Failed to parse page dimensions.');

  const pageObj = new ocr.OcrPage(n, pageDims);

  // Index all `<TextStyle>` definitions once, rather than re-scanning the document for every word.
  // If the same ID is defined more than once, the first definition wins.
  /** @type {Map<string, {font: ?string, size: ?number}>} */
  const textStyles = new Map();
  for (const [styleTag] of ocrStr.matchAll(/<TextStyle\s[^>]*>/gi)) {
    const id = getAttr(styleTag, 'ID');
    if (!id || textStyles.has(id)) continue;
    const size = getIntAttr(styleTag, 'FONTSIZE');
    textStyles.set(id, {
      font: getAttr(styleTag, 'FONTFAMILY'),
      size: Number.isFinite(size) ? size : null,
    });
  }

  /**
   * Resolve a STYLEREFS value to a text style.
   * STYLEREFS may list several whitespace-separated IDs (e.g. a paragraph style and a text style);
   * the first ID that names a `<TextStyle>` is used.
   * @param {string} styleRefs
   */
  function findTextStyle(styleRefs) {
    for (const ref of styleRefs.trim().split(/\s+/)) {
      const style = textStyles.get(ref);
      if (style) return style;
    }
    return null;
  }

  const textLineRegex = /<TextLine[^>]*>([\s\S]*?)<\/TextLine>/gi;

  // Quoted attribute values are skipped as a unit so that a literal ">" inside CONTENT does not end the tag early.
  const stringTagRegex = /<String\s+(?:[^>"']|"[^"]*"|'[^']*')*>/gi;

  /**
   * Convert one `<TextLine>` element and append it to `pageObj` if it contains at least one word.
   * @param {string} match - The TextLine element match
   * @param {?string} blockStyleRefs - STYLEREFS from parent TextBlock
   */
  function convertLine(match, blockStyleRefs = null) {
    const textLineTag = match.match(/<TextLine[^>]+>/i)?.[0];
    if (!textLineTag) return;

    const linebox = parseBox(textLineTag);
    if (!linebox) {
      console.warn('Missing required positional attributes in ALTO TextLine element, skipping line.');
      return;
    }

    const baseline = [0, 0];

    // Height used as rough estimate for ascender height
    const height = linebox.bottom - linebox.top;
    const lineAscHeightFinal = height * 0.75;
    const lineXHeightFinal = height * 0.5;

    const lineObj = new ocr.OcrLine(pageObj, linebox, baseline, lineAscHeightFinal, lineXHeightFinal);

    if (debugMode) lineObj.raw = match;

    for (const [stringTag] of match.matchAll(stringTagRegex)) {
      const content = getAttr(stringTag, 'CONTENT');
      if (!content) continue;

      const text = unescapeXml(content);

      const wordBox = parseBox(stringTag);
      if (!wordBox) {
        console.warn('Missing required positional attributes in ALTO String element, skipping element.');
        continue;
      }

      const wordID = `word_${n + 1}_${pageObj.lines.length + 1}_${lineObj.words.length + 1}`;
      const wordObj = new ocr.OcrWord(lineObj, wordID, text, wordBox);

      // WC is multiplied by 100 and rounded before being stored as the confidence.
      const wcStr = getAttr(stringTag, 'WC');
      if (wcStr) {
        const wc = Number.parseFloat(wcStr);
        if (Number.isFinite(wc)) wordObj.conf = Math.round(wc * 100);
      }

      // Parse style attributes
      const styleAttr = getAttr(stringTag, 'STYLE');
      if (styleAttr) {
        if (/bold/i.test(styleAttr)) wordObj.style.bold = true;
        if (/italic/i.test(styleAttr)) wordObj.style.italic = true;
        if (/underline/i.test(styleAttr)) wordObj.style.underline = true;
        if (/superscript/i.test(styleAttr)) wordObj.style.sup = true;
        if (/smallcaps/i.test(styleAttr)) wordObj.style.smallCaps = true;
      }

      // Parse STYLEREFS to get font information
      // Use String's STYLEREFS first, fall back to TextBlock's STYLEREFS
      const styleRefs = getAttr(stringTag, 'STYLEREFS') || blockStyleRefs;
      const textStyle = styleRefs ? findTextStyle(styleRefs) : null;
      if (textStyle) {
        if (textStyle.font) wordObj.style.font = textStyle.font;
        if (textStyle.size) wordObj.style.size = textStyle.size;
      }

      if (debugMode) wordObj.raw = stringTag;

      lineObj.words.push(wordObj);
    }

    // Lines without any importable words are dropped.
    if (lineObj.words.length > 0) {
      pageObj.lines.push(lineObj);
    }
  }

  const textBlockRegex = /<TextBlock[^>]*>([\s\S]*?)<\/TextBlock>/gi;

  for (const blockMatch of ocrStr.matchAll(textBlockRegex)) {
    const blockTag = blockMatch[0].match(/<TextBlock[^>]+>/i)?.[0];
    const blockStyleRefs = blockTag ? getAttr(blockTag, 'STYLEREFS') : null;
    const blockContent = blockMatch[1];

    for (const lineMatch of blockContent.matchAll(textLineRegex)) {
      convertLine(lineMatch[0], blockStyleRefs);
    }
  }

  const warn = { char: 'char_warning' };

  pass2(pageObj, 0);
  const langSet = pass3(pageObj);

  const dataTablePage = new LayoutDataTablePage(n);

  return {
    pageObj, dataTables: dataTablePage, warn, langSet,
  };
}
4 changes: 2 additions & 2 deletions js/import/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ export async function importFiles(files) {

const ocrData = await importOCRFiles(Array.from(ocrFiles));

format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
format = /** @type {("hocr" | "abbyy" | "alto" | "stext" | "textract" | "text")} */ (ocrData.format);

// The text import function requires built-in fonts to be loaded.
if (format === 'text') {
Expand All @@ -415,7 +415,7 @@ export async function importFiles(files) {
ocrAllRaw.active = ocrAllRaw.active.slice(0, pageCountImage);
}

format = /** @type {("hocr" | "abbyy" | "stext" | "textract" | "text")} */ (ocrData.format);
format = /** @type {("hocr" | "abbyy" | "alto" | "stext" | "textract" | "text")} */ (ocrData.format);
reimportHocrMode = ocrData.reimportHocrMode;

if (ocrData.reimportHocrMode) {
Expand Down
15 changes: 15 additions & 0 deletions js/import/importOCR.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ const detectOcrFormat = (ocrStr, ext) => {
}
}

// Check whether input is ALTO XML
if (/<alto[\s>]/i.test(ocrStr) && /xmlns="http:\/\/www\.loc\.gov\/standards\/alto/i.test(ocrStr)) {
return 'alto';
}

// Check whether input is Abbyy XML
// TODO: The auto-detection of formats needs to be more robust.
// At present, any string that contains ">" and "abbyy" is considered Abbyy XML.
Expand Down Expand Up @@ -114,6 +119,16 @@ export async function importOCRFiles(ocrFilesAll) {
}
} else if (format === 'azure_doc_intel') {
hocrRaw = [hocrStrAll];
} else if (format === 'alto') {
// Extract the Styles section to prepend to each page
const stylesMatch = hocrStrAll.match(/<Styles>[\s\S]*?<\/Styles>/i);
const stylesSection = stylesMatch ? stylesMatch[0] : '';

// Split by Page elements
const pages = hocrStrAll.split(/(?=<Page\s)/).slice(1);

// Prepend Styles section to each page so font lookups work
hocrRaw = pages.map((page) => stylesSection + page);
} else if (format === 'abbyy') {
hocrRaw = hocrStrAll.split(/(?=<page)/).slice(1);
} else if (format === 'stext') {
Expand Down
2 changes: 2 additions & 0 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc
res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
} else if (format === 'abbyy') {
res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n });
} else if (format === 'alto') {
res = await gs.convertPageAlto({ ocrStr: ocrRaw, n });
} else if (format === 'textract') {
// res = await gs.convertPageTextract({ ocrStr: ocrRaw, n });
} else if (format === 'azure_doc_intel') {
Expand Down
2 changes: 2 additions & 0 deletions js/worker/generalWorker.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { convertPageAbbyy } from '../import/convertPageAbbyy.js';
import { convertPageAlto } from '../import/convertPageAlto.js';
import { convertPageBlocks } from '../import/convertPageBlocks.js';
import { convertPageHocr } from '../import/convertPageHocr.js';
import { convertPageStext } from '../import/convertPageStext.js';
Expand Down Expand Up @@ -400,6 +401,7 @@ const handleMessage = async (data) => {
({
// Convert page functions
convertPageAbbyy,
convertPageAlto,
convertPageHocr,
convertPageStext,
convertDocTextract,
Expand Down
24 changes: 24 additions & 0 deletions tests/module/importAbbyy.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,30 @@ describe('Check Abbyy XML import function.', function () {
});
}).timeout(120000);

describe('Check that text orientation is handled correctly in Abbyy imports (simple layout).', function () {
  this.timeout(10000);

  it('Lines printed at exactly 90/180/270 degrees have orientation detected correctly', async () => {
    await scribe.importFiles([`${ASSETS_PATH_KARMA}/testocr_all_orientations.abbyy.xml`]);

    // Expected `line.orientation` for the first word of the first line on pages 0-11, in page order.
    const expectedByPage = [0, 0, 0, 3, 3, 3, 2, 2, 2, 1, 1, 1];
    for (let pageIndex = 0; pageIndex < expectedByPage.length; pageIndex++) {
      const firstWord = scribe.data.ocr.active[pageIndex].lines[0].words[0];
      assert.strictEqual(firstWord.line.orientation, expectedByPage[pageIndex]);
    }
  }).timeout(10000);

  after(async () => {
    await scribe.terminate();
  });
}).timeout(120000);

describe('Check that text orientation is handled correctly in Abbyy imports.', function () {
this.timeout(10000);

Expand Down
Loading