Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build-deno-compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@ mkdir -p build
# Build for different platforms
echo "Building for Linux x64..."
deno compile --allow-sys --allow-read --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 cli/scribe.js
# deno compile --allow-sys --allow-read --allow-write --target x86_64-unknown-linux-gnu --output build/scribe-linux-x64 --include mupdf --include fonts --include js/worker cli/scribe.js

echo "Building for macOS x64..."
deno compile --allow-sys --allow-read --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 cli/scribe.js
# deno compile --allow-sys --allow-read --allow-write --target x86_64-apple-darwin --output build/scribe-macos-x64 --include mupdf --include fonts --include js/worker cli/scribe.js

echo "Building for Windows x64..."
deno compile --allow-sys --allow-read --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe cli/scribe.js
# deno compile --allow-sys --allow-read --allow-write --target x86_64-pc-windows-msvc --output build/scribe-windows-x64.exe --include mupdf --include fonts --include js/worker cli/scribe.js

# Create checksums
cd build
Expand Down
Binary file added fonts/Dingbats.woff
Binary file not shown.
3 changes: 2 additions & 1 deletion js/export/writeHtml.js
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,8 @@ export function writeHtml({
top += pageMetricsArr[g].dims.height + 10;
}

if (reflowText) {
// Do not overwrite paragraphs from Abbyy or Textract.
if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
const angle = pageMetricsArr[g].angle || 0;
assignParagraphs(pageObj, angle);
}
Expand Down
3 changes: 2 additions & 1 deletion js/export/writeText.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ export function writeText(ocrCurrent, minpage = 0, maxpage = -1, reflowText = fa

const pageObj = ocrCurrent[g];

if (reflowText) {
// Do not overwrite paragraphs from Abbyy or Textract.
if (reflowText && (!pageObj.textSource || !['textract', 'abbyy'].includes(pageObj.textSource))) {
const angle = pageMetricsArr[g].angle || 0;
assignParagraphs(pageObj, angle);
}
Expand Down
29 changes: 29 additions & 0 deletions js/fontContainerMain.js
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,35 @@ export async function loadChiSimFont() {
return chiReady;
}

let dingbatsReadyRes;
let dingbatsReady;

/**
 * Loads the Dingbats font into `FontCont.supp.dingbats`.
 *
 * The first call starts the load and caches a promise; subsequent calls (including
 * calls made while the load is still in flight) return that same promise.
 * If loading fails, the cached promise is rejected and cleared so that a later call
 * can retry, rather than leaving every caller waiting on a promise that never settles.
 *
 * @returns {Promise<void>} Resolves once the font has been loaded; rejects if the
 *    font file cannot be read or parsed.
 */
export async function loadDingbatsFont() {
  if (dingbatsReady) return dingbatsReady;

  console.log('Loading Dingbats font');

  let dingbatsReadyRej;
  dingbatsReady = new Promise((resolve, reject) => {
    dingbatsReadyRes = resolve;
    dingbatsReadyRej = reject;
  });
  // Keep a local handle: `dingbatsReady` is cleared on failure, but this call must
  // still return (and thereby propagate) the rejected promise.
  const pending = dingbatsReady;

  try {
    const fontUrl = new URL('../fonts/Dingbats.woff', import.meta.url);

    let /** @type {ArrayBuffer} */ dingbatsSrc;
    if (typeof process === 'undefined') {
      const res = await fetch(fontUrl);
      // `fetch` only rejects on network errors; an HTTP error page must not be parsed as a font.
      if (!res.ok) throw new Error(`Failed to fetch Dingbats font: ${res.status} ${res.statusText}`);
      dingbatsSrc = await res.arrayBuffer();
    } else {
      const { readFile } = await import('node:fs/promises');
      const fileBuf = await readFile(fontUrl);
      // `Buffer#buffer` is the whole backing ArrayBuffer, which may be larger than the file
      // when the Buffer is a view at a non-zero offset, so copy out only the file's bytes.
      dingbatsSrc = fileBuf.buffer.slice(fileBuf.byteOffset, fileBuf.byteOffset + fileBuf.byteLength);
    }

    FontCont.supp.dingbats = await loadFont('Dingbats', 'normal', 'sans', dingbatsSrc, false);

    dingbatsReadyRes();
  } catch (err) {
    // Allow a later call to retry, and wake any concurrent callers with the error.
    dingbatsReady = undefined;
    dingbatsReadyRej(err);
  }

  return pending;
}

/**
* Enable or disable font optimization settings.
* This function is used rather than exposing the settings using the `opt` object, as these settings exist on the font container in both the main thread and the worker threads.
Expand Down
7 changes: 7 additions & 0 deletions js/generalWorkerMain.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ export async function initGeneralWorker() {
obj.convertPageHocr = wrap('convertPageHocr');
obj.convertPageAbbyy = wrap('convertPageAbbyy');
obj.convertPageStext = wrap('convertPageStext');
obj.convertDocTextract = wrap('convertDocTextract');

obj.optimizeFont = wrap('optimizeFont');

Expand Down Expand Up @@ -174,6 +175,12 @@ export class gs {
*/
static convertPageAbbyy = async (args) => (await gs.schedulerInner.addJob('convertPageAbbyy', args));

/**
 * Submits a `convertDocTextract` job to `gs.schedulerInner` with the given arguments
 * and resolves with the value the job returns.
 * @param {Parameters<typeof import('./import/convertDocTextract.js').convertDocTextract>[0]} args
 * @returns {ReturnType<typeof import('./import/convertDocTextract.js').convertDocTextract>}
 */
static convertDocTextract = async (args) => (await gs.schedulerInner.addJob('convertDocTextract', args));

/**
* @param {Parameters<typeof import('./import/convertPageStext.js').convertPageStext>[0]} args
* @returns {ReturnType<typeof import('./import/convertPageStext.js').convertPageStext>}
Expand Down
51 changes: 51 additions & 0 deletions js/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ declare global {
sup: boolean;
dropcap: boolean;
};

// Strings representing supported sources of text.
// `stext` indicates the text was extracted directly from a PDF using mupdf.
type TextSource = null | 'tesseract' | 'textract' | 'abbyy' | 'stext' | 'hocr';

type FontState = {
enableOpt: boolean;
Expand Down Expand Up @@ -176,6 +180,53 @@ declare global {
type LayoutDataColumn = import("./objects/layoutObjects.js").LayoutDataColumn;
type LayoutRegion = import("./objects/layoutObjects.js").LayoutRegion;

/** A 2D point with numeric `x` and `y` coordinates. */
interface Point {
x: number;
y: number;
}

/**
 * A four-cornered shape described by one `Point` per corner.
 * NOTE(review): keys presumably stand for bottom-right, bottom-left, top-right and top-left — confirm against usage.
 */
interface Polygon {
br: Point;
bl: Point;
tr: Point;
tl: Point;
}

/**
 * Bounding box as it appears in Textract data, given as a size (`Width`, `Height`) and an origin (`Left`, `Top`).
 * NOTE(review): the capitalized field names presumably mirror the AWS Textract JSON response;
 * the units (pixels vs. ratio of page size) are not evident from this declaration — TODO confirm.
 */
interface TextractBoundingBox {
Width: number;
Height: number;
Left: number;
Top: number;
}

/**
 * A point as it appears in Textract data. Uses capitalized `X`/`Y` keys, unlike the internal `Point` type.
 * NOTE(review): coordinate units are not evident from this declaration — TODO confirm.
 */
interface TextractPoint {
X: number;
Y: number;
}

/**
 * Location information attached to a Textract block: a bounding box, a polygon given as a list of points,
 * and a rotation angle.
 * NOTE(review): the unit of `RotationAngle` (degrees vs. radians) is not evident here — TODO confirm.
 */
interface TextractGeometry {
BoundingBox: TextractBoundingBox;
Polygon: TextractPoint[];
RotationAngle: number;
}

/**
 * A typed link from one Textract block to other blocks, which are referenced by ID.
 * NOTE(review): `Ids` presumably correspond to `TextractBlock.Id` values — confirm against the importer.
 */
interface Relationship {
Type: string;
Ids: string[];
}

/**
 * A single block from Textract output. `BlockType` identifies what the block represents
 * (e.g. a word, line, page, or table element); `Page` and `Relationships` are optional.
 * NOTE(review): `Text`, `TextType` and `Confidence` are declared as required here, but may not be
 * present on every block type in real Textract output — verify before relying on them for non-text blocks.
 */
interface TextractBlock {
BlockType: "WORD" | "LINE" | "PAGE" | "KEY_VALUE_SET" | "CELL" | "MERGED_CELL" | "SELECTION_ELEMENT" | "TABLE";
Confidence: number;
Text: string;
TextType: "PRINTED" | "HANDWRITING";
Geometry: TextractGeometry;
Id: string;
Page?: number;
Relationships?: Relationship[];
}


}

export { };
Expand Down
Loading