Skip to content

Commit c3b42cf

Browse files
authored
Merge pull request #85 from RasmusAChr/Version-1.3.0
Version 1.3.0
2 parents a3d519f + 4ae8f24 commit c3b42cf

File tree

10 files changed

+646
-527
lines changed

10 files changed

+646
-527
lines changed

PdfProcessor.ts

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
import { App, Editor, Notice, normalizePath, FileManager } from 'obsidian';
2+
import { PluginSettings } from './settings';
3+
import { extractHeader, getAttachmentFolderPath, insertImageLink, imageSeparator } from './utils';
4+
5+
/** Outcome of rendering a single PDF page, consumed in page order to build the note content. */
interface ProcessedPage {
    /** Vault path of the image file written for the page. */
    imagePath: string;
    /** File name of the image (`page_<n>.<type>`). */
    imageName: string;
    /** Header text extracted from the page, or '' when header extraction is disabled. */
    rawHeader: string;
    /** Page width at scale 1.0, rounded; used as display width for down-scaled images. */
    displayWidth: number;
}

/**
 * Converts every page of a PDF into an image stored in the vault and inserts
 * links to those images (optionally preceded by an extracted header) into the editor.
 */
export class PdfProcessor {
    constructor(
        private app: App, // Obsidian App instance
        private pdfjsLib: any, // PDF.js library instance (untyped: supplied by the caller at runtime)
        private settings: PluginSettings, // Plugin settings
        private fileManager: FileManager // File manager instance
    ) {}

    /**
     * Renders all pages of `file` to images, saves them in a new, uniquely named
     * attachment sub-folder and inserts the image links at the editor cursor.
     *
     * Pages are rendered in chunks of at most `settings.maxConcurrentPages`; links are
     * always emitted in page order. Errors are logged and reported through a Notice
     * rather than thrown.
     *
     * @param editor Editor that receives the image links.
     * @param file PDF file to convert.
     * @param imageQuality Scale passed to the PDF.js viewport; when null/undefined,
     *   `settings.imageResolution` is used instead.
     */
    async process(editor: Editor, file: File, imageQuality: number): Promise<void> {
        let progressNotice: Notice | null = null;
        // PDF.js document proxy; kept outside the try block so it can be destroyed in `finally`.
        let pdf: any = null;

        try {
            // PDF.js requires the document data as a typed array.
            const typedArray = new Uint8Array(await file.arrayBuffer());
            pdf = await this.pdfjsLib.getDocument({ data: typedArray }).promise;
            const totalPages: number = pdf.numPages;
            const initialCursor = { ...editor.getCursor() }; // Where batch insertion happens
            let insertPosition = { ...initialCursor }; // Advances with every procedural insertion

            // --- 1. Setup folder structure ---
            const folderPath = await this.createUniqueFolder(file.name);

            // --- 2. Processing setup ---
            const imageLinks: string[] = []; // Links collected for batch insertion
            const qualityToUse = imageQuality ?? this.settings.imageResolution;

            // Never more workers than pages, and never fewer than one: a limit of 0, a negative
            // value or NaN would keep the chunk loop below from ever advancing.
            const requestedLimit = Math.floor(Number(this.settings.maxConcurrentPages));
            const concurrencyLimit = Math.max(
                1,
                Math.min(totalPages, Number.isFinite(requestedLimit) ? requestedLimit : 1)
            );

            let completedPages = 0;
            let lastExtractedHeader: string | null = null; // For duplicate header checking

            // Timeout 0 keeps the notice visible until it is hidden explicitly.
            progressNotice = new Notice(`Processing PDF: ${completedPages}/${totalPages} pages`, 0);

            // --- 3. Per-page work plus progress reporting ---
            const processSinglePage = async (pageNum: number): Promise<ProcessedPage> => {
                const processed = await this.renderPage(pdf, pageNum, qualityToUse, folderPath);
                completedPages++;
                progressNotice?.setMessage(`Processing PDF: ${completedPages}/${totalPages} pages`);
                return processed;
            };

            // --- 4. Process pages in chunks ---
            for (let firstPage = 1; firstPage <= totalPages; firstPage += concurrencyLimit) {
                const lastPage = Math.min(firstPage + concurrencyLimit - 1, totalPages);
                const chunkPromises: Promise<ProcessedPage>[] = [];
                for (let pageNum = firstPage; pageNum <= lastPage; pageNum++) {
                    chunkPromises.push(processSinglePage(pageNum));
                }

                // Promise.all keeps results in page order regardless of completion order.
                const chunkResults = await Promise.all(chunkPromises);

                // --- 5. Sequential, ordered post-processing ---
                for (const result of chunkResults) {
                    let finalHeader = result.rawHeader;

                    // Duplicate detection has to run in page order, so it happens here and not
                    // inside the concurrent page rendering.
                    if (this.settings.enableHeaders && this.settings.removeHeaderDuplicates) {
                        if (finalHeader === lastExtractedHeader) {
                            finalHeader = ''; // Same header as the previous page: drop it
                        } else {
                            lastExtractedHeader = finalHeader;
                        }
                    }

                    const imageLink = this.buildImageLink(result, finalHeader, qualityToUse);

                    if (this.settings.insertionMethod === 'Procedural') {
                        insertPosition = insertImageLink(editor, insertPosition, imageLink, this.settings.imageSeparator);
                    } else {
                        imageLinks.push(imageLink); // Held back until all pages are processed
                    }
                }
            }

            // --- 6. Batch insert ---
            if (this.settings.insertionMethod === 'Batch') {
                const separator = imageSeparator(this.settings.imageSeparator);
                const scrollInfo = editor.getScrollInfo();
                editor.replaceRange(imageLinks.join(separator), initialCursor);
                // Restore the scroll position captured before the insertion.
                editor.scrollTo(scrollInfo.left, scrollInfo.top);
            }

            new Notice('PDF processing complete');
        } catch (error) {
            console.error(error);
            new Notice('Failed to process PDF: ' + (error instanceof Error ? error.message : 'Unknown error'));
        } finally {
            if (progressNotice) {
                progressNotice.hide();
            }
            if (pdf) {
                // Best effort: release the document; a failure here must not mask the outcome above.
                try {
                    await pdf.destroy();
                } catch (destroyError) {
                    console.error(destroyError);
                }
            }
        }
    }

    /**
     * Creates a new folder for the PDF's images inside the attachment folder and returns its path.
     * When a folder with the PDF's name already exists, `_1`, `_2`, ... is appended until the
     * path is unused.
     */
    private async createUniqueFolder(fileName: string): Promise<string> {
        // Strip only a trailing ".pdf" (any letter case), then drop '#' to avoid issues with folder names.
        const baseName = fileName.replace(/\.pdf$/i, '').replace(/#/g, '');
        const attachmentRoot = await getAttachmentFolderPath(this.fileManager);

        let folderPath = normalizePath(`${attachmentRoot}/${baseName}`);
        for (let suffix = 1; await this.app.vault.adapter.exists(folderPath); suffix++) {
            folderPath = normalizePath(`${attachmentRoot}/${baseName}_${suffix}`);
        }

        await this.app.vault.createFolder(folderPath);
        return folderPath;
    }

    /**
     * Renders one page to an image file in `folderPath` and, when enabled, extracts its header.
     *
     * @throws Error when no 2D canvas context is available or the canvas cannot be encoded.
     */
    private async renderPage(pdf: any, pageNum: number, scale: number, folderPath: string): Promise<ProcessedPage> {
        const page = await pdf.getPage(pageNum);
        const canvas = document.createElement('canvas');

        try {
            const viewport = page.getViewport({ scale });
            // Width at scale 1.0, so down-scaled images can still be displayed at the original page width.
            const displayWidth = Math.round(page.getViewport({ scale: 1.0 }).width);

            const context = canvas.getContext('2d');
            if (!context) throw new Error('Failed to get canvas context');

            canvas.height = viewport.height;
            canvas.width = viewport.width;

            await page.render({ canvasContext: context, viewport }).promise;

            // Encode the canvas as an image blob; the 0.9 quality only applies to lossy formats.
            const blob = await new Promise<Blob>((resolve, reject) => {
                canvas.toBlob(
                    encoded => (encoded ? resolve(encoded) : reject(new Error('Image blob failed'))),
                    `image/${this.settings.imageType}`,
                    0.9
                );
            });

            // The bitmap is no longer needed once encoded; drop it before the file I/O below.
            canvas.width = 0;
            canvas.height = 0;

            const imageName = `page_${pageNum}.${this.settings.imageType}`;
            const imagePath = `${folderPath}/${imageName}`;
            await this.app.vault.createBinary(imagePath, await blob.arrayBuffer());

            const rawHeader = this.settings.enableHeaders
                ? await extractHeader(page, this.settings.headerExtractionSensitive)
                : '';

            return { imagePath, imageName, rawHeader, displayWidth };
        } finally {
            // Runs on failure as well, and only after the header has been read from the page.
            canvas.width = 0;
            canvas.height = 0;
            page.cleanup();
        }
    }

    /**
     * Builds the markdown for one page: an optional header line followed by the image embed.
     * For scales below 1.0 the embed carries the original page width as display width.
     */
    private buildImageLink(page: ProcessedPage, header: string, scale: number): string {
        const headerLine = header ? `${this.settings.headerSize} ${header}\n` : '';
        const altText = scale < 1.0 ? `${page.imageName}|${page.displayWidth}` : page.imageName;
        return `${headerLine}![${altText}](${encodeURI(page.imagePath)})`;
    }
}

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Convert PDF pages into images and insert them directly into your Obsidian notes. Optionally, extract headers from each page and add separator lines or empty lines below each image for quick and easy-to-annotate notes.
44

55
## Demo
6-
![demo](https://github.com/RasmusAChr/PDF2Images/blob/master/resources/demo.gif?raw=true)
6+
![demo](https://github.com/RasmusAChr/PDF2Images/blob/master/resources/demo2.gif?raw=true)
77

88
## How to use
99

@@ -16,12 +16,16 @@ Convert PDF pages into images and insert them directly into your Obsidian notes.
1616

1717
## Settings
1818
- **Image Quality**: Adjust the image quality to suit your needs. The default setting is 1x, but you can reduce it to as low as 0.5x for smaller file sizes and improved performance, or increase it up to 2x for the highest image clarity.
19-
19+
- **Image Type**: Choose the format for the inserted images. Options include:
20+
- WEBP: Modern format that balances quality and file size effectively.
21+
- JPEG: Lossy format, suitable for smaller file sizes.
22+
- PNG: Lossless format, best for high-quality images.
23+
2024
- **Image Insertion Method**: Choose between two different methods for inserting images:
2125
- Procedural: Images are generated and inserted one at a time.
2226
- Batch: All images are generated first, then inserted simultaneously for a more streamlined process.
2327

24-
- **Image seperator**: You can customize how images are separated: choose an empty line, a separator line, or no separation at all.
28+
- **Image separator**: You can customize how images are separated: choose an empty line, a separator line, or no separation at all.
2529

2630
- **Insert headers**: Toggle the option on to generate headers for each image based on pdf page analysis. This feature is in early development, so its effectiveness may vary depending on the PDF layout. I’m actively working to improve the algorithm to better detect different header types.
2731

@@ -33,6 +37,7 @@ Convert PDF pages into images and insert them directly into your Obsidian notes.
3337
- Increase this value if headers are not being detected correctly.
3438
- Lower the value if non-headers are mistakenly being detected as headers.
3539

40+
- **Max Concurrent Pages**: Set the maximum number of pages to process at the same time. Higher values can speed up the conversion of large PDFs but also increase memory usage.
3641

3742
## Support
3843
This plugin is free for everyone. If you'd like to show your appreciation or support further development, feel free to send a contribution my way:<br>

0 commit comments

Comments
 (0)