From 8c37eb3e8debeaa54ea64b89c53789c0397a2477 Mon Sep 17 00:00:00 2001 From: Aspose-Tester Date: Fri, 8 May 2026 14:45:18 +0500 Subject: [PATCH] Add 82 example(s) for facades-extract-images-and-text --- facades-extract-images-and-text/agents.md | 145 ++ .../async-pdf-extraction.cs | 127 ++ ...batch-extract-text-from-pdfs-azure-blob.cs | 130 ++ .../batch-extract-text-from-pdfs.cs | 48 + .../batch-extract-text-from-pdfs__v2.cs | 43 + .../check-pdf-contains-text.cs | 39 + .../check-pdf-for-text-and-images.cs | 61 + ...check-pdf-text-only-by-detecting-images.cs | 39 + .../configurable-pdf-extraction.cs | 126 ++ .../count-pages-images-attachments-pdf.cs | 57 + .../create-contact-sheet-pdf.cs | 109 + .../extract-all-images-from-pdf.cs | 48 + .../extract-first-three-pages-summary-pdf.cs | 58 + .../extract-images-by-keyword.cs | 79 + .../extract-images-create-pdfa2b.cs | 60 + .../extract-images-create-portfolio-pdf.cs | 73 + .../extract-images-create-sprite-sheet.cs | 118 ++ .../extract-images-from-encrypted-pdf.cs | 46 + .../extract-images-from-first-pdf-page.cs | 58 + ...xtract-images-from-pdf-and-compress-png.cs | 66 + .../extract-images-from-pdf-csv.cs | 75 + .../extract-images-from-pdf-pages-png.cs | 49 + .../extract-images-from-pdf-to-gcs.cs | 88 + .../extract-images-from-pdf-to-zip.cs | 52 + ...ract-images-from-pdf-using-pdfextractor.cs | 50 + ...act-images-from-pdf-with-guid-filenames.cs | 46 + .../extract-images-from-pdf.cs | 52 + .../extract-images-from-specific-pdf-page.cs | 53 + .../extract-images-markdown-gallery.cs | 58 + .../extract-images-ocr-openai.cs | 92 + .../extract-images-original-format.cs | 52 + .../extract-images-pdf-csv-manifest.cs | 65 + .../extract-images-pdf-to-base64-json.cs | 74 + .../extract-images-pdf-to-word.cs | 307 +++ .../extract-images-pdf-upload-s3.cs | 115 + .../extract-images-to-html-report.cs | 76 + .../extract-images-to-tiff.cs | 57 + .../extract-images-to-unc-share.cs | 64 + .../extract-images-to-unc-share__v2.cs | 53 + 
.../extract-images-validate-signature.cs | 116 ++ .../extract-pdf-attachments-by-extension.cs | 80 + ...xtract-pdf-attachments-rename-timestamp.cs | 73 + .../extract-pdf-attachments-sha256.cs | 79 + .../extract-pdf-attachments.cs | 59 + .../extract-pdf-images-add-watermark.cs | 93 + .../extract-pdf-images-as-bmp.cs | 47 + .../extract-pdf-images-base64.cs | 103 + ...tract-pdf-images-create-video-slideshow.cs | 104 + .../extract-pdf-images-html-gallery.cs | 75 + .../extract-pdf-images-to-azure-blob.cs | 115 + .../extract-pdf-images-to-jpeg-quality-85.cs | 56 + .../extract-pdf-images-to-s3-with-sse.cs | 123 ++ .../extract-pdf-pages-as-png-thumbnails.cs | 43 + .../extract-pdf-text-azure-function.cs | 175 ++ .../extract-pdf-text-page-by-page-progress.cs | 58 + .../extract-pdf-text-page-by-page.cs | 48 + .../extract-pdf-text-to-azure-table.cs | 119 ++ .../extract-pdf-text-to-gzip.cs | 40 + .../extract-pdf-text-to-postgres.cs | 118 ++ .../extract-pdf-text-to-stringbuilder.cs | 55 + .../extract-pdf-text-to-stringwriter.cs | 49 + .../extract-pdf-text-to-temp-file-verify.cs | 56 + .../extract-pdf-text-to-utf8-file.cs | 42 + .../extract-pdf-text-with-cancellation.cs | 88 + .../extract-resource-defined-images.cs | 46 + .../extract-text-and-images-from-pdf.cs | 60 + ...xtract-text-from-password-protected-pdf.cs | 30 + .../extract-text-from-pdf-and-encrypt-aes.cs | 94 + .../extract-text-from-pdf-byte-array.cs | 49 + .../extract-text-from-pdf-bytes.cs | 97 + ...tract-text-from-pdf-pages-with-progress.cs | 51 + .../extract-text-from-pdf-to-utf8-txt.cs | 41 + ...xtract-text-from-pdf-using-pdfextractor.cs | 49 + .../extract-text-from-pdf.cs | 47 + ...tract-text-from-pdfs-using-pdfextractor.cs | 100 + .../extract-text-only-from-pdf.cs | 64 + ...tract-text-to-memorystream-generate-pdf.cs | 69 + .../handle-errors-binding-corrupted-pdf.cs | 54 + facades-extract-images-and-text/index.json | 1854 +++++++++++++++++ .../list-embedded-attachment-names.cs | 30 + 
...extraction-of-images-from-multiple-pdfs.cs | 59 + ...pdf-extractor-text-extraction-unit-test.cs | 103 + .../retry-pdf-text-extraction.cs | 58 + .../verify-image-extraction-mode-count.cs | 161 ++ 84 files changed, 8138 insertions(+) create mode 100644 facades-extract-images-and-text/agents.md create mode 100644 facades-extract-images-and-text/async-pdf-extraction.cs create mode 100644 facades-extract-images-and-text/batch-extract-text-from-pdfs-azure-blob.cs create mode 100644 facades-extract-images-and-text/batch-extract-text-from-pdfs.cs create mode 100644 facades-extract-images-and-text/batch-extract-text-from-pdfs__v2.cs create mode 100644 facades-extract-images-and-text/check-pdf-contains-text.cs create mode 100644 facades-extract-images-and-text/check-pdf-for-text-and-images.cs create mode 100644 facades-extract-images-and-text/check-pdf-text-only-by-detecting-images.cs create mode 100644 facades-extract-images-and-text/configurable-pdf-extraction.cs create mode 100644 facades-extract-images-and-text/count-pages-images-attachments-pdf.cs create mode 100644 facades-extract-images-and-text/create-contact-sheet-pdf.cs create mode 100644 facades-extract-images-and-text/extract-all-images-from-pdf.cs create mode 100644 facades-extract-images-and-text/extract-first-three-pages-summary-pdf.cs create mode 100644 facades-extract-images-and-text/extract-images-by-keyword.cs create mode 100644 facades-extract-images-and-text/extract-images-create-pdfa2b.cs create mode 100644 facades-extract-images-and-text/extract-images-create-portfolio-pdf.cs create mode 100644 facades-extract-images-and-text/extract-images-create-sprite-sheet.cs create mode 100644 facades-extract-images-and-text/extract-images-from-encrypted-pdf.cs create mode 100644 facades-extract-images-and-text/extract-images-from-first-pdf-page.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-and-compress-png.cs create mode 100644 
facades-extract-images-and-text/extract-images-from-pdf-csv.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-pages-png.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-to-gcs.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-to-zip.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-using-pdfextractor.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf-with-guid-filenames.cs create mode 100644 facades-extract-images-and-text/extract-images-from-pdf.cs create mode 100644 facades-extract-images-and-text/extract-images-from-specific-pdf-page.cs create mode 100644 facades-extract-images-and-text/extract-images-markdown-gallery.cs create mode 100644 facades-extract-images-and-text/extract-images-ocr-openai.cs create mode 100644 facades-extract-images-and-text/extract-images-original-format.cs create mode 100644 facades-extract-images-and-text/extract-images-pdf-csv-manifest.cs create mode 100644 facades-extract-images-and-text/extract-images-pdf-to-base64-json.cs create mode 100644 facades-extract-images-and-text/extract-images-pdf-to-word.cs create mode 100644 facades-extract-images-and-text/extract-images-pdf-upload-s3.cs create mode 100644 facades-extract-images-and-text/extract-images-to-html-report.cs create mode 100644 facades-extract-images-and-text/extract-images-to-tiff.cs create mode 100644 facades-extract-images-and-text/extract-images-to-unc-share.cs create mode 100644 facades-extract-images-and-text/extract-images-to-unc-share__v2.cs create mode 100644 facades-extract-images-and-text/extract-images-validate-signature.cs create mode 100644 facades-extract-images-and-text/extract-pdf-attachments-by-extension.cs create mode 100644 facades-extract-images-and-text/extract-pdf-attachments-rename-timestamp.cs create mode 100644 facades-extract-images-and-text/extract-pdf-attachments-sha256.cs create mode 100644 
facades-extract-images-and-text/extract-pdf-attachments.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-add-watermark.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-as-bmp.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-base64.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-create-video-slideshow.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-html-gallery.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-to-azure-blob.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-to-jpeg-quality-85.cs create mode 100644 facades-extract-images-and-text/extract-pdf-images-to-s3-with-sse.cs create mode 100644 facades-extract-images-and-text/extract-pdf-pages-as-png-thumbnails.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-azure-function.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-page-by-page-progress.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-page-by-page.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-azure-table.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-gzip.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-postgres.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-stringbuilder.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-stringwriter.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-temp-file-verify.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-to-utf8-file.cs create mode 100644 facades-extract-images-and-text/extract-pdf-text-with-cancellation.cs create mode 100644 facades-extract-images-and-text/extract-resource-defined-images.cs create mode 100644 facades-extract-images-and-text/extract-text-and-images-from-pdf.cs create mode 100644 
facades-extract-images-and-text/extract-text-from-password-protected-pdf.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-and-encrypt-aes.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-byte-array.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-bytes.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-pages-with-progress.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-to-utf8-txt.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf-using-pdfextractor.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdf.cs create mode 100644 facades-extract-images-and-text/extract-text-from-pdfs-using-pdfextractor.cs create mode 100644 facades-extract-images-and-text/extract-text-only-from-pdf.cs create mode 100644 facades-extract-images-and-text/extract-text-to-memorystream-generate-pdf.cs create mode 100644 facades-extract-images-and-text/handle-errors-binding-corrupted-pdf.cs create mode 100644 facades-extract-images-and-text/index.json create mode 100644 facades-extract-images-and-text/list-embedded-attachment-names.cs create mode 100644 facades-extract-images-and-text/parallel-extraction-of-images-from-multiple-pdfs.cs create mode 100644 facades-extract-images-and-text/pdf-extractor-text-extraction-unit-test.cs create mode 100644 facades-extract-images-and-text/retry-pdf-text-extraction.cs create mode 100644 facades-extract-images-and-text/verify-image-extraction-mode-count.cs diff --git a/facades-extract-images-and-text/agents.md b/facades-extract-images-and-text/agents.md new file mode 100644 index 00000000..74cd1b6a --- /dev/null +++ b/facades-extract-images-and-text/agents.md @@ -0,0 +1,145 @@ +--- +name: facades-extract-images-and-text +description: C# examples for facades-extract-images-and-text using Aspose.PDF for .NET +language: csharp +framework: net10.0 +parent: ../agents.md +--- + +# 
AGENTS - facades-extract-images-and-text + +## Persona + +You are a C# developer specializing in PDF processing using Aspose.PDF for .NET, +working within the **facades-extract-images-and-text** category. +This folder contains standalone C# examples for facades-extract-images-and-text operations. +See the root [agents.md](../agents.md) for repository-wide conventions and boundaries. + +## Scope +- This folder contains examples for **facades-extract-images-and-text**. +- Files are standalone `.cs` examples stored directly in this folder. + +## Required Namespaces + +- `using Aspose.Pdf.Facades;` (82/82 files) ← category-specific +- `using Aspose.Pdf;` (23/82 files) +- `using Aspose.Pdf.Text;` (4/82 files) +- `using Aspose.Pdf.AI;` (1/82 files) +- `using Aspose.Pdf.Drawing;` (1/82 files) +- `using System;` (82/82 files) +- `using System.IO;` (81/82 files) +- `using System.Text;` (25/82 files) +- `using System.Drawing.Imaging;` (20/82 files) +- `using System.Collections.Generic;` (16/82 files) +- `using System.Threading.Tasks;` (7/82 files) +- `using System.Text.Json;` (4/82 files) +- `using Azure.Storage.Blobs;` (3/82 files) +- `using System.IO.Compression;` (3/82 files) +- `using System.Threading;` (3/82 files) +- `using System.Drawing;` (2/82 files) +- `using System.Linq;` (2/82 files) +- `using System.Security.Cryptography;` (2/82 files) +- `using Azure.Storage.Blobs.Models;` (1/82 files) +- `using DocumentFormat.OpenXml;` (1/82 files) +- `using DocumentFormat.OpenXml.Drawing;` (1/82 files) +- `using DocumentFormat.OpenXml.Drawing.Pictures;` (1/82 files) +- `using DocumentFormat.OpenXml.Drawing.Wordprocessing;` (1/82 files) +- `using DocumentFormat.OpenXml.Packaging;` (1/82 files) +- `using DocumentFormat.OpenXml.Wordprocessing;` (1/82 files) +- `using Microsoft.Azure.WebJobs;` (1/82 files) +- `using Microsoft.Extensions.Logging;` (1/82 files) +- `using NUnit.Framework;` (1/82 files) +- `using System.Diagnostics;` (1/82 files) +- `using 
System.Runtime.CompilerServices;` (1/82 files) +- `using System.Runtime.InteropServices;` (1/82 files) +- `using System.Runtime.Versioning;` (1/82 files) + +## Common Code Pattern + +Most files in this category use `PdfExtractor` from `Aspose.Pdf.Facades`: + +```csharp +PdfExtractor tool = new PdfExtractor(); +tool.BindPdf("input.pdf"); +tool.ExtractText(); +tool.GetText("output.txt"); +``` + +## Files in this folder + +| File | Title | Key APIs | Description | +|------|-------|----------|-------------| +| [async-pdf-extraction](./async-pdf-extraction.cs) | Asynchronous PDF Text, Image, and Attachment Extraction | `PdfExtractor`, `BindPdf`, `ExtractText` | Demonstrates how to extract text, images, and embedded attachments from a PDF file asynchronously... | +| [batch-extract-text-from-pdfs-azure-blob](./batch-extract-text-from-pdfs-azure-blob.cs) | Batch Extract Text from PDFs in Azure Blob Storage | `PdfExtractor`, `BindPdf`, `ExtractText` | Demonstrates how to enumerate PDF files in an Azure Blob container, extract their text using Aspo... | +| [batch-extract-text-from-pdfs](./batch-extract-text-from-pdfs.cs) | Batch Extract Text from PDFs | `PdfExtractor`, `BindPdf`, `ExtractText` | Shows how to loop through a folder of PDF files, use Aspose.Pdf.Facades.PdfExtractor to extract t... | +| [batch-extract-text-from-pdfs__v2](./batch-extract-text-from-pdfs__v2.cs) | Batch Extract Text from PDFs using PdfExtractor | `PdfExtractor`, `BindPdf`, `ExtractText` | Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract text from every PDF in a directory an... | +| [check-pdf-contains-text](./check-pdf-contains-text.cs) | Check if PDF Contains Text via MemoryStream | `PdfExtractor`, `BindPdf`, `ExtractText` | Demonstrates using Aspose.Pdf's PdfExtractor to extract text into a MemoryStream and determine wh...
| +| [check-pdf-for-text-and-images](./check-pdf-for-text-and-images.cs) | Check PDF for Both Text and Images | `PdfExtractor`, `BindPdf`, `ExtractText` | Demonstrates how to use Aspose.Pdf's PdfExtractor to determine whether a PDF file contains at lea... | +| [check-pdf-text-only-by-detecting-images](./check-pdf-text-only-by-detecting-images.cs) | Check if PDF Is Text‑Only by Detecting Images | `PdfExtractor`, `BindPdf`, `ExtractImage` | The example uses Aspose.Pdf.Facades.PdfExtractor to bind a PDF, extract its images, and then chec... | +| [configurable-pdf-extraction](./configurable-pdf-extraction.cs) | Configurable PDF Text, Image, and Attachment Extraction | `PdfExtractor`, `BindPdf`, `ExtractText` | Demonstrates reading a JSON configuration to toggle extraction of text, images, and attachments f... | +| [count-pages-images-attachments-pdf](./count-pages-images-attachments-pdf.cs) | Count Pages, Images, and Attachments in a PDF | `Document`, `PdfExtractor`, `Count` | Demonstrates how to use Aspose.Pdf to obtain the total number of pages, embedded images, and file... | +| [create-contact-sheet-pdf](./create-contact-sheet-pdf.cs) | Create Contact Sheet PDF from Extracted Images | `PdfExtractor`, `BindPdf`, `ExtractImage` | Extracts all images from a source PDF and generates a new PDF that displays those images as thumb... | +| [extract-all-images-from-pdf](./extract-all-images-from-pdf.cs) | Extract All Images from PDF Using PdfExtractor | `PdfExtractor`, `BindPdf`, `StartPage` | Demonstrates how to extract every image from a PDF document by setting the page range to all page... | +| [extract-first-three-pages-summary-pdf](./extract-first-three-pages-summary-pdf.cs) | Extract First Three Pages Text and Create Summary PDF | `PdfExtractor`, `Document`, `Page` | Shows how to extract text from the first three pages of a PDF using PdfExtractor (Facades API) an... 
| +| [extract-images-by-keyword](./extract-images-by-keyword.cs) | Extract Images from Pages Containing a Keyword | `Document`, `PdfExtractor`, `BindPdf` | Shows how to scan each PDF page for a specific keyword and, when the keyword is found, extract al... | +| [extract-images-create-pdfa2b](./extract-images-create-pdfa2b.cs) | Extract Images and Create PDF/A‑2b Document | `PdfExtractor`, `BindPdf`, `ExtractImage` | Shows how to extract images from a PDF using PdfExtractor, embed each image as an XObject on a ne... | +| [extract-images-create-portfolio-pdf](./extract-images-create-portfolio-pdf.cs) | Extract Images from PDF and Create Portfolio PDF | `PdfExtractor`, `BindPdf`, `ExtractImage` | Demonstrates extracting all images from a source PDF using Aspose.Pdf's PdfExtractor and assembli... | +| [extract-images-create-sprite-sheet](./extract-images-create-sprite-sheet.cs) | Extract Images from PDF and Create Sprite Sheet PNG | `PdfExtractor`, `BindPdf`, `ExtractImage` | Shows how to extract all images from a PDF using PdfExtractor and merge them into a single horizo... | +| [extract-images-from-encrypted-pdf](./extract-images-from-encrypted-pdf.cs) | Extract Images from Encrypted PDF using PdfExtractor | `PdfExtractor`, `Password`, `BindPdf` | Shows how to provide a user password to Aspose.Pdf.Facades.PdfExtractor and extract all images fr... | +| [extract-images-from-first-pdf-page](./extract-images-from-first-pdf-page.cs) | Extract Images from First PDF Page to Byte Arrays | `PdfExtractor`, `BindPdf`, `StartPage` | Demonstrates how to use Aspose.Pdf's PdfExtractor facade to pull all images from the first page o... | +| [extract-images-from-pdf-and-compress-png](./extract-images-from-pdf-and-compress-png.cs) | Extract Images from PDF and Compress PNGs | `PdfExtractor`, `BindPdf`, `ExtractImage` | The example demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract images from a PDF,... 
| +| [extract-images-from-pdf-csv](./extract-images-from-pdf-csv.cs) | Extract Images from PDF and Generate CSV Report | `PdfExtractor`, `BindPdf`, `ExtractImage` | Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract images from each PDF page, save them ... | +| [extract-images-from-pdf-pages-png](./extract-images-from-pdf-pages-png.cs) | Extract Images from Specific PDF Pages as PNG | `PdfExtractor`, `BindPdf`, `StartPage` | Demonstrates how to configure PdfExtractor to extract only images from pages 5‑10 of a PDF and sa... | +| [extract-images-from-pdf-to-gcs](./extract-images-from-pdf-to-gcs.cs) | Extract Images from PDF and Upload to Google Cloud Storage | `PdfExtractor`, `BindPdf`, `ExtractImage` | Shows how to use Aspose.Pdf.Facades.PdfExtractor to pull images from a PDF and then upload each i... | +| [extract-images-from-pdf-to-zip](./extract-images-from-pdf-to-zip.cs) | Extract Images from PDF to ZIP Archive | `PdfExtractor`, `BindPdf`, `ExtractImage` | Shows how to use Aspose.Pdf.Facades.PdfExtractor to pull all images out of a PDF and then package... | +| [extract-images-from-pdf-using-pdfextractor](./extract-images-from-pdf-using-pdfextractor.cs) | Extract Images from PDF Using PdfExtractor with Automatic Di... | `PdfExtractor`, `BindPdf`, `ExtractImage` | Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor inside a using block to extract all image... | +| [extract-images-from-pdf-with-guid-filenames](./extract-images-from-pdf-with-guid-filenames.cs) | Extract Images from PDF with GUID Filenames | `PdfExtractor`, `BindPdf`, `ExtractImage` | Demonstrates how to extract all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and save ... | +| [extract-images-from-pdf](./extract-images-from-pdf.cs) | Extract Images from PDF to Temporary Folder | `PdfExtractor`, `BindPdf`, `ExtractImage` | Demonstrates how to use Aspose.Pdf's PdfExtractor to pull all images from a PDF and save them int... 
| +| [extract-images-from-specific-pdf-page](./extract-images-from-specific-pdf-page.cs) | Extract Images from a Specific PDF Page | `PdfExtractor`, `BindPdf`, `StartPage` | Shows how to extract all images from a single PDF page by setting the PdfExtractor's StartPage an... | +| [extract-images-markdown-gallery](./extract-images-markdown-gallery.cs) | Extract Images from PDF and Generate Markdown Gallery | `PdfExtractor`, `BindPdf`, `ExtractImage` | Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to pull images from a PDF, save them as P... | +| [extract-images-ocr-openai](./extract-images-ocr-openai.cs) | Extract Images from PDF and Perform OCR with OpenAI | `BindPdf`, `ExtractImage`, `HasNextImage` | Demonstrates extracting all images from a PDF using PdfExtractor and then applying Aspose.Pdf.AI ... | +| [extract-images-original-format](./extract-images-original-format.cs) | Extract Images from PDF in Original Format | `PdfExtractor`, `ExtractImageMode`, `BindPdf` | Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract all images from a PDF while pr... | +| ... | | | *and 52 more files* | + +## Category Statistics +- Total examples: 82 + +## Category-Specific Tips + +### Key API Surface +- `Aspose.Pdf.Facades.ExtractImageMode` +- `Aspose.Pdf.Facades.PdfContentEditor` +- `Aspose.Pdf.Facades.PdfConverter` +- `Aspose.Pdf.Facades.PdfExtractor` +- `Aspose.Pdf.Facades.PdfExtractor.BindPdf` +- `Aspose.Pdf.Facades.PdfExtractor.ExtractText` +- `Aspose.Pdf.Facades.PdfExtractor.GetNextPageText` +- `Aspose.Pdf.Facades.PdfExtractor.HasNextPageText` +- `Aspose.Pdf.Facades.PdfFileEditor` +- `Aspose.Pdf.Facades.PdfFileEditor.Extract` + +### Rules +- BindPdf({input_pdf}) must be called on a PdfContentEditor instance before any editing methods such as ReplaceText. +- ReplaceText({text_fragment}, {page}, {text_fragment}) replaces all occurrences of the first text fragment on the specified 1‑based page with the second text fragment. 
+- Save({output_pdf}) persists the edited PDF; it should be invoked after all edit operations are completed. +- Use PdfFileEditor.Extract({input_pdf}, new int[] {{int}, {int}, ...}, {output_pdf}) to create a new PDF containing only the listed pages. +- Page numbers supplied in the int array are 1‑based and must exist in {input_pdf}. + +### Warnings +- Page numbers are 1‑based; passing 0 will cause an error. +- ReplaceText operates only on the specified page and replaces every matching occurrence on that page. +- The output file will be created or overwritten; ensure the path is correct. +- The example assumes the input PDF exists at the specified location. +- The example does not explicitly dispose the FileStream objects; callers should ensure streams are closed or wrapped in using statements. + +## General Tips +- See parent [agents.md](../agents.md) for: + - **Boundaries** — Always / Ask First / Never rules for all examples + - **Common Mistakes** — verified anti-patterns that cause build failures + - **Domain Knowledge** — cross-cutting API-specific gotchas + - **Testing Guide** — build and run verification steps +- Review code examples in this folder for facades-extract-images-and-text patterns + + +Updated: 2026-05-08 | Run: `20260508_144436_050a95` + diff --git a/facades-extract-images-and-text/async-pdf-extraction.cs b/facades-extract-images-and-text/async-pdf-extraction.cs new file mode 100644 index 00000000..eb7599aa --- /dev/null +++ b/facades-extract-images-and-text/async-pdf-extraction.cs @@ -0,0 +1,127 @@ +using System; +using System.IO; +using System.Threading; +using System.Threading.Tasks; +using Aspose.Pdf.Facades; +using System.Drawing.Imaging; + +class PdfAsyncExtractor +{ + // Asynchronously extracts all text from a PDF and saves it to a single .txt file. + public static async Task ExtractTextAsync(string pdfPath, string outputTextPath, CancellationToken cancellationToken = default) + { + // Ensure the source PDF exists. 
+ if (!File.Exists(pdfPath)) + throw new FileNotFoundException($"PDF file not found: {pdfPath}"); + + // PdfExtractor implements IDisposable, so wrap it in a using block. + using (var extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor (synchronous operation). + extractor.BindPdf(pdfPath); + + // Run the extraction on a background thread to avoid blocking the UI thread. + await Task.Run(() => + { + // Extract the text content. + extractor.ExtractText(); + + // Save the extracted text to the specified file. + extractor.GetText(outputTextPath); + }, cancellationToken).ConfigureAwait(false); + } + } + + // Asynchronously extracts all images from a PDF and saves them to the specified folder. + public static async Task ExtractImagesAsync(string pdfPath, string outputFolder, CancellationToken cancellationToken = default) + { + if (!File.Exists(pdfPath)) + throw new FileNotFoundException($"PDF file not found: {pdfPath}"); + + // Create the output folder if it does not exist. + Directory.CreateDirectory(outputFolder); + + using (var extractor = new PdfExtractor()) + { + extractor.BindPdf(pdfPath); + + await Task.Run(() => + { + // Start image extraction. + extractor.ExtractImage(); + + int imageIndex = 1; + // Retrieve each image until no more are available. + while (extractor.HasNextImage()) + { + // Build a file name for the image. + string imagePath = Path.Combine(outputFolder, $"image-{imageIndex}.png"); + + // Save the image. Using GetNextImage(string) saves in the default format (PNG). + extractor.GetNextImage(imagePath); + + imageIndex++; + } + }, cancellationToken).ConfigureAwait(false); + } + } + + // Asynchronously extracts all embedded file attachments from a PDF and saves them to the specified folder. 
+ public static async Task ExtractAttachmentsAsync(string pdfPath, string outputFolder, CancellationToken cancellationToken = default) + { + if (!File.Exists(pdfPath)) + throw new FileNotFoundException($"PDF file not found: {pdfPath}"); + + Directory.CreateDirectory(outputFolder); + + using (var extractor = new PdfExtractor()) + { + extractor.BindPdf(pdfPath); + + await Task.Run(() => + { + // Extract all attachments. + extractor.ExtractAttachment(); + + // Save each attachment to the output folder. + extractor.GetAttachment(outputFolder); + }, cancellationToken).ConfigureAwait(false); + } + } + + // Example usage of the asynchronous extraction methods. + static async Task Main(string[] args) + { + // Paths (adjust as needed). + string pdfFile = "sample.pdf"; + string textOutput = "sample_text.txt"; + string imagesOutputFolder = "ExtractedImages"; + string attachmentsOutputFolder = "ExtractedAttachments"; + + // Cancellation token source (optional, can be used to cancel operations). + using var cts = new CancellationTokenSource(); + + try + { + // Extract text. + await ExtractTextAsync(pdfFile, textOutput, cts.Token); + Console.WriteLine($"Text extracted to: {textOutput}"); + + // Extract images. + await ExtractImagesAsync(pdfFile, imagesOutputFolder, cts.Token); + Console.WriteLine($"Images extracted to folder: {imagesOutputFolder}"); + + // Extract attachments. 
+ await ExtractAttachmentsAsync(pdfFile, attachmentsOutputFolder, cts.Token); + Console.WriteLine($"Attachments extracted to folder: {attachmentsOutputFolder}"); + } + catch (OperationCanceledException) + { + Console.WriteLine("Extraction was canceled."); + } + catch (Exception ex) + { + Console.Error.WriteLine($"Error during extraction: {ex.Message}"); + } + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/batch-extract-text-from-pdfs-azure-blob.cs b/facades-extract-images-and-text/batch-extract-text-from-pdfs-azure-blob.cs new file mode 100644 index 00000000..61e2c309 --- /dev/null +++ b/facades-extract-images-and-text/batch-extract-text-from-pdfs-azure-blob.cs @@ -0,0 +1,130 @@ +using System; +using System.IO; +using System.Text; +using System.Threading.Tasks; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; +using Aspose.Pdf.Facades; // PdfExtractor resides here + +class Program +{ + // Adjust these constants as needed. + private const string BlobConnectionString = ""; + private const string InputContainerName = "pdf-input"; + private const string OutputContainerName = "text-output"; + + static async Task Main() + { + // Initialize blob service client. + BlobServiceClient serviceClient = new BlobServiceClient(BlobConnectionString); + BlobContainerClient inputContainer = serviceClient.GetBlobContainerClient(InputContainerName); + BlobContainerClient outputContainer = serviceClient.GetBlobContainerClient(OutputContainerName); + + // Ensure the output container exists. + await outputContainer.CreateIfNotExistsAsync(); + + // List all PDF blobs in the input container. + await foreach (BlobItem blobItem in inputContainer.GetBlobsAsync()) + { + if (!blobItem.Name.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)) + continue; // Skip non‑PDF files. + + Console.WriteLine($"Processing: {blobItem.Name}"); + + // Download the PDF into a memory stream. 
+ BlobClient pdfBlob = inputContainer.GetBlobClient(blobItem.Name); + using (MemoryStream pdfStream = new MemoryStream()) + { + await pdfBlob.DownloadToAsync(pdfStream); + pdfStream.Position = 0; // Reset for reading. + + // Extract text using Aspose.Pdf.Facades.PdfExtractor. + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(pdfStream); // Bind the PDF stream. + extractor.ExtractText(); // Perform extraction. + + // Retrieve extracted text into a string. + using (MemoryStream textStream = new MemoryStream()) + { + extractor.GetText(textStream); // Save extracted text to stream. + string extractedText = Encoding.UTF8.GetString(textStream.ToArray()); + + // Prepare the output blob name (same as PDF but .txt extension). + string txtBlobName = Path.ChangeExtension(blobItem.Name, ".txt"); + BlobClient txtBlob = outputContainer.GetBlobClient(txtBlobName); + + // Upload the extracted text. + using (MemoryStream uploadStream = new MemoryStream(Encoding.UTF8.GetBytes(extractedText))) + { + await txtBlob.UploadAsync(uploadStream, overwrite: true); + } + + Console.WriteLine($"Uploaded extracted text to: {txtBlobName}"); + } + } + } + } + + Console.WriteLine("Batch processing completed."); + } +} + +// --------------------------------------------------------------------------- +// Minimal stub implementations for Azure.Storage.Blobs types. +// These allow the sample to compile without adding the real Azure SDK NuGet package. +// In production you should replace them with the official Azure.Storage.Blobs package. 
+// ---------------------------------------------------------------------------
+namespace Azure.Storage.Blobs
+{
+    public class BlobServiceClient
+    {
+        private readonly string _connectionString;
+        public BlobServiceClient(string connectionString) => _connectionString = connectionString;
+        public BlobContainerClient GetBlobContainerClient(string containerName) => new BlobContainerClient(containerName);
+    }
+
+    public class BlobContainerClient
+    {
+        private readonly string _containerName;
+        public BlobContainerClient(string containerName) => _containerName = containerName;
+        public Task CreateIfNotExistsAsync() => Task.CompletedTask;
+        public BlobClient GetBlobClient(string blobName) => new BlobClient(blobName);
+        public async IAsyncEnumerable<BlobItem> GetBlobsAsync([EnumeratorCancellation] System.Threading.CancellationToken cancellationToken = default)
+        {
+            // Stub: return an empty collection. Replace with real enumeration when using Azure SDK.
+            await Task.Yield();
+            yield break;
+        }
+    }
+
+    public class BlobClient
+    {
+        private readonly string _blobName;
+        public BlobClient(string blobName) => _blobName = blobName;
+        public Task DownloadToAsync(Stream destination)
+        {
+            // Stub: write an empty PDF header so Aspose can bind without error.
+            // In real usage the SDK streams the actual blob content.
+            byte[] emptyPdf = Encoding.ASCII.GetBytes("%PDF-1.4\n%%EOF");
+            return destination.WriteAsync(emptyPdf, 0, emptyPdf.Length);
+        }
+        public Task UploadAsync(Stream source, bool overwrite = false)
+        {
+            // Stub: simply consume the stream.
+            return source.CopyToAsync(Stream.Null);
+        }
+    }
+}
+
+namespace Azure.Storage.Blobs.Models
+{
+    public class BlobItem
+    {
+        public string Name { get; set; }
+        public BlobItem() { }
+        public BlobItem(string name) => Name = name;
+    }
+}
diff --git a/facades-extract-images-and-text/batch-extract-text-from-pdfs.cs b/facades-extract-images-and-text/batch-extract-text-from-pdfs.cs
new file mode 100644
index 00000000..7b98d4df
--- /dev/null
+++ b/facades-extract-images-and-text/batch-extract-text-from-pdfs.cs
@@ -0,0 +1,48 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        // Folder containing PDF files
+        const string inputFolder = "InputPdfs";
+        // Folder where extracted text files will be saved
+        const string outputFolder = "ExtractedText";
+
+        if (!Directory.Exists(inputFolder))
+        {
+            Console.Error.WriteLine($"Input folder not found: {inputFolder}");
+            return;
+        }
+
+        Directory.CreateDirectory(outputFolder);
+
+        // Get all PDF files in the input folder
+        string[] pdfFiles = Directory.GetFiles(inputFolder, "*.pdf", SearchOption.TopDirectoryOnly);
+
+        foreach (string pdfPath in pdfFiles)
+        {
+            string fileNameWithoutExt = Path.GetFileNameWithoutExtension(pdfPath);
+            string txtPath = Path.Combine(outputFolder, fileNameWithoutExt + ".txt");
+
+            try
+            {
+                // Use PdfExtractor to extract text from the PDF
+                using (PdfExtractor extractor = new PdfExtractor())
+                {
+                    extractor.BindPdf(pdfPath); // Initialize the facade with the PDF file
+                    extractor.ExtractText(); // Extract text from all pages
+                    extractor.GetText(txtPath); // Save extracted text to a .txt file
+                }
+
+                Console.WriteLine($"Extracted text saved to: {txtPath}");
+            }
+            catch (Exception ex)
+            {
+                Console.Error.WriteLine($"Error processing '{pdfPath}': {ex.Message}");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/batch-extract-text-from-pdfs__v2.cs b/facades-extract-images-and-text/batch-extract-text-from-pdfs__v2.cs
new file mode 100644
index 00000000..19bde568
--- /dev/null
+++ b/facades-extract-images-and-text/batch-extract-text-from-pdfs__v2.cs
@@ -0,0 +1,43 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main(string[] args)
+    {
+        // Resolve input and output directories relative to the executable location.
+        // This makes the script platform‑agnostic (works on Windows, Linux, macOS).
+        string baseDir = AppDomain.CurrentDomain.BaseDirectory;
+        string inputDir = Path.Combine(baseDir, "PdfInput");
+        string outputDir = Path.Combine(baseDir, "PdfTextOutput");
+
+        // Ensure the input directory exists; if not, inform the user and exit gracefully.
+        if (!Directory.Exists(inputDir))
+        {
+            Console.Error.WriteLine($"Input directory does not exist: {inputDir}");
+            return;
+        }
+
+        // Ensure the output directory exists.
+        Directory.CreateDirectory(outputDir);
+
+        // Process each PDF file in the input directory.
+        foreach (string pdfPath in Directory.GetFiles(inputDir, "*.pdf"))
+        {
+            string txtPath = Path.Combine(outputDir, Path.GetFileNameWithoutExtension(pdfPath) + ".txt");
+
+            // Initialize PdfExtractor (uses the default constructor).
+            using (PdfExtractor extractor = new PdfExtractor())
+            {
+                // Bind the PDF file to the extractor.
+                extractor.BindPdf(pdfPath);
+                // Extract all text from the PDF (Unicode encoding is default).
+                extractor.ExtractText();
+                // Save the extracted text to a .txt file.
+                extractor.GetText(txtPath);
+                // No need to call Close() explicitly – the using statement disposes the extractor.
+            }
+        }
+    }
+}
diff --git a/facades-extract-images-and-text/check-pdf-contains-text.cs b/facades-extract-images-and-text/check-pdf-contains-text.cs
new file mode 100644
index 00000000..48e77f96
--- /dev/null
+++ b/facades-extract-images-and-text/check-pdf-contains-text.cs
@@ -0,0 +1,39 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPath = "input.pdf";
+
+        if (!File.Exists(inputPath))
+        {
+            Console.Error.WriteLine($"File not found: {inputPath}");
+            return;
+        }
+
+        bool containsText = false;
+
+        // Use PdfExtractor (Facade) to extract text from the PDF
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(inputPath);
+
+            // Extract all text (Unicode encoding is default)
+            extractor.ExtractText();
+
+            // Write the extracted text into a memory stream
+            using (MemoryStream textStream = new MemoryStream())
+            {
+                extractor.GetText(textStream);
+                // If the stream length is greater than zero, the PDF had text
+                containsText = textStream.Length > 0;
+            }
+        }
+
+        Console.WriteLine($"PDF contains text: {containsText}");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/check-pdf-for-text-and-images.cs b/facades-extract-images-and-text/check-pdf-for-text-and-images.cs
new file mode 100644
index 00000000..1137004b
--- /dev/null
+++ b/facades-extract-images-and-text/check-pdf-for-text-and-images.cs
@@ -0,0 +1,61 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+namespace PdfUtilities
+{
+    public static class PdfContentChecker
+    {
+        /// <summary>
+        /// Returns true if the specified PDF file contains at least one text element and at least one image.
+        /// </summary>
+        /// <param name="pdfPath">Full path to the PDF file.</param>
+        /// <returns>True when both text and images are present; otherwise false.</returns>
+        public static bool ContainsTextAndImages(string pdfPath)
+        {
+            if (string.IsNullOrEmpty(pdfPath))
+                throw new ArgumentException("PDF path must be provided.", nameof(pdfPath));
+
+            if (!File.Exists(pdfPath))
+                throw new FileNotFoundException("PDF file not found.", pdfPath);
+
+            // PdfExtractor implements IDisposable, so use a using block for deterministic disposal.
+            using (PdfExtractor extractor = new PdfExtractor())
+            {
+                // Bind the PDF file to the extractor.
+                extractor.BindPdf(pdfPath);
+
+                // ---------- Check for text ----------
+                extractor.ExtractText();
+                bool hasText;
+                using (MemoryStream textStream = new MemoryStream())
+                {
+                    extractor.GetText(textStream);
+                    hasText = textStream.Length > 0; // any bytes means text exists
+                }
+
+                // ---------- Check for images ----------
+                extractor.ExtractImage();
+                bool hasImage = extractor.HasNextImage(); // true if at least one image is available
+
+                // Return true only when both text and images are present.
+                return hasText && hasImage;
+            }
+        }
+    }
+
+    // Dummy entry point required when the project is built as a console application.
+    internal class Program
+    {
+        private static void Main(string[] args)
+        {
+            // Optional demonstration: pass a PDF path as the first argument.
+            if (args.Length > 0)
+            {
+                string pdfPath = args[0];
+                bool containsBoth = PdfContentChecker.ContainsTextAndImages(pdfPath);
+                Console.WriteLine($"PDF contains both text and images: {containsBoth}");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/check-pdf-text-only-by-detecting-images.cs b/facades-extract-images-and-text/check-pdf-text-only-by-detecting-images.cs
new file mode 100644
index 00000000..09fe73d4
--- /dev/null
+++ b/facades-extract-images-and-text/check-pdf-text-only-by-detecting-images.cs
@@ -0,0 +1,39 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPath = "input.pdf";
+
+        if (!File.Exists(inputPath))
+        {
+            Console.Error.WriteLine($"File not found: {inputPath}");
+            return;
+        }
+
+        // Use PdfExtractor (Facade) to check for images in the PDF
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(inputPath);
+
+            // Perform image extraction (required before querying HasNextImage)
+            extractor.ExtractImage();
+
+            // HasNextImage returns true if at least one image is present
+            bool hasImages = extractor.HasNextImage();
+
+            if (hasImages)
+            {
+                Console.WriteLine("The PDF contains images; it is not text‑only.");
+            }
+            else
+            {
+                Console.WriteLine("The PDF is text‑only (no images found).");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/configurable-pdf-extraction.cs b/facades-extract-images-and-text/configurable-pdf-extraction.cs
new file mode 100644
index 00000000..fc8d13d4
--- /dev/null
+++ b/facades-extract-images-and-text/configurable-pdf-extraction.cs
@@ -0,0 +1,126 @@
+using System;
+using System.IO;
+using System.Text.Json;
+using System.Collections.Generic;
+using Aspose.Pdf.Facades;
+using Aspose.Pdf;
+using System.Drawing.Imaging;
+
+namespace PdfExtractionDemo
+{
+    // Configuration model matching the JSON file
+    public class ExtractionConfig
+    {
+        public bool ExtractText { get; set; } = true;
+        public bool ExtractImages { get; set; } = true;
+        public bool ExtractAttachments { get; set; } = true;
+        // Optional output directories
+        public string TextOutputPath { get; set; } = "ExtractedText.txt";
+        public string ImagesOutputDir { get; set; } = "Images";
+        public string AttachmentsOutputDir { get; set; } = "Attachments";
+    }
+
+    class Program
+    {
+        static void Main()
+        {
+            const string pdfPath = "input.pdf";
+            const string configPath = "extractionConfig.json";
+
+            if (!File.Exists(pdfPath))
+            {
+                Console.Error.WriteLine($"PDF file not found: {pdfPath}");
+                return;
+            }
+
+            if (!File.Exists(configPath))
+            {
+                Console.Error.WriteLine($"Config file not found: {configPath}");
+                return;
+            }
+
+            // Load configuration
+            ExtractionConfig config;
+            try
+            {
+                string json = File.ReadAllText(configPath);
+                config = JsonSerializer.Deserialize<ExtractionConfig>(json);
+                if (config == null) throw new Exception("Deserialization returned null.");
+            }
+            catch (Exception ex)
+            {
+                Console.Error.WriteLine($"Failed to read config: {ex.Message}");
+                return;
+            }
+
+            // Ensure output directories exist
+            if (config.ExtractImages && !Directory.Exists(config.ImagesOutputDir))
+                Directory.CreateDirectory(config.ImagesOutputDir);
+            if (config.ExtractAttachments && !Directory.Exists(config.AttachmentsOutputDir))
+                Directory.CreateDirectory(config.AttachmentsOutputDir);
+
+            // Use PdfExtractor within a using block for deterministic disposal
+            using (PdfExtractor extractor = new PdfExtractor())
+            {
+                // Bind the source PDF
+                extractor.BindPdf(pdfPath);
+
+                // -------- Text Extraction ----------
+                if (config.ExtractText)
+                {
+                    // Use pure text mode (0) – default; can be changed if needed
+                    extractor.ExtractTextMode = 0;
+                    extractor.ExtractText();
+                    extractor.GetText(config.TextOutputPath);
+                    Console.WriteLine($"Text extracted to: {config.TextOutputPath}");
+                }
+
+                // -------- Image Extraction ----------
+                if (config.ExtractImages)
+                {
+                    // Extract all images defined in resources (default mode)
+                    extractor.ExtractImageMode = ExtractImageMode.DefinedInResources;
+                    extractor.ExtractImage();
+
+                    int imageIndex = 1;
+                    while (extractor.HasNextImage())
+                    {
+                        string imagePath = Path.Combine(config.ImagesOutputDir, $"Image_{imageIndex}.png");
+                        // Save each image as PNG; you can change ImageFormat if required
+                        extractor.GetNextImage(imagePath, ImageFormat.Png);
+                        Console.WriteLine($"Image saved: {imagePath}");
+                        imageIndex++;
+                    }
+                }
+
+                // -------- Attachment Extraction ----------
+                if (config.ExtractAttachments)
+                {
+                    extractor.ExtractAttachment();
+
+                    // Retrieve attachment names
+                    IList<string> attachmentNames = extractor.GetAttachNames();
+
+                    // Get all attachments as streams
+                    MemoryStream[] attachmentStreams = extractor.GetAttachment();
+
+                    for (int i = 0; i < attachmentStreams.Length; i++)
+                    {
+                        string name = Path.GetFileName(attachmentNames[i]); // names come from the PDF: drop any directory part so files stay inside the output folder
+                        string outputFile = Path.Combine(config.AttachmentsOutputDir, name);
+
+                        using (FileStream fs = new FileStream(outputFile, FileMode.Create, FileAccess.Write))
+                        {
+                            attachmentStreams[i].Position = 0;
+                            attachmentStreams[i].CopyTo(fs);
+                        }
+
+                        Console.WriteLine($"Attachment saved: {outputFile}");
+                    }
+                }
+            }
+
+            Console.WriteLine("Extraction completed.");
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/count-pages-images-attachments-pdf.cs b/facades-extract-images-and-text/count-pages-images-attachments-pdf.cs
new file mode 100644
index 00000000..32ec225d
--- /dev/null
+++ b/facades-extract-images-and-text/count-pages-images-attachments-pdf.cs
@@ -0,0 +1,57 @@
+using System;
+using System.IO;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "sample.pdf";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"File not found: {inputPdf}");
+            return;
+        }
+
+        // Get total number of pages using Document.Pages.Count (PdfPageEditor.GetPages() does not return a count)
+        int pageCount;
+        using (Document doc = new Document(inputPdf))
+        {
+            pageCount = doc.Pages.Count;
+        }
+        Console.WriteLine($"Pages: {pageCount}");
+
+        // Get total number of images using PdfExtractor
+        int imageCount = 0;
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+            extractor.ExtractImage();
+
+            while (extractor.HasNextImage())
+            {
+                // Retrieve each image into a dummy stream; we only need the count
+                using (MemoryStream ms = new MemoryStream())
+                {
+                    extractor.GetNextImage(ms);
+                }
+                imageCount++;
+            }
+        }
+        Console.WriteLine($"Images: {imageCount}");
+
+        // Get total number of attachments using PdfExtractor
+        int attachmentCount = 0;
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+            extractor.ExtractAttachment();
+
+            var attachmentInfo = extractor.GetAttachmentInfo();
+            attachmentCount = attachmentInfo?.Count ?? 0;
+        }
+        Console.WriteLine($"Attachments: {attachmentCount}");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/create-contact-sheet-pdf.cs b/facades-extract-images-and-text/create-contact-sheet-pdf.cs
new file mode 100644
index 00000000..509b7162
--- /dev/null
+++ b/facades-extract-images-and-text/create-contact-sheet-pdf.cs
@@ -0,0 +1,113 @@
+using System;
+using System.IO;
+using System.Collections.Generic;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "input.pdf";
+        const string outputPdfPath = "contact_sheet.pdf";
+        const string tempImageDir = "temp_images";
+
+        // Thumbnail size and grid layout
+        const double thumbWidth = 100; // points
+        const double thumbHeight = 100; // points
+        const int columns = 5;
+        const double margin = 10; // points between images
+
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdfPath}");
+            return;
+        }
+
+        // Ensure temporary folder exists
+        if (Directory.Exists(tempImageDir))
+            Directory.Delete(tempImageDir, true);
+        Directory.CreateDirectory(tempImageDir);
+
+        // -----------------------------------------------------------------
+        // Step 1: Extract all images from the source PDF to temporary files
+        // -----------------------------------------------------------------
+        List<string> extractedImagePaths = new List<string>();
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdfPath);
+            extractor.ExtractImage();
+
+            int imageIndex = 1;
+            while (extractor.HasNextImage())
+            {
+                // GetNextImage(path) saves in the default image format (JPEG), so name the file accordingly
+                string tempImagePath = Path.Combine(tempImageDir, $"img_{imageIndex}.jpg");
+                extractor.GetNextImage(tempImagePath);
+                extractedImagePaths.Add(tempImagePath);
+                imageIndex++;
+            }
+        }
+
+        // -----------------------------------------------------------------
+        // Step 2: Create a new PDF that will hold the contact sheet
+        // -----------------------------------------------------------------
+        using (Document contactDoc = new Document())
+        {
+            // Add a single page (default size A4)
+            Aspose.Pdf.Page contactPage = contactDoc.Pages.Add();
+
+            // Page height (points); rows are laid out from the top of the page downwards
+            double pageHeight = contactPage.PageInfo.Height;
+
+            // Calculate positions for each thumbnail
+            int currentColumn = 0;
+            int currentRow = 0;
+
+            foreach (string imgPath in extractedImagePaths)
+            {
+                double x = margin + currentColumn * (thumbWidth + margin);
+                double y = pageHeight - margin - (currentRow + 1) * (thumbHeight + margin);
+
+                // When the current row no longer fits, continue on a fresh page starting again at the top row
+                if (y < 0)
+                {
+                    contactPage = contactDoc.Pages.Add();
+                    pageHeight = contactPage.PageInfo.Height;
+                    currentRow = 0;
+                    y = pageHeight - margin - (thumbHeight + margin);
+                }
+
+                // Add the image at the calculated rectangle
+                contactPage.AddImage(
+                    imgPath,
+                    new Aspose.Pdf.Rectangle(x, y, x + thumbWidth, y + thumbHeight));
+
+                // Move to next cell in the grid
+                currentColumn++;
+                if (currentColumn >= columns)
+                {
+                    currentColumn = 0;
+                    currentRow++;
+                }
+            }
+
+            // Save the contact sheet PDF
+            contactDoc.Save(outputPdfPath);
+        }
+
+        // -----------------------------------------------------------------
+        // Cleanup temporary images
+        // -----------------------------------------------------------------
+        try
+        {
+            Directory.Delete(tempImageDir, true);
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"Failed to delete temporary folder: {ex.Message}");
+        }
+
+        Console.WriteLine($"Contact sheet created: {outputPdfPath}");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-all-images-from-pdf.cs b/facades-extract-images-and-text/extract-all-images-from-pdf.cs
new file mode 100644
index 00000000..671515e0
--- /dev/null
+++ b/facades-extract-images-and-text/extract-all-images-from-pdf.cs
@@ -0,0 +1,48 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "sample.pdf";
+        const string outputFolder = "ExtractedImages";
+
+        // Verify input file exists
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // Ensure output directory exists
+        Directory.CreateDirectory(outputFolder);
+
+        // PdfExtractor is a Facade that implements IDisposable
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Load the PDF document
+            extractor.BindPdf(inputPdf);
+
+            // Set page range: 1 to 0 means all pages
+            extractor.StartPage = 1;
+            extractor.EndPage = 0;
+
+            // Perform image extraction for the specified range
+            extractor.ExtractImage();
+
+            int imageIndex = 1;
+            // Iterate through all extracted images
+            while (extractor.HasNextImage())
+            {
+                string outPath = Path.Combine(outputFolder, $"image-{imageIndex}.jpg");
+                // Save each image in the default format (JPEG); pass an ImageFormat to GetNextImage for another format
+                extractor.GetNextImage(outPath);
+                imageIndex++;
+            }
+        }
+
+        Console.WriteLine("Image extraction completed.");
+    }
+}
\ No newline at end of file
diff --git
a/facades-extract-images-and-text/extract-first-three-pages-summary-pdf.cs b/facades-extract-images-and-text/extract-first-three-pages-summary-pdf.cs
new file mode 100644
index 00000000..99dee8df
--- /dev/null
+++ b/facades-extract-images-and-text/extract-first-three-pages-summary-pdf.cs
@@ -0,0 +1,58 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+using Aspose.Pdf.Text; // TextFragment, FontRepository
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPath = "input.pdf";
+        const string outputPath = "summary.pdf";
+
+        if (!File.Exists(inputPath))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPath}");
+            return;
+        }
+
+        // Extract text from the first three pages using PdfExtractor (Facades API)
+        string extractedText;
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPath);
+            extractor.StartPage = 1; // 1‑based indexing
+            extractor.EndPage = 3;
+            extractor.ExtractText(); // default encoding is Unicode (UTF-16)
+
+            using (MemoryStream textStream = new MemoryStream())
+            {
+                extractor.GetText(textStream);
+                extractedText = Encoding.Unicode.GetString(textStream.ToArray()); // decode with the encoding used for extraction
+            }
+        }
+
+        // Create a new PDF document that contains the extracted text
+        using (Document summaryDoc = new Document())
+        {
+            // Add a single page to hold the summary
+            Page page = summaryDoc.Pages.Add();
+
+            // Create a TextFragment with the extracted text
+            TextFragment fragment = new TextFragment(extractedText);
+            fragment.TextState.Font = FontRepository.FindFont("Helvetica");
+            fragment.TextState.FontSize = 12;
+            fragment.TextState.ForegroundColor = Aspose.Pdf.Color.Black;
+
+            // Add the fragment to the page
+            page.Paragraphs.Add(fragment);
+
+            // Save the summary PDF
+            summaryDoc.Save(outputPath);
+        }
+
+        Console.WriteLine($"Summary PDF created at '{outputPath}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-images-by-keyword.cs
b/facades-extract-images-and-text/extract-images-by-keyword.cs
new file mode 100644
index 00000000..02b4f042
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-by-keyword.cs
@@ -0,0 +1,79 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Drawing.Imaging;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "input.pdf"; // source PDF
+        const string outputDir = "ExtractedImages"; // folder for images
+        const string keyword = "YOUR_KEYWORD"; // text to search for (case‑sensitive)
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"File not found: {inputPdf}");
+            return;
+        }
+
+        Directory.CreateDirectory(outputDir);
+
+        // Load the document once to obtain the total page count.
+        using (Document doc = new Document(inputPdf))
+        {
+            int pageCount = doc.Pages.Count;
+
+            // PdfExtractor is a disposable facade – use a using block.
+            using (PdfExtractor extractor = new PdfExtractor())
+            {
+                // Bind the PDF file to the extractor.
+                extractor.BindPdf(inputPdf);
+
+                // Iterate through each page.
+                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
+                {
+                    // Restrict operations to the current page.
+                    extractor.StartPage = pageNumber;
+                    extractor.EndPage = pageNumber;
+
+                    // ----- STEP 1: extract text from the page (default encoding is Unicode/UTF-16) -----
+                    extractor.ExtractText();
+
+                    // Capture the extracted text into a memory stream.
+                    using (MemoryStream textStream = new MemoryStream())
+                    {
+                        extractor.GetText(textStream);
+                        string pageText = Encoding.Unicode.GetString(textStream.ToArray()); // decode with the encoding used for extraction
+
+                        // Check whether the page contains the target keyword.
+                        if (pageText.Contains(keyword))
+                        {
+                            // ----- STEP 2: extract images from the same page -----
+                            extractor.ExtractImage();
+
+                            int imageIndex = 1;
+                            while (extractor.HasNextImage())
+                            {
+                                string imagePath = Path.Combine(
+                                    outputDir,
+                                    $"page_{pageNumber}_img_{imageIndex}.png");
+
+                                // Save the image as PNG (any ImageFormat is acceptable).
+                                extractor.GetNextImage(imagePath, ImageFormat.Png);
+                                imageIndex++;
+                            }
+                        }
+                    }
+                }
+
+                // Release any resources held by the extractor.
+                extractor.Close();
+            }
+        }
+
+        Console.WriteLine("Image extraction completed.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-images-create-pdfa2b.cs b/facades-extract-images-and-text/extract-images-create-pdfa2b.cs
new file mode 100644
index 00000000..98d88833
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-create-pdfa2b.cs
@@ -0,0 +1,60 @@
+using System;
+using System.IO;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "input.pdf";
+        const string outputPdf = "output_pdfa2b.pdf";
+        const string conversionLog = "conversion_log.xml";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // Extract images from the source PDF using the Facade API
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+            extractor.ExtractImage();
+
+            // Create a new PDF document that will become PDF/A‑2b
+            using (Document pdfaDoc = new Document())
+            {
+                // Iterate over all extracted images
+                while (extractor.HasNextImage())
+                {
+                    // Store the current image in a memory stream
+                    using (MemoryStream imgStream = new MemoryStream())
+                    {
+                        extractor.GetNextImage(imgStream);
+                        imgStream.Position = 0;
+
+                        // Add a new page for each image
+                        Page page = pdfaDoc.Pages.Add();
+
+                        // Use the page size to fill the whole page with the image
+                        double pageWidth = page.PageInfo.Width;
+                        double pageHeight = page.PageInfo.Height;
+
+                        // Embed the image as an XObject
+                        page.AddImage(imgStream, new Aspose.Pdf.Rectangle(0, 0, pageWidth, pageHeight));
+                    }
+                }
+
+                // Convert the document to PDF/A‑2b compliance
+                pdfaDoc.Convert(conversionLog, PdfFormat.PDF_A_2B, ConvertErrorAction.Delete);
+
+                // Save the compliant document
+                pdfaDoc.Save(outputPdf);
+            }
+        }
+
+        Console.WriteLine($"PDF/A‑2b document created: {outputPdf}");
+    }
+}
diff --git a/facades-extract-images-and-text/extract-images-create-portfolio-pdf.cs b/facades-extract-images-and-text/extract-images-create-portfolio-pdf.cs
new file mode 100644
index 00000000..c2d69753
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-create-portfolio-pdf.cs
@@ -0,0 +1,76 @@
+using System;
+using System.IO;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "input.pdf";
+        const string outputPortfolio = "portfolio.pdf";
+        const string tempFolder = "temp_images";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // Ensure a clean temporary folder
+        if (Directory.Exists(tempFolder))
+            Directory.Delete(tempFolder, true);
+        Directory.CreateDirectory(tempFolder);
+
+        // ---------- Extract images from the source PDF ----------
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+            extractor.ExtractImage();
+
+            int imageIndex = 1;
+            while (extractor.HasNextImage())
+            {
+                // Save each extracted image as a separate file (default format is JPEG); zero-padded index keeps names sortable
+                string imagePath = Path.Combine(tempFolder, $"image_{imageIndex:D6}.jpg");
+                extractor.GetNextImage(imagePath);
+                imageIndex++;
+            }
+        }
+
+        // ---------- Build a PDF portfolio where each image occupies one page ----------
+        using (Document portfolio = new Document())
+        {
+            // Directory.GetFiles does not guarantee an order; sort so pages follow the extraction order
+            string[] imageFiles = Directory.GetFiles(tempFolder);
+            Array.Sort(imageFiles, StringComparer.Ordinal);
+            foreach (string imageFile in imageFiles)
+            {
+                // Add a new blank page
+                Page page = portfolio.Pages.Add();
+
+                // Create an Image object that references the extracted file
+                Image img = new Image();
+                img.File = imageFile;
+
+                // Add the image to the page's content
+                page.Paragraphs.Add(img);
+            }
+
+            // Save the resulting portfolio PDF
+            portfolio.Save(outputPortfolio);
+        }
+
+        // Cleanup temporary image files
+        try
+        {
+            Directory.Delete(tempFolder, true);
+        }
+        catch (Exception ex)
+        {
+            Console.Error.WriteLine($"Failed to delete temporary folder: {ex.Message}");
+        }
+
+        Console.WriteLine($"Portfolio PDF created at '{outputPortfolio}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-images-create-sprite-sheet.cs b/facades-extract-images-and-text/extract-images-create-sprite-sheet.cs
new file mode 100644
index 00000000..ccec23ea
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-create-sprite-sheet.cs
@@ -0,0 +1,118 @@
+using System;
+using System.IO;
+using System.Collections.Generic;
+using Aspose.Pdf.Facades; // PdfExtractor, ImageMergeMode, PdfConverter
+using Aspose.Pdf.Drawing; // ImageFormat
+
+// ---------------------------------------------------------------------------
+// Minimal NUnit stubs – required only when the NUnit package is not referenced.
+// They allow any leftover test attributes in the source to compile without
+// pulling the full NUnit library.
+// ---------------------------------------------------------------------------
+namespace NUnit.Framework
+{
+    [AttributeUsage(AttributeTargets.Class)]
+    public sealed class TestFixtureAttribute : Attribute { }
+
+    [AttributeUsage(AttributeTargets.Method)]
+    public sealed class TestAttribute : Attribute { }
+
+    [AttributeUsage(AttributeTargets.Method)]
+    public sealed class SetUpAttribute : Attribute { }
+
+    [AttributeUsage(AttributeTargets.Method)]
+    public sealed class TearDownAttribute : Attribute { }
+
+    public delegate void TestDelegate();
+
+    public static class Assert
+    {
+        public static void AreEqual<T>(T expected, T actual, string message = null)
+        {
+            if (!object.Equals(expected, actual))
+                throw new Exception(message ?? $"Assert.AreEqual failed. Expected:<{expected}>. Actual:<{actual}>.");
+        }
+
+        public static T Throws<T>(TestDelegate code) where T : Exception
+        {
+            try
+            {
+                code();
+            }
+            catch (T ex)
+            {
+                return ex;
+            }
+            catch (Exception ex)
+            {
+                throw new Exception($"Assert.Throws failed. Expected {typeof(T)} but got {ex.GetType()}.", ex);
+            }
+            throw new Exception($"Assert.Throws failed. No exception thrown. Expected {typeof(T)}.");
+        }
+    }
+}
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "input.pdf";
+        const string outputPng = "sprite_sheet.png";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // List to hold each extracted image as a stream
+        List<Stream> imageStreams = new List<Stream>();
+
+        // Extract images from the PDF using PdfExtractor
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+            extractor.ExtractImage(); // start image extraction
+
+            while (extractor.HasNextImage())
+            {
+                // Store each image in a memory stream
+                MemoryStream ms = new MemoryStream();
+                // GetNextImage writes the image (default format JPEG) into the stream
+                extractor.GetNextImage(ms);
+                ms.Position = 0; // reset for later reading
+                imageStreams.Add(ms);
+            }
+        }
+
+        if (imageStreams.Count == 0)
+        {
+            Console.WriteLine("No images were found in the PDF.");
+            return;
+        }
+
+        // Merge all extracted images into a single sprite sheet (horizontal layout)
+        // ImageMergeMode.Horizontal creates a side‑by‑side sprite sheet.
+        // Spacing and margin are set to 0 (no extra spacing).
+        Stream mergedStream = PdfConverter.MergeImages(
+            imageStreams,
+            ImageFormat.Png,
+            ImageMergeMode.Horizontal,
+            0, // spacing
+            0); // margin
+
+        // Save the merged sprite sheet to a PNG file
+        using (FileStream outFile = new FileStream(outputPng, FileMode.Create, FileAccess.Write))
+        {
+            mergedStream.CopyTo(outFile);
+        }
+
+        // Clean up the individual image streams
+        foreach (var s in imageStreams)
+            s.Dispose();
+
+        mergedStream.Dispose();
+
+        Console.WriteLine($"Sprite sheet created: {outputPng}");
+    }
+}
diff --git a/facades-extract-images-and-text/extract-images-from-encrypted-pdf.cs b/facades-extract-images-and-text/extract-images-from-encrypted-pdf.cs
new file mode 100644
index 00000000..4f0a1182
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-from-encrypted-pdf.cs
@@ -0,0 +1,46 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "encrypted.pdf"; // Encrypted PDF file
+        const string outputFolder = "ExtractedImages"; // Folder for extracted images
+        const string userPassword = "user123"; // User password for the PDF
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // Ensure the output directory exists
+        Directory.CreateDirectory(outputFolder);
+
+        // Use PdfExtractor (facade) to handle encrypted PDFs
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Supply the password before binding the PDF
+            extractor.Password = userPassword;
+
+            // Bind the encrypted PDF file
+            extractor.BindPdf(inputPdf);
+
+            // Perform image extraction
+            extractor.ExtractImage();
+
+            int imageIndex = 1;
+            // Retrieve each extracted image and save it to a file
+            while (extractor.HasNextImage())
+            {
+                string imagePath = Path.Combine(outputFolder, $"image-{imageIndex}.jpg"); // default output format is JPEG
+                extractor.GetNextImage(imagePath);
+                imageIndex++;
+            }
+        }
+
+        Console.WriteLine($"Image extraction completed. Images saved to '{outputFolder}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-images-from-first-pdf-page.cs b/facades-extract-images-and-text/extract-images-from-first-pdf-page.cs
new file mode 100644
index 00000000..f1b65d23
--- /dev/null
+++ b/facades-extract-images-and-text/extract-images-from-first-pdf-page.cs
@@ -0,0 +1,58 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "sample.pdf";
+
+        // Container for the extracted image byte arrays
+        List<byte[]> extractedImages = new List<byte[]>();
+
+        // Verify that the source PDF exists
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"File not found: {inputPdf}");
+            return;
+        }
+
+        // Use PdfExtractor (a Facade) to pull images from the first page only
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(inputPdf);
+
+            // Restrict extraction to page 1
+            extractor.StartPage = 1;
+            extractor.EndPage = 1;
+
+            // Perform the image extraction operation
+            extractor.ExtractImage();
+
+            // Retrieve each image as a byte array
+            while (extractor.HasNextImage())
+            {
+                using (MemoryStream ms = new MemoryStream())
+                {
+                    // Store the current image into the memory stream (default format)
+                    extractor.GetNextImage(ms);
+
+                    // Add the resulting byte array to the list
+                    extractedImages.Add(ms.ToArray());
+                }
+            }
+        }
+
+        // Example output: number of images and their sizes
+        Console.WriteLine($"Extracted {extractedImages.Count} image(s) from page 1.");
+        for (int i = 0; i < extractedImages.Count; i++)
+        {
+            Console.WriteLine($"Image {i + 1}: {extractedImages[i].Length} bytes");
+        }
+
+        // The 'extractedImages' list now holds the raw image data for further analysis
+    }
+}
diff --git a/facades-extract-images-and-text/extract-images-from-pdf-and-compress-png.cs
b/facades-extract-images-and-text/extract-images-from-pdf-and-compress-png.cs new file mode 100644 index 00000000..b176cfcc --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-and-compress-png.cs @@ -0,0 +1,66 @@ +using System; +using System.IO; +using System.IO.Compression; +using System.Drawing.Imaging; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdf = "sample.pdf"; // source PDF + const string outputDir = "ExtractedImages"; // folder for PNGs + const string compressedDir = "CompressedImages"; // folder for compressed files + + // Verify that the source PDF exists before proceeding + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"Error: PDF file '{inputPdf}' not found."); + return; + } + + // Ensure output folders exist + Directory.CreateDirectory(outputDir); + Directory.CreateDirectory(compressedDir); + + // Use PdfExtractor (Facade) to pull images from the PDF + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(inputPdf); // bind the PDF file + extractor.ExtractImage(); // prepare image extraction + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Build PNG file name + string pngPath = Path.Combine(outputDir, $"image-{imageIndex}.png"); + + // Save the next image as PNG + // The ImageFormat.Png API is Windows‑only and triggers CA1416. + // Suppress the warning because the code runs on Windows where Aspose.Pdf.Facades requires System.Drawing. 
+#pragma warning disable CA1416 // Validate platform compatibility + extractor.GetNextImage(pngPath, ImageFormat.Png); +#pragma warning restore CA1416 // Validate platform compatibility + + // ----- Lossless compression of the PNG ----- + // Read the PNG bytes + byte[] pngBytes = File.ReadAllBytes(pngPath); + + // Create a .gz file (GZip provides lossless compression) + string gzPath = Path.Combine(compressedDir, $"image-{imageIndex}.png.gz"); + using (FileStream gzFile = new FileStream(gzPath, FileMode.Create, FileAccess.Write)) + using (GZipStream gzip = new GZipStream(gzFile, CompressionLevel.Optimal)) + { + gzip.Write(pngBytes, 0, pngBytes.Length); + } + + // Optional: delete the original PNG if only compressed version is needed + // File.Delete(pngPath); + + imageIndex++; + } + } + + Console.WriteLine("Image extraction and compression completed."); + } +} diff --git a/facades-extract-images-and-text/extract-images-from-pdf-csv.cs b/facades-extract-images-and-text/extract-images-from-pdf-csv.cs new file mode 100644 index 00000000..3e3a2bca --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-csv.cs @@ -0,0 +1,75 @@ +using System; +using System.IO; +using System.Text; +using Aspose.Pdf.Facades; +using System.Drawing; +using System.Drawing.Imaging; + +class Program +{ + static void Main() + { + const string pdfPath = "input.pdf"; // source PDF + const string imagesFolder = "ExtractedImages"; // folder for images + const string csvPath = "images.csv"; // output CSV file + + if (!File.Exists(pdfPath)) + { + Console.Error.WriteLine($"PDF file not found: {pdfPath}"); + return; + } + + Directory.CreateDirectory(imagesFolder); + + StringBuilder csvBuilder = new StringBuilder(); + csvBuilder.AppendLine("Filename,PageNumber,Width,Height"); // CSV header + + // Use PdfExtractor (facade) inside a using block for deterministic disposal + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file + extractor.BindPdf(pdfPath); + + // 
Total number of pages in the document + int pageCount = extractor.Document.Pages.Count; + int imageCounter = 1; + + // Process each page individually to capture page numbers + for (int page = 1; page <= pageCount; page++) + { + extractor.StartPage = page; + extractor.EndPage = page; + + // Extract images from the current page + extractor.ExtractImage(); + + // Retrieve each extracted image + while (extractor.HasNextImage()) + { + string imageFileName = $"image_page{page}_{imageCounter}.png"; + string imagePath = Path.Combine(imagesFolder, imageFileName); + + // Save the image as PNG + extractor.GetNextImage(imagePath, ImageFormat.Png); + + // Determine image dimensions + int width, height; + using (var img = Image.FromFile(imagePath)) + { + width = img.Width; + height = img.Height; + } + + // Append CSV line + csvBuilder.AppendLine($"{imageFileName},{page},{width},{height}"); + + imageCounter++; + } + } + } + + // Write CSV content to file + File.WriteAllText(csvPath, csvBuilder.ToString(), Encoding.UTF8); + Console.WriteLine($"Extraction complete. Images saved to '{imagesFolder}'. 
CSV saved to '{csvPath}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-from-pdf-pages-png.cs b/facades-extract-images-and-text/extract-images-from-pdf-pages-png.cs new file mode 100644 index 00000000..86d73dc4 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-pages-png.cs @@ -0,0 +1,49 @@ +using System; +using System.IO; +using Aspose.Pdf.Facades; +using System.Drawing.Imaging; // ImageFormat for PNG + +class Program +{ + static void Main() + { + const string inputPath = "input.pdf"; + const string outputDir = "ExtractedImages"; + + if (!File.Exists(inputPath)) + { + Console.Error.WriteLine($"File not found: {inputPath}"); + return; + } + + // Ensure the output directory exists + Directory.CreateDirectory(outputDir); + + // Initialize the PDF extractor and bind the source PDF + PdfExtractor extractor = new PdfExtractor(); + extractor.BindPdf(inputPath); + + // Restrict extraction to pages 5 through 10 (1‑based indexing) + extractor.StartPage = 5; + extractor.EndPage = 10; + + // Perform image extraction for the specified page range + extractor.ExtractImage(); + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Build a unique file name for each extracted image + string outPath = Path.Combine(outputDir, $"image_{imageIndex}.png"); + + // Save the image as PNG + extractor.GetNextImage(outPath, ImageFormat.Png); + imageIndex++; + } + + // Release resources held by the extractor + extractor.Close(); + + Console.WriteLine($"Extraction complete. 
{imageIndex - 1} images saved to '{outputDir}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-from-pdf-to-gcs.cs b/facades-extract-images-and-text/extract-images-from-pdf-to-gcs.cs new file mode 100644 index 00000000..24f58308 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-to-gcs.cs @@ -0,0 +1,88 @@ +using System; +using System.IO; +using System.Drawing.Imaging; +using Aspose.Pdf.Facades; + +// --------------------------------------------------------------------------- +// Minimal stubs for Google.Cloud.Storage.V1 when the real NuGet package is not +// available. These provide just enough members used by the sample code so the +// project can compile and run (the stub simply writes a message to the console). +// --------------------------------------------------------------------------- +namespace Google.Cloud.Storage.V1 +{ + public enum PredefinedObjectAcl + { + PublicRead + } + + public class UploadObjectOptions + { + public PredefinedObjectAcl PredefinedAcl { get; set; } + } + + public class StorageClient + { + public static StorageClient Create() => new StorageClient(); + + public void UploadObject(string bucketName, string objectName, string contentType, Stream source, UploadObjectOptions options = null) + { + // In a real implementation this would upload to GCS. The stub just + // reports the action so the sample can be executed without external + // dependencies. 
+ Console.WriteLine($"[Stub] Uploaded '{objectName}' to bucket '{bucketName}' with content type '{contentType}'."); + } + } +} + +class Program +{ + static void Main() + { + const string inputPdfPath = "input.pdf"; // Path to the source PDF + const string bucketName = "my-gcs-bucket"; // Google Cloud Storage bucket + const string gcsFolder = "extracted-images/"; // Optional folder/prefix in the bucket + + if (!File.Exists(inputPdfPath)) + { + Console.Error.WriteLine($"Input PDF not found: {inputPdfPath}"); + return; + } + + // Initialize Google Cloud Storage client (uses Application Default Credentials) + var storageClient = Google.Cloud.Storage.V1.StorageClient.Create(); + + // Use Aspose.Pdf.Facades.PdfExtractor to pull images from the PDF + using (var extractor = new PdfExtractor()) + { + extractor.BindPdf(inputPdfPath); // Load the PDF + extractor.ExtractImage(); // Prepare image extraction + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Capture the next image into a memory stream as PNG + using (var imageStream = new MemoryStream()) + { + extractor.GetNextImage(imageStream, ImageFormat.Png); + imageStream.Position = 0; // Reset for upload + + // Build the object name (e.g., extracted-images/image-1.png) + string objectName = $"{gcsFolder}image-{imageIndex}.png"; + + // Upload with public read access + var uploadOptions = new Google.Cloud.Storage.V1.UploadObjectOptions + { + PredefinedAcl = Google.Cloud.Storage.V1.PredefinedObjectAcl.PublicRead + }; + storageClient.UploadObject(bucketName, objectName, "image/png", imageStream, uploadOptions); + + Console.WriteLine($"Uploaded {objectName} to bucket {bucketName}"); + } + + imageIndex++; + } + } + + Console.WriteLine("All images extracted and uploaded to Google Cloud Storage."); + } +} diff --git a/facades-extract-images-and-text/extract-images-from-pdf-to-zip.cs b/facades-extract-images-and-text/extract-images-from-pdf-to-zip.cs new file mode 100644 index 00000000..51e7e4af --- /dev/null +++ 
b/facades-extract-images-and-text/extract-images-from-pdf-to-zip.cs @@ -0,0 +1,52 @@ +using System; +using System.IO; +using System.IO.Compression; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdf = "input.pdf"; + const string outputZip = "images.zip"; + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"File not found: {inputPdf}"); + return; + } + + // Extract images and add them to a zip archive + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(inputPdf); + extractor.ExtractImage(); + + using (FileStream zipFile = new FileStream(outputZip, FileMode.Create)) + using (ZipArchive zip = new ZipArchive(zipFile, ZipArchiveMode.Create)) + { + int imageIndex = 1; + while (extractor.HasNextImage()) + { + using (MemoryStream imgStream = new MemoryStream()) + { + // Extract next image as JPEG (default format) into the memory stream + extractor.GetNextImage(imgStream); + imgStream.Position = 0; + + // Create a zip entry for the image + string entryName = $"image-{imageIndex}.jpg"; + ZipArchiveEntry entry = zip.CreateEntry(entryName, CompressionLevel.Optimal); + using (Stream entryStream = entry.Open()) + { + imgStream.CopyTo(entryStream); + } + } + imageIndex++; + } + } + } + + Console.WriteLine($"Images extracted to '{outputZip}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-from-pdf-using-pdfextractor.cs b/facades-extract-images-and-text/extract-images-from-pdf-using-pdfextractor.cs new file mode 100644 index 00000000..4e2e8239 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-using-pdfextractor.cs @@ -0,0 +1,50 @@ +using System; +using System.IO; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + // Input PDF file containing images + const string pdfPath = "input.pdf"; + + // Output folder for extracted images + const string outputFolder = "ExtractedImages"; + + if 
(!File.Exists(pdfPath)) + { + Console.Error.WriteLine($"File not found: {pdfPath}"); + return; + } + + // Ensure the output directory exists + Directory.CreateDirectory(outputFolder); + + // Use a using block so PdfExtractor is disposed automatically + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor + extractor.BindPdf(pdfPath); + + // Start the image extraction process + extractor.ExtractImage(); + + int imageIndex = 1; + // Retrieve each image while they are available + while (extractor.HasNextImage()) + { + // Build a file name for the extracted image + string imagePath = Path.Combine(outputFolder, $"image-{imageIndex}.jpg"); + + // Save the next image to the file (default format is JPEG) + extractor.GetNextImage(imagePath); + + Console.WriteLine($"Extracted image {imageIndex} to '{imagePath}'"); + imageIndex++; + } + } + + Console.WriteLine("Image extraction completed."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-from-pdf-with-guid-filenames.cs b/facades-extract-images-and-text/extract-images-from-pdf-with-guid-filenames.cs new file mode 100644 index 00000000..40397607 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf-with-guid-filenames.cs @@ -0,0 +1,46 @@ +using System; +using System.IO; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdf = "input.pdf"; + const string outputDir = "ExtractedImages"; + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"Input file not found: {inputPdf}"); + return; + } + + // Ensure the output directory exists + Directory.CreateDirectory(outputDir); + + // Use PdfExtractor (implements IDisposable) inside a using block + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the source PDF + extractor.BindPdf(inputPdf); + + // Prepare the extractor to pull images + extractor.ExtractImage(); + + // Iterate over all images in the document + 
while (extractor.HasNextImage()) + { + // Generate a unique file name using a GUID + string guidFileName = Guid.NewGuid().ToString() + ".jpg"; // matches the default JPEG output of GetNextImage(string) + + // Full path for the extracted image + string outputPath = Path.Combine(outputDir, guidFileName); + + // Save the image. The overload without ImageFormat writes the image in the default (JPEG) format. + extractor.GetNextImage(outputPath); + } + } + + Console.WriteLine("Image extraction completed."); + } +} diff --git a/facades-extract-images-and-text/extract-images-from-pdf.cs b/facades-extract-images-and-text/extract-images-from-pdf.cs new file mode 100644 index 00000000..2e819c3b --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-pdf.cs @@ -0,0 +1,52 @@ +using System; +using System.IO; +using Aspose.Pdf; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + // Path to the source PDF file + const string inputPdfPath = "input.pdf"; + + // Ensure the input PDF exists; create a minimal PDF if it does not. 
+ if (!File.Exists(inputPdfPath)) + { + using (var doc = new Document()) + { + doc.Pages.Add(); // add a blank page + doc.Save(inputPdfPath); + } + } + + // Create a unique temporary folder for extracted images + string tempFolder = Path.Combine(Path.GetTempPath(), "PdfImages_" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(tempFolder); + + // Initialize the PdfExtractor facade + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF document to the extractor + extractor.BindPdf(inputPdfPath); + + // Extract images using the default extraction mode (DefinedInResources) + extractor.ExtractImage(); + + int imageIndex = 1; + // Iterate through all extracted images + while (extractor.HasNextImage()) + { + // Build the output file name (preserve original image format) + string outputImagePath = Path.Combine(tempFolder, $"image-{imageIndex}.img"); + + // Save the current image to the file system using the overload that does not require ImageFormat + extractor.GetNextImage(outputImagePath); + + imageIndex++; + } + } + + Console.WriteLine($"Images extracted to temporary folder: {tempFolder}"); + } +} diff --git a/facades-extract-images-and-text/extract-images-from-specific-pdf-page.cs b/facades-extract-images-and-text/extract-images-from-specific-pdf-page.cs new file mode 100644 index 00000000..d3deff14 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-from-specific-pdf-page.cs @@ -0,0 +1,53 @@ +using System; +using System.IO; +using System.Drawing.Imaging; // Added for ImageFormat +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdf = "input.pdf"; // source PDF + const string outputDir = "ExtractedImages"; // folder for extracted images + const int pageNumber = 2; // page to extract images from (1‑based) + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"File not found: {inputPdf}"); + return; + } + + // Ensure output directory exists + 
Directory.CreateDirectory(outputDir); + + // PdfExtractor is a Facade; it implements IDisposable, so use a using block + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor + extractor.BindPdf(inputPdf); + + // Limit extraction to a single page + extractor.StartPage = pageNumber; + extractor.EndPage = pageNumber; + + // Extract images from the specified page + extractor.ExtractImage(); + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Build output file name: page{page}_image{index}.png + string outputPath = Path.Combine( + outputDir, + $"page{pageNumber}_image{imageIndex}.png"); + + // Save the next image in PNG format + extractor.GetNextImage(outputPath, ImageFormat.Png); + Console.WriteLine($"Saved image {imageIndex} to '{outputPath}'"); + imageIndex++; + } + } + + Console.WriteLine("Image extraction completed."); + } +} diff --git a/facades-extract-images-and-text/extract-images-markdown-gallery.cs b/facades-extract-images-and-text/extract-images-markdown-gallery.cs new file mode 100644 index 00000000..ae314ac1 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-markdown-gallery.cs @@ -0,0 +1,58 @@ +using System; +using System.IO; +using System.Collections.Generic; +using Aspose.Pdf.Facades; +using System.Drawing.Imaging; + +class Program +{ + static void Main() + { + const string inputPdf = "input.pdf"; + const string imagesFolder = "images"; + const string markdownFile = "gallery.md"; + + // Verify the source PDF exists + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"Input PDF not found: {inputPdf}"); + return; + } + + // Ensure the output folder for images exists + Directory.CreateDirectory(imagesFolder); + + // Prepare a list to hold markdown lines + List markdownLines = new List(); + markdownLines.Add("# Image Gallery"); + markdownLines.Add(string.Empty); + + // Use PdfExtractor to pull images from the PDF + using (PdfExtractor extractor = new PdfExtractor()) + { + 
extractor.BindPdf(inputPdf); + // Optional: choose a different extraction mode + // extractor.ExtractImageMode = ExtractImageMode.ActuallyUsed; + extractor.ExtractImage(); + + int index = 1; + while (extractor.HasNextImage()) + { + string imageFileName = $"image-{index}.png"; + string imagePath = Path.Combine(imagesFolder, imageFileName); + + // Save the next image as PNG + extractor.GetNextImage(imagePath, ImageFormat.Png); + + // Add a markdown entry for the saved image; join with '/' because a Markdown link target is a URL (Path.Combine would emit '\' on Windows) + markdownLines.Add($"![Image {index}]({imagesFolder}/{imageFileName})"); + markdownLines.Add(string.Empty); + index++; + } + } + + // Write the markdown file + File.WriteAllLines(markdownFile, markdownLines); + Console.WriteLine($"Extraction complete. Images saved to '{imagesFolder}'. Markdown gallery created at '{markdownFile}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-ocr-openai.cs b/facades-extract-images-and-text/extract-images-ocr-openai.cs new file mode 100644 index 00000000..23aeafb3 --- /dev/null +++ b/facades-extract-images-and-text/extract-images-ocr-openai.cs @@ -0,0 +1,92 @@ +using System; +using System.IO; +using System.Collections.Generic; +using System.Threading.Tasks; +using Aspose.Pdf; +using Aspose.Pdf.Facades; // PdfExtractor +using Aspose.Pdf.AI; // OpenAI OCR copilot classes + +class Program +{ + // Async Main entry point (C# 7.1+) + static async Task Main(string[] args) + { + // Input PDF path + const string pdfPath = "input.pdf"; + + // Ensure a PDF exists – create a minimal one if it does not. + if (!File.Exists(pdfPath)) + { + // Create a simple PDF with a single blank page. 
+ using (Document doc = new Document()) + { + doc.Pages.Add(); + doc.Save(pdfPath); + } + Console.WriteLine($"Sample PDF created at '{pdfPath}' because the file was missing."); + } + + // Directory to store extracted images + const string imagesDir = "extracted_images"; + Directory.CreateDirectory(imagesDir); + + // Collect paths of extracted images + List extractedImagePaths = new List(); + + // ---------- Extract images from PDF ---------- + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the source PDF + extractor.BindPdf(pdfPath); + + // Extract all images defined in resources (default mode) + extractor.ExtractImage(); + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Save each image as PNG (extension can be any supported format) + string imagePath = Path.Combine(imagesDir, $"image_{imageIndex}.png"); + extractor.GetNextImage(imagePath); + extractedImagePaths.Add(imagePath); + imageIndex++; + } + } + + // ---------- Perform OCR on each extracted image ---------- + // Replace with your actual OpenAI API key + const string openAiApiKey = "YOUR_API_KEY"; + + // Create the OpenAI client (no disposal required) + var openAiClient = OpenAIClient + .CreateWithApiKey(openAiApiKey) + .Build(); + + foreach (string imagePath in extractedImagePaths) + { + // Configure OCR options for the current image + var ocrOptions = OpenAIOcrCopilotOptions + .Create() + .WithDocument(imagePath); // Add the image file to the OCR request + + // Create the OCR copilot instance + IOcrCopilot ocrCopilot = AICopilotFactory.CreateOcrCopilot(openAiClient, ocrOptions); + + // Execute OCR asynchronously – the method returns IReadOnlyList + var ocrResults = await ocrCopilot.GetTextRecognitionResultAsync(); + + // Output recognized text (if any) + if (ocrResults.Count > 0 && ocrResults[0].OcrDetails.Count > 0) + { + string extractedText = ocrResults[0].OcrDetails[0].ExtractedText; + Console.WriteLine($"OCR result for '{Path.GetFileName(imagePath)}':"); + 
Console.WriteLine(extractedText); + Console.WriteLine(new string('-', 40)); + } + else + { + Console.WriteLine($"No OCR text found for '{Path.GetFileName(imagePath)}'."); + } + } + } +} diff --git a/facades-extract-images-and-text/extract-images-original-format.cs b/facades-extract-images-and-text/extract-images-original-format.cs new file mode 100644 index 00000000..5e66b03e --- /dev/null +++ b/facades-extract-images-and-text/extract-images-original-format.cs @@ -0,0 +1,52 @@ +using System; +using System.IO; +using Aspose.Pdf.Facades; +using Aspose.Pdf; // for ExtractImageMode enum + +class Program +{ + static void Main() + { + const string inputPdf = "sample.pdf"; // source PDF + const string outputDir = "ExtractedImages"; // folder for extracted images + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"Input file not found: {inputPdf}"); + return; + } + + // Ensure output directory exists + Directory.CreateDirectory(outputDir); + + // PdfExtractor implements IDisposable – use a using block for deterministic cleanup + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor + extractor.BindPdf(inputPdf); + + // Set extraction mode to retrieve images exactly as they are stored in the PDF + extractor.ExtractImageMode = ExtractImageMode.DefinedInResources; + + // Start the extraction process + extractor.ExtractImage(); + + int imageIndex = 1; + // Iterate over all extracted images + while (extractor.HasNextImage()) + { + // Build output file name, including the extension so the saved file can be opened by image viewers + string outputPath = Path.Combine(outputDir, $"image_{imageIndex}.jpg"); + + // NOTE(review): sibling examples describe JPEG as the default output of GetNextImage(string) - confirm against the API reference. + // The extension is supplied explicitly above rather than relying on the library to add one. 
+ extractor.GetNextImage(outputPath); + + Console.WriteLine($"Extracted image {imageIndex} to {outputPath}"); + imageIndex++; + } + } + + Console.WriteLine("Image extraction completed."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-pdf-csv-manifest.cs b/facades-extract-images-and-text/extract-images-pdf-csv-manifest.cs new file mode 100644 index 00000000..cd4169cf --- /dev/null +++ b/facades-extract-images-and-text/extract-images-pdf-csv-manifest.cs @@ -0,0 +1,65 @@ +using System; +using System.IO; +using Aspose.Pdf; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdfPath = "input.pdf"; // source PDF + const string imagesDirPath = "ExtractedImages"; // folder for images + const string csvManifestPath = "image_manifest.csv"; // CSV output + + // Verify input file exists + if (!File.Exists(inputPdfPath)) + { + Console.Error.WriteLine($"Input PDF not found: {inputPdfPath}"); + return; + } + + // Ensure the images output directory exists + Directory.CreateDirectory(imagesDirPath); + + // Open the PDF document (1‑based page indexing) + using (Document doc = new Document(inputPdfPath)) + { + // Prepare CSV writer + using (StreamWriter csvWriter = new StreamWriter(csvManifestPath, false)) + { + // Write CSV header + csvWriter.WriteLine("FileName,PageNumber,Width,Height"); + + // Iterate over each page + foreach (Page page in doc.Pages) + { + int pageNumber = page.Number; // 1‑based page number + + int imageIndex = 1; // counter per page + + // Iterate over all images on the current page + foreach (XImage img in page.Resources.Images) + { + // Build a unique file name for the extracted image (.jpg: Save is called without an ImageFormat, which is documented to write JPEG data) + string fileName = $"page{pageNumber}_img{imageIndex}.jpg"; + string filePath = Path.Combine(imagesDirPath, fileName); + + // Save the image to disk. + // XImage provides a Save method that accepts a Stream. 
+ using (FileStream fs = new FileStream(filePath, FileMode.Create, FileAccess.Write)) + { + img.Save(fs); + } + + // Write a line to the CSV manifest + csvWriter.WriteLine($"{fileName},{pageNumber},{img.Width},{img.Height}"); + + imageIndex++; + } + } + } + } + + Console.WriteLine($"Image extraction complete. Manifest saved to '{csvManifestPath}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-images-pdf-to-base64-json.cs b/facades-extract-images-and-text/extract-images-pdf-to-base64-json.cs new file mode 100644 index 00000000..48dbd8ef --- /dev/null +++ b/facades-extract-images-and-text/extract-images-pdf-to-base64-json.cs @@ -0,0 +1,74 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; +using System.Text.Json; +using Aspose.Pdf.Facades; + +class Program +{ + // Simple DTO for JSON output + private class ImageInfo + { + public int Index { get; set; } // Sequential index of the image + public string Base64 { get; set; } // Base64 representation of the image bytes + public string Format { get; set; } // Image format (default is JPEG) + } + + static void Main() + { + const string inputPdf = "input.pdf"; // Path to the source PDF + const string outputJson = "images.json"; // Path where JSON will be written + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"File not found: {inputPdf}"); + return; + } + + var images = new List(); + int imageIndex = 1; + + // PdfExtractor implements IDisposable, so wrap it in a using block + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor + extractor.BindPdf(inputPdf); + + // Instruct the extractor to process images + extractor.ExtractImage(); + + // Iterate over all extracted images + while (extractor.HasNextImage()) + { + // Retrieve the next image into a memory stream (default format is JPEG) + using (MemoryStream imgStream = new MemoryStream()) + { + extractor.GetNextImage(imgStream); + + // 
using System;
using System.IO;
using Aspose.Pdf.Facades;                     // PdfExtractor
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;       // WordprocessingDocument
using DocumentFormat.OpenXml.Wordprocessing;  // Document, Body, Paragraph, Run
using DocumentFormat.OpenXml.Drawing;         // Drawing elements
using DocumentFormat.OpenXml.Drawing.Wordprocessing;
using DocumentFormat.OpenXml.Drawing.Pictures;

/// <summary>
/// Reads every image from a PDF through <c>PdfExtractor</c> and appends each one,
/// as an inline picture in its own paragraph, to a newly created Word document.
/// </summary>
class Program
{
    // Display size given to every picture, in EMUs (the same value is used for
    // the inline extent and for the shape transform).
    private const long PictureWidthEmu = 990000L;
    private const long PictureHeightEmu = 792000L;

    // Identifies the graphic payload as a DrawingML picture.
    private const string PictureGraphicUri = "http://schemas.openxmlformats.org/drawingml/2006/picture";

    static void Main()
    {
        const string pdfPath = "input.pdf";
        const string docxPath = "output.docx";

        if (!File.Exists(pdfPath))
        {
            Console.Error.WriteLine($"PDF file not found: {pdfPath}");
            return;
        }

        using (WordprocessingDocument package = WordprocessingDocument.Create(docxPath, WordprocessingDocumentType.Document))
        {
            // A document needs a main part with a body before pictures can be appended.
            MainDocumentPart documentPart = package.AddMainDocumentPart();
            documentPart.Document = new Document(new Body());

            using (PdfExtractor pdfImages = new PdfExtractor())
            {
                pdfImages.BindPdf(pdfPath);
                pdfImages.ExtractImage();

                for (int pictureNumber = 1; pdfImages.HasNextImage(); pictureNumber++)
                {
                    using (MemoryStream buffer = new MemoryStream())
                    {
                        pdfImages.GetNextImage(buffer);
                        buffer.Position = 0; // rewind so FeedData reads from the start

                        // The part is always registered as JPEG; no format detection is done here.
                        ImagePart part = documentPart.AddImagePart(ImagePartType.Jpeg);
                        part.FeedData(buffer);

                        string relId = documentPart.GetIdOfPart(part);
                        AddPictureToBody(relId, documentPart.Document.Body, pictureNumber);
                    }
                }

                pdfImages.Close();
            }

            documentPart.Document.Save();
        }

        Console.WriteLine($"Images extracted from '{pdfPath}' and embedded into '{docxPath}'.");
    }

    /// <summary>
    /// Builds an inline drawing that references an image part and appends it to
    /// <paramref name="body"/> wrapped in Run -> Paragraph.
    /// </summary>
    /// <param name="relationshipId">Relationship id of the image part to embed.</param>
    /// <param name="body">Document body that receives the new paragraph.</param>
    /// <param name="imageNumber">1-based picture number, used for the drawing ids and names.</param>
    private static void AddPictureToBody(string relationshipId, Body body, int imageNumber)
    {
        uint pictureId = (uint)imageNumber;

        Picture picture = new Picture(
            new NonVisualPictureProperties(
                new NonVisualDrawingProperties
                {
                    Id = (UInt32Value)pictureId,
                    Name = $"Image{imageNumber}.jpg"
                },
                new NonVisualPictureDrawingProperties()),
            new BlipFill(
                new Blip { Embed = relationshipId },
                new Stretch(new FillRectangle())),
            new ShapeProperties(
                new Transform2D(
                    new Offset { X = 0L, Y = 0L },
                    new Extents { Cx = PictureWidthEmu, Cy = PictureHeightEmu })));

        GraphicData graphicData = new GraphicData(picture) { Uri = PictureGraphicUri };

        Inline inline = new Inline(
            new Extent { Cx = PictureWidthEmu, Cy = PictureHeightEmu },
            new EffectExtent { LeftEdge = 0L, TopEdge = 0L, RightEdge = 0L, BottomEdge = 0L },
            new DocProperties
            {
                Id = (UInt32Value)pictureId,
                Name = $"Picture {imageNumber}"
            },
            new NonVisualGraphicFrameDrawingProperties(
                new GraphicFrameLocks { NoChangeAspect = true }),
            new Graphic(graphicData))
        {
            DistanceFromTop = (UInt32Value)0U,
            DistanceFromBottom = (UInt32Value)0U,
            DistanceFromLeft = (UInt32Value)0U,
            DistanceFromRight = (UInt32Value)0U,
            EditId = "50D07946"
        };

        body.Append(new Paragraph(new Run(new Drawing(inline))));
    }
}

// ---------------------------------------------------------------------------
// Minimal stubs for the Open XML SDK types used in this example.
// In a real project you should reference the official DocumentFormat.OpenXml
// NuGet package (version 2.15 or later). The stubs below allow the code to
// compile in environments where the package is not available.
// ---------------------------------------------------------------------------
// NOTE: these stand-ins only model the shape of the API. Nothing here keeps
// child elements or writes any bytes, so running the example against these
// stubs produces no DOCX file even though Main reports success.
namespace DocumentFormat.OpenXml
{
    // Common base type so that stub elements can be passed as children.
    public abstract class OpenXmlElement { }

    // Wrapper around a uint with implicit conversions in both directions.
    public class UInt32Value : OpenXmlElement
    {
        private uint _value;
        public static implicit operator UInt32Value(uint v) => new UInt32Value { _value = v };
        public static implicit operator uint(UInt32Value v) => v._value;
    }
}

namespace DocumentFormat.OpenXml.Packaging
{
    public enum WordprocessingDocumentType { Document }

    // Stand-in for the document package. The path and type are stored but never used.
    public class WordprocessingDocument : IDisposable
    {
        private readonly string _path;
        private readonly WordprocessingDocumentType _type;
        private MainDocumentPart _mainPart;
        private WordprocessingDocument(string path, WordprocessingDocumentType type)
        {
            _path = path; _type = type;
        }
        // Factory mirroring the SDK entry point; does not touch the file system.
        public static WordprocessingDocument Create(string path, WordprocessingDocumentType type) => new WordprocessingDocument(path, type);
        // Replaces any previously created main part with a new empty one.
        public MainDocumentPart AddMainDocumentPart()
        {
            _mainPart = new MainDocumentPart();
            return _mainPart;
        }
        // No-op: nothing is flushed to _path.
        public void Dispose() { /* In a real implementation the file would be written here */ }
    }

    public class MainDocumentPart
    {
        public Document Document { get; set; }
        // Returns a fresh, untracked part; the requested type is ignored.
        public ImagePart AddImagePart(ImagePartType type) => new ImagePart();
        // Generates a new random id on every call, so two calls for the same part differ.
        public string GetIdOfPart(ImagePart part) => "rId" + Guid.NewGuid().ToString("N");
    }

    public enum ImagePartType { Jpeg }
    public class ImagePart
    {
        // No-op: the stream is neither read nor retained.
        public void FeedData(Stream stream) { /* store the stream data */ }
    }
}

namespace DocumentFormat.OpenXml.Wordprocessing
{
    using DocumentFormat.OpenXml;
    // Root element; only keeps a reference to the body it was constructed with.
    public class Document : OpenXmlElement
    {
        public Body Body { get; }
        public Document(Body body) { Body = body; }
        public void Save() { /* In a real implementation the document XML would be written */ }
    }
    public class Body : OpenXmlElement
    {
        // No-op: appended paragraphs are discarded.
        public void Append(Paragraph p) { /* add paragraph to body */ }
    }
    public class Paragraph : OpenXmlElement
    {
        public Paragraph(params OpenXmlElement[] children) { }
    }
    public class Run : OpenXmlElement
    {
        public Run(params OpenXmlElement[] children) { }
    }
}

// Drawing-related stubs. The constructors take exactly the argument lists the
// example passes and ignore them; the properties are plain auto-properties.
namespace DocumentFormat.OpenXml.Drawing
{
    using DocumentFormat.OpenXml;
    public class Drawing : OpenXmlElement
    {
        public Drawing(params OpenXmlElement[] children) { }
    }
    public class Inline : OpenXmlElement
    {
        public Inline(Extent extent, EffectExtent effectExtent, DocProperties docProperties,
                      NonVisualGraphicFrameDrawingProperties nonVisualGraphicFrameDrawingProperties,
                      Graphic graphic)
        { }
        public UInt32Value DistanceFromTop { get; set; }
        public UInt32Value DistanceFromBottom { get; set; }
        public UInt32Value DistanceFromLeft { get; set; }
        public UInt32Value DistanceFromRight { get; set; }
        public string EditId { get; set; }
    }
    public class Extent : OpenXmlElement
    {
        public long Cx { get; set; }
        public long Cy { get; set; }
    }
    public class EffectExtent : OpenXmlElement
    {
        public long LeftEdge { get; set; }
        public long TopEdge { get; set; }
        public long RightEdge { get; set; }
        public long BottomEdge { get; set; }
    }
    public class DocProperties : OpenXmlElement
    {
        public UInt32Value Id { get; set; }
        public string Name { get; set; }
    }
    public class NonVisualGraphicFrameDrawingProperties : OpenXmlElement
    {
        public NonVisualGraphicFrameDrawingProperties(GraphicFrameLocks locks) { }
    }
    public class GraphicFrameLocks : OpenXmlElement
    {
        public bool NoChangeAspect { get; set; }
    }
    public class Graphic : OpenXmlElement
    {
        public Graphic(GraphicData data) { }
    }
    public class GraphicData : OpenXmlElement
    {
        public GraphicData(Picture picture) { }
        public string Uri { get; set; }
    }
}

// Picture-related stubs, following the same pattern as the drawing stubs.
namespace DocumentFormat.OpenXml.Drawing.Pictures
{
    using DocumentFormat.OpenXml;
    public class Picture : OpenXmlElement
    {
        public Picture(NonVisualPictureProperties nonVisualPictureProperties, BlipFill blipFill, ShapeProperties shapeProperties) { }
    }
    public class NonVisualPictureProperties : OpenXmlElement
    {
        public NonVisualPictureProperties(NonVisualDrawingProperties nonVisualDrawingProperties, NonVisualPictureDrawingProperties nonVisualPictureDrawingProperties) { }
    }
    public class NonVisualDrawingProperties : OpenXmlElement
    {
        public UInt32Value Id { get; set; }
        public string Name { get; set; }
    }
    public class NonVisualPictureDrawingProperties : OpenXmlElement { }
    public class BlipFill : OpenXmlElement
    {
        public BlipFill(Blip blip, Stretch stretch) { }
    }
    public class Blip : OpenXmlElement
    {
        public string Embed { get; set; }
    }
    public class Stretch : OpenXmlElement
    {
        public Stretch(FillRectangle fillRect) { }
    }
    public class FillRectangle : OpenXmlElement { }
    public class ShapeProperties : OpenXmlElement
    {
        public ShapeProperties(Transform2D transform) { }
    }
    public class Transform2D : OpenXmlElement
    {
        public Transform2D(Offset offset, Extents extents) { }
    }
    public class Offset : OpenXmlElement
    {
        public long X { get; set; }
        public long Y { get; set; }
    }
    public class Extents : OpenXmlElement
    {
        public long Cx { get; set; }
        public long Cy { get; set; }
    }
}

// Declared only so that the matching using directive at the top of the file resolves.
namespace DocumentFormat.OpenXml.Drawing.Wordprocessing
{
    // No additional types needed for this example.
}
using System;
using System.IO;
using System.Drawing.Imaging;
using Aspose.Pdf.Facades;

// ---------------------------------------------------------------------------
// Minimal stand-ins for the AWS SDK types used below. They are compiled unless
// the AWS_SDK_PRESENT symbol is defined, and with them the "upload" is a no-op.
// When the real AWS SDK package is referenced, define AWS_SDK_PRESENT so these
// declarations are excluded and the real types are used instead.
// ---------------------------------------------------------------------------
#if !AWS_SDK_PRESENT
namespace Amazon
{
    public class RegionEndpoint
    {
        public static RegionEndpoint GetBySystemName(string name) => new RegionEndpoint();
    }
}

namespace Amazon.S3
{
    using System.Threading.Tasks;

    public class AmazonS3Client
    {
        public AmazonS3Client(Amazon.RegionEndpoint endpoint) { }
        public Task PutObjectAsync(PutObjectRequest request) => Task.CompletedTask;
    }

    public class PutObjectRequest
    {
        public string BucketName { get; set; }
        public string Key { get; set; }
        public string FilePath { get; set; }
        public string ContentType { get; set; }
    }
}
#endif

/// <summary>
/// Extracts every image of a PDF as PNG into a temporary file and uploads each
/// file to an S3 bucket under <c>images/&lt;pdf name&gt;_image_&lt;n&gt;.png</c>.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Arguments (all optional, positional): PDF path, bucket name, AWS region name.
    /// </summary>
    static void Main(string[] args)
    {
        string pdfPath = args.Length > 0 ? args[0] : "input.pdf";
        string bucketName = args.Length > 1 ? args[1] : "my-bucket";
        string regionName = args.Length > 2 ? args[2] : "us-east-1";

        // Validate the input before creating any client objects.
        if (!File.Exists(pdfPath))
        {
            Console.Error.WriteLine($"PDF file not found: {pdfPath}");
            return;
        }

        var s3Client = new Amazon.S3.AmazonS3Client(Amazon.RegionEndpoint.GetBySystemName(regionName));

        using (var extractor = new PdfExtractor())
        {
            extractor.BindPdf(pdfPath);
            extractor.ExtractImage();

            int imageIndex = 1;
            while (extractor.HasNextImage())
            {
                // GetTempFileName creates the file on disk immediately, so everything
                // after this line sits inside try/finally to guarantee its removal,
                // including when the extraction itself throws.
                string tempFile = Path.GetTempFileName();
                try
                {
                    extractor.GetNextImage(tempFile, ImageFormat.Png);

                    string key = $"images/{Path.GetFileNameWithoutExtension(pdfPath)}_image_{imageIndex}.png";

                    var putRequest = new Amazon.S3.PutObjectRequest
                    {
                        BucketName = bucketName,
                        Key = key,
                        FilePath = tempFile,
                        ContentType = "image/png"
                    };

                    // An upload failure is reported and the loop moves on to the next image;
                    // extraction failures are not caught here and end the run as before.
                    try
                    {
                        // Blocking wait keeps Main synchronous.
                        s3Client.PutObjectAsync(putRequest).GetAwaiter().GetResult();
                        Console.WriteLine($"Uploaded image {imageIndex} to s3://{bucketName}/{key}");
                    }
                    catch (Exception ex)
                    {
                        Console.Error.WriteLine($"Failed to upload image {imageIndex}: {ex.Message}");
                    }
                }
                finally
                {
                    if (File.Exists(tempFile))
                    {
                        File.Delete(tempFile);
                    }
                }

                imageIndex++;
            }
        }

        Console.WriteLine("Image extraction and upload completed.");
    }
}
using System;
using System.IO;
using System.Text;
using Aspose.Pdf.Facades;

/// <summary>
/// Builds a single self-contained HTML page in which every image extracted from
/// a PDF is embedded as a Base64 data URI.
/// </summary>
class Program
{
    static void Main()
    {
        const string inputPdfPath = "input.pdf";
        const string outputHtmlPath = "report.html";

        if (!File.Exists(inputPdfPath))
        {
            Console.Error.WriteLine($"Input PDF not found: {inputPdfPath}");
            return;
        }

        StringBuilder htmlBuilder = new StringBuilder();

        // Page skeleton. The markup in these literals had been stripped, leaving
        // empty strings and loose text; it is restored here.
        htmlBuilder.AppendLine("<!DOCTYPE html>");
        htmlBuilder.AppendLine("<html>");
        htmlBuilder.AppendLine("<head>");
        htmlBuilder.AppendLine("    <meta charset=\"UTF-8\">");
        htmlBuilder.AppendLine("    <title>Extracted Images Report</title>");
        htmlBuilder.AppendLine("</head>");
        htmlBuilder.AppendLine("<body>");
        htmlBuilder.AppendLine("<h1>Images extracted from PDF</h1>");

        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(inputPdfPath);
            extractor.ExtractImage();

            int imageIndex = 1;
            while (extractor.HasNextImage())
            {
                using (MemoryStream imageStream = new MemoryStream())
                {
                    // No format is requested, so the data URI below declares image/jpeg,
                    // the format this example assumes for the default overload.
                    extractor.GetNextImage(imageStream);
                    string base64Image = Convert.ToBase64String(imageStream.ToArray());

                    htmlBuilder.AppendLine("<div>");
                    htmlBuilder.AppendLine($"    <h2>Image {imageIndex}</h2>");
                    htmlBuilder.AppendLine($"    <img src=\"data:image/jpeg;base64,{base64Image}\" alt=\"Image {imageIndex}\" />");
                    htmlBuilder.AppendLine("</div>");
                }

                imageIndex++;
            }
        }

        htmlBuilder.AppendLine("</body>");
        htmlBuilder.AppendLine("</html>");

        File.WriteAllText(outputHtmlPath, htmlBuilder.ToString());

        Console.WriteLine($"HTML report with embedded images saved to '{outputHtmlPath}'.");
    }
}
using System;
using System.IO;
using Aspose.Pdf.Facades;
using System.Drawing.Imaging;
using System.Runtime.Versioning;

/// <summary>Saves every image found in a PDF as a separate TIFF file.</summary>
class Program
{
    // ImageFormat comes from System.Drawing, so the entry point is annotated as
    // Windows-only; callers on other platforms get the CA1416 analyzer warning.
    [SupportedOSPlatform("windows")]
    static void Main()
    {
        const string inputPdf = "input.pdf";
        const string outputDir = "ExtractedImages";

        bool sourceExists = File.Exists(inputPdf);
        if (!sourceExists)
        {
            Console.WriteLine($"Error: The file '{inputPdf}' was not found. Please place the PDF in the executable's directory or provide a correct path.");
            return;
        }

        Directory.CreateDirectory(outputDir);

        using (PdfExtractor pdfImages = new PdfExtractor())
        {
            pdfImages.BindPdf(inputPdf);
            pdfImages.Resolution = 300; // requested output resolution
            pdfImages.ExtractImage();

            // Files are numbered from 1: image-1.tiff, image-2.tiff, ...
            for (int number = 1; pdfImages.HasNextImage(); number++)
            {
                string fileName = $"image-{number}.tiff";
                pdfImages.GetNextImage(Path.Combine(outputDir, fileName), ImageFormat.Tiff);
            }
        }

        Console.WriteLine($"Image extraction complete. TIFF files saved to '{outputDir}'.");
    }
}
using System;
using System.IO;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the images of a local PDF and writes them to a folder on a network
/// share addressed by UNC path.
/// </summary>
class Program
{
    static void Main()
    {
        const string inputPdfPath = @"C:\Docs\sample.pdf";

        // UNC target: starts with a double backslash and has no trailing backslash.
        const string uncBasePath = @"\\ServerName\SharedFolder\ExtractedImages";

        if (!File.Exists(inputPdfPath))
        {
            Console.Error.WriteLine($"Input PDF not found: {inputPdfPath}");
            return;
        }

        // Creating the directory is the first point at which the share is touched,
        // so a failure here is reported and ends the run.
        try
        {
            Directory.CreateDirectory(uncBasePath);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Failed to create UNC directory '{uncBasePath}': {ex.Message}");
            return;
        }

        using (PdfExtractor pdfImages = new PdfExtractor())
        {
            pdfImages.BindPdf(inputPdfPath);
            pdfImages.ExtractImage();

            for (int imageIndex = 1; pdfImages.HasNextImage(); imageIndex++)
            {
                // No format is passed to GetNextImage; the .jpg name reflects the
                // default format this example assumes.
                string outputPath = Path.Combine(uncBasePath, $"image-{imageIndex}.jpg");
                pdfImages.GetNextImage(outputPath);

                Console.WriteLine($"Saved image {imageIndex} to '{outputPath}'");
            }
        }

        Console.WriteLine("Image extraction completed.");
    }
}
using System;
using System.IO;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the images of a PDF stored on a network share and writes them to
/// another folder on the share.
/// </summary>
class Program
{
    /// <summary>Entry point; <paramref name="args"/> is accepted but not used.</summary>
    public static void Main(string[] args)
    {
        const string inputPdf = @"\\server\share\input\sample.pdf";
        const string outputFolder = @"\\server\share\output\images";

        if (!File.Exists(inputPdf))
        {
            Console.Error.WriteLine($"Input PDF not found: {inputPdf}");
            return;
        }

        Directory.CreateDirectory(outputFolder);

        // A constructor call never yields null, so no null-forgiving operator is needed.
        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(inputPdf);
            extractor.ExtractImage();

            int imageIndex = 1;
            while (extractor.HasNextImage())
            {
                // The overload without an ImageFormat argument is used. The sibling
                // examples document its default output as JPEG, so the file gets a
                // .jpg extension rather than .png, which would mislabel the content.
                // NOTE(review): confirm the default format against the Aspose.PDF
                // version in use; pass an explicit ImageFormat if PNG is required.
                string outputFile = Path.Combine(outputFolder, $"image-{imageIndex}.jpg");
                extractor.GetNextImage(outputFile);

                imageIndex++;
            }
        }

        Console.WriteLine("Image extraction completed.");
    }
}
using System;
using System.IO;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the images of a PDF to disk and checks each written file against a
/// small table of well-known image file signatures.
/// </summary>
class Program
{
    // Signatures are tested in this order; the first match wins.
    private static readonly (string Name, byte[] Magic)[] KnownSignatures =
    {
        ("PNG",  new byte[] { 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A }),
        ("JPEG", new byte[] { 0xFF, 0xD8 }),
        ("GIF",  new byte[] { 0x47, 0x49, 0x46, 0x38 }),
        ("BMP",  new byte[] { 0x42, 0x4D }),
    };

    static void Main()
    {
        const string inputPdf = "sample.pdf";
        const string outputDir = "ExtractedImages";

        if (!File.Exists(inputPdf))
        {
            Console.Error.WriteLine($"Input PDF not found: {inputPdf}");
            return;
        }

        Directory.CreateDirectory(outputDir);

        using (PdfExtractor pdfImages = new PdfExtractor())
        {
            pdfImages.BindPdf(inputPdf);
            pdfImages.ExtractImage();

            for (int imageIndex = 1; pdfImages.HasNextImage(); imageIndex++)
            {
                string imagePath = Path.Combine(outputDir, $"image_{imageIndex}.png");
                pdfImages.GetNextImage(imagePath);

                // The report is based on the file's leading bytes, not on its extension.
                bool recognised = IsFileSignatureValid(imagePath, out string format);
                Console.WriteLine(recognised
                    ? $"Image {imageIndex}: valid {format} file saved to '{imagePath}'."
                    : $"Image {imageIndex}: corrupted or unknown format at '{imagePath}'.");
            }
        }
    }

    /// <summary>
    /// Compares the first bytes of a file with the known image signatures.
    /// </summary>
    /// <param name="filePath">File to inspect.</param>
    /// <param name="format">Name of the matched format, or "unknown" when nothing matches.</param>
    /// <returns>
    /// <c>true</c> when a known signature matches; <c>false</c> otherwise, including when
    /// the file cannot be read (the error is written to standard error).
    /// </returns>
    private static bool IsFileSignatureValid(string filePath, out string format)
    {
        format = "unknown";

        // Eight bytes cover the longest signature in the table.
        byte[] header = new byte[8];
        int available;
        try
        {
            using (FileStream input = new FileStream(filePath, FileMode.Open, FileAccess.Read))
            {
                available = input.Read(header, 0, header.Length);
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Failed to read file '{filePath}': {ex.Message}");
            return false;
        }

        foreach ((string name, byte[] magic) in KnownSignatures)
        {
            // The length guard stops the zero-filled rest of the buffer from
            // counting as file content.
            if (available >= magic.Length && StartsWith(header, magic))
            {
                format = name;
                return true;
            }
        }

        return false;
    }

    /// <summary>Returns whether <paramref name="source"/> begins with <paramref name="signature"/>.</summary>
    private static bool StartsWith(byte[] source, byte[] signature)
    {
        if (signature.Length > source.Length)
        {
            return false;
        }

        int matched = 0;
        while (matched < signature.Length && source[matched] == signature[matched])
        {
            matched++;
        }

        return matched == signature.Length;
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the attachments embedded in a PDF and saves them into subfolders
/// named after each attachment's file extension.
/// </summary>
class Program
{
    static void Main()
    {
        const string inputPdfPath = "input.pdf";
        const string outputRootFolder = "ExtractedAttachments";

        if (!File.Exists(inputPdfPath))
        {
            Console.Error.WriteLine($"Input file not found: {inputPdfPath}");
            return;
        }

        Directory.CreateDirectory(outputRootFolder);

        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(inputPdfPath);
            extractor.ExtractAttachment();

            IList<string> attachmentNames = extractor.GetAttachNames();
            MemoryStream[] attachmentStreams = extractor.GetAttachment();

            // Names and streams are parallel collections; only index positions
            // that exist in both.
            int count = Math.Min(attachmentNames.Count, attachmentStreams.Length);

            for (int i = 0; i < count; i++)
            {
                string name = ToSafeFileName(attachmentNames[i], i);
                MemoryStream stream = attachmentStreams[i];

                // Folder name: lower-cased extension without the dot, or "no_extension".
                string extension = Path.GetExtension(name);
                extension = string.IsNullOrEmpty(extension)
                    ? "no_extension"
                    : extension.TrimStart('.').ToLowerInvariant();

                string extensionFolder = Path.Combine(outputRootFolder, extension);
                Directory.CreateDirectory(extensionFolder);

                string outputPath = Path.Combine(extensionFolder, name);

                using (FileStream fileStream = new FileStream(outputPath, FileMode.Create, FileAccess.Write))
                {
                    stream.Position = 0;
                    stream.CopyTo(fileStream);
                }

                Console.WriteLine($"Extracted: {outputPath}");
            }
        }

        Console.WriteLine("Attachment extraction completed.");
    }

    /// <summary>
    /// Reduces an attachment name taken from the PDF to a bare file name, so that a
    /// name containing directory parts or a rooted path cannot redirect the write
    /// outside the output folder. Falls back to <c>attachment_&lt;index&gt;</c>
    /// when nothing usable remains.
    /// </summary>
    private static string ToSafeFileName(string attachmentName, int index)
    {
        string fileName = Path.GetFileName(attachmentName ?? string.Empty);
        return string.IsNullOrWhiteSpace(fileName) ? $"attachment_{index}" : fileName;
    }
}
using System;
using System.Collections.Generic;
using System.IO;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the attachments embedded in a PDF and saves each one under a name
/// prefixed with a timestamp and its position in the attachment list.
/// </summary>
class Program
{
    /// <summary>Entry point; <paramref name="args"/> is accepted but not used.</summary>
    static void Main(string[] args)
    {
        const string inputPdfPath = "input.pdf";
        const string outputDirectory = "ExtractedAttachments";

        if (!File.Exists(inputPdfPath))
        {
            Console.Error.WriteLine($"Error: The PDF file '{inputPdfPath}' was not found. Please provide a valid path.");
            return;
        }

        Directory.CreateDirectory(outputDirectory);

        try
        {
            using (PdfExtractor extractor = new PdfExtractor())
            {
                extractor.BindPdf(inputPdfPath);
                extractor.ExtractAttachment();

                IList<string> attachmentNames = extractor.GetAttachNames();
                MemoryStream[] attachmentStreams = extractor.GetAttachment();

                // Guard against a mismatch between names and streams.
                int count = Math.Min(attachmentNames.Count, attachmentStreams.Length);

                for (int i = 0; i < count; i++)
                {
                    // Keep only the file-name part of the embedded name: directory
                    // separators inside it would otherwise point the write at a
                    // different (possibly non-existent) folder.
                    string originalName = Path.GetFileName(attachmentNames[i] ?? string.Empty);
                    if (string.IsNullOrWhiteSpace(originalName))
                    {
                        originalName = "attachment";
                    }

                    // The timestamp is taken per attachment; the loop index keeps names
                    // unique when two attachments fall into the same millisecond.
                    string timestamp = DateTime.Now.ToString("yyyyMMddHHmmssfff");
                    string newFileName = $"{timestamp}_{i}_{originalName}";

                    string outputPath = Path.Combine(outputDirectory, newFileName);

                    using (FileStream fileStream = new FileStream(outputPath, FileMode.Create, FileAccess.Write))
                    {
                        attachmentStreams[i].Position = 0;
                        attachmentStreams[i].CopyTo(fileStream);
                    }

                    Console.WriteLine($"Saved attachment as: {outputPath}");
                }
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"An error occurred while extracting attachments: {ex.Message}");
        }
    }
}
using System;
using System.IO;
using System.Collections.Generic;
using System.Security.Cryptography;
using System.Text;
using Aspose.Pdf.Facades;

/// <summary>
/// Extracts the attachments embedded in a PDF, saves them to disk and prints the
/// SHA-256 hash of every saved file.
/// </summary>
class Program
{
    /// <summary>Entry point. Optional first argument: path of the PDF (default "input.pdf").</summary>
    static void Main(string[] args)
    {
        string pdfPath = args.Length > 0 ? args[0] : "input.pdf";
        const string outputDir = "attachments";

        if (!File.Exists(pdfPath))
        {
            Console.Error.WriteLine($"Error: PDF file not found – '{pdfPath}'. Please provide a valid path.");
            return;
        }

        Directory.CreateDirectory(outputDir);

        // The extractor is disposable; the using block releases it on every exit
        // path, including the early return below.
        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(pdfPath);
            extractor.ExtractAttachment();

            IList<string> attachmentNames = extractor.GetAttachNames();
            MemoryStream[] attachmentStreams = extractor.GetAttachment();

            if (attachmentNames == null || attachmentStreams == null || attachmentNames.Count == 0)
            {
                Console.WriteLine("No attachments were found in the PDF.");
                return;
            }

            // Only index positions that exist in both parallel collections.
            int count = Math.Min(attachmentNames.Count, attachmentStreams.Length);

            for (int i = 0; i < count; i++)
            {
                // Strip directory parts from the embedded name so the file cannot be
                // written outside outputDir; fall back to a generated name.
                string name = Path.GetFileName(attachmentNames[i] ?? string.Empty);
                if (string.IsNullOrWhiteSpace(name))
                {
                    name = $"attachment_{i}";
                }

                string outPath = Path.Combine(outputDir, name);

                using (FileStream fileStream = new FileStream(outPath, FileMode.Create, FileAccess.Write))
                {
                    attachmentStreams[i].Position = 0;
                    attachmentStreams[i].CopyTo(fileStream);
                }

                // Hash the bytes as written to disk.
                string hash = ComputeSha256(outPath);
                Console.WriteLine($"{name}: {hash}");
            }
        }
    }

    /// <summary>Returns the SHA-256 hash of a file as a lower-case hex string.</summary>
    /// <param name="filePath">Path of the file to hash.</param>
    private static string ComputeSha256(string filePath)
    {
        using (FileStream fileStream = File.OpenRead(filePath))
        using (SHA256 sha256 = SHA256.Create())
        {
            byte[] hashBytes = sha256.ComputeHash(fileStream);
            StringBuilder sb = new StringBuilder(hashBytes.Length * 2);
            foreach (byte b in hashBytes)
            {
                sb.Append(b.ToString("x2"));
            }
            return sb.ToString();
        }
    }
}
using System;
using System.IO;
using System.Collections.Generic;
using Aspose.Pdf.Facades;

/// <summary>Extracts the attachments embedded in a PDF into an output folder.</summary>
class Program
{
    static void Main()
    {
        const string inputPdf = "input.pdf";
        const string outputDir = "Attachments";

        if (!File.Exists(inputPdf))
        {
            Console.Error.WriteLine($"Input file not found: {inputPdf}");
            return;
        }

        Directory.CreateDirectory(outputDir);

        try
        {
            using (PdfExtractor extractor = new PdfExtractor())
            {
                extractor.BindPdf(inputPdf);
                extractor.ExtractAttachment();

                IList<string> names = extractor.GetAttachNames();
                MemoryStream[] streams = extractor.GetAttachment();

                // Only index positions that exist in both parallel collections.
                int count = Math.Min(names.Count, streams.Length);

                for (int i = 0; i < count; i++)
                {
                    // Strip directory parts from the embedded name so the file cannot
                    // be written outside outputDir; fall back to a generated name.
                    string fileName = Path.GetFileName(names[i] ?? string.Empty);
                    if (string.IsNullOrWhiteSpace(fileName))
                    {
                        fileName = $"attachment_{i}";
                    }

                    string outPath = Path.Combine(outputDir, fileName);

                    using (FileStream fs = new FileStream(outPath, FileMode.Create, FileAccess.Write))
                    {
                        MemoryStream ms = streams[i];
                        ms.Position = 0;
                        ms.CopyTo(fs);
                    }
                }
            }

            Console.WriteLine("All attachments extracted successfully.");
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Error: {ex.Message}");
        }
    }
}
using System;
using System.IO;
using Aspose.Pdf.Facades;          // PdfExtractor
using System.Drawing;              // Bitmap, Graphics, Font, Brush, Color
using System.Drawing.Imaging;      // ImageFormat

/// <summary>
/// Extracts the images of a PDF, draws a centred semi-transparent text watermark
/// on each one and saves the result as PNG.
/// </summary>
class Program
{
    static void Main()
    {
        const string inputPdf = "input.pdf";
        const string outputDir = "ExtractedImages";
        const string watermark = "Sample Watermark";

        // Validate the input first so that a missing file leaves no empty folder behind.
        if (!File.Exists(inputPdf))
        {
            Console.Error.WriteLine($"File not found: {inputPdf}");
            return;
        }

        Directory.CreateDirectory(outputDir);

        try
        {
            using (PdfExtractor extractor = new PdfExtractor())
            {
                extractor.BindPdf(inputPdf);
                extractor.ExtractImage();

                int imageIndex = 1;
                while (extractor.HasNextImage())
                {
                    using (MemoryStream imgStream = new MemoryStream())
                    {
                        extractor.GetNextImage(imgStream);
                        imgStream.Position = 0; // rewind so Bitmap decodes from the start

                        // The stream must stay open for the lifetime of the Bitmap,
                        // which the nesting of these using blocks guarantees.
                        using (Bitmap bitmap = new Bitmap(imgStream))
                        {
                            using (Graphics graphics = Graphics.FromImage(bitmap))
                            {
                                graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias;
                                graphics.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;
                                graphics.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.HighQuality;

                                Color watermarkColor = Color.FromArgb(128, 255, 255, 255); // semi-transparent white

                                // Font wraps a native GDI+ handle; dispose it per image
                                // instead of leaving one behind on every iteration.
                                using (Font watermarkFont = new Font("Arial", 36, System.Drawing.FontStyle.Bold, GraphicsUnit.Pixel))
                                using (Brush brush = new SolidBrush(watermarkColor))
                                {
                                    // Centre the text on the image.
                                    SizeF textSize = graphics.MeasureString(watermark, watermarkFont);
                                    float x = (bitmap.Width - textSize.Width) / 2f;
                                    float y = (bitmap.Height - textSize.Height) / 2f;

                                    graphics.DrawString(watermark, watermarkFont, brush, x, y);
                                }
                            }

                            string outputPath = Path.Combine(outputDir, $"image-{imageIndex}.png");
                            bitmap.Save(outputPath, ImageFormat.Png);
                            Console.WriteLine($"Saved watermarked image: {outputPath}");
                        }
                    }

                    imageIndex++;
                }
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Error: {ex.Message}");
        }
    }
}
// Console sample: writes every image found in "input.pdf" to the "ExtractedImages"
// folder as image_1.bmp, image_2.bmp, ... using the PdfExtractor facade.
class Program
{
    static void Main()
    {
        const string sourcePdf = "input.pdf";
        const string targetFolder = "ExtractedImages";

        // Nothing to do without the source document.
        if (!File.Exists(sourcePdf))
        {
            Console.Error.WriteLine($"Input file not found: {sourcePdf}");
            return;
        }

        // CreateDirectory is a no-op when the folder is already there.
        Directory.CreateDirectory(targetFolder);

        using (PdfExtractor imageExtractor = new PdfExtractor())
        {
            imageExtractor.BindPdf(sourcePdf);

            // Switch the extractor into image mode before iterating.
            imageExtractor.ExtractImage();

            // The counter only drives the file name; the extractor decides when to stop.
            for (int number = 1; imageExtractor.HasNextImage(); number++)
            {
                string bmpPath = Path.Combine(targetFolder, $"image_{number}.bmp");
                imageExtractor.GetNextImage(bmpPath, ImageFormat.Bmp);
            }
        }

        Console.WriteLine("All images have been extracted and saved as BMP files.");
    }
}
b/facades-extract-images-and-text/extract-pdf-images-base64.cs new file mode 100644 index 00000000..c93034c7 --- /dev/null +++ b/facades-extract-images-and-text/extract-pdf-images-base64.cs @@ -0,0 +1,103 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Drawing.Imaging; // ImageFormat for specifying output format +using Aspose.Pdf.Facades; // PdfExtractor resides here +using System.Text.Json; // Optional: for JSON serialization + +namespace AsposePdfApi +{ + public class PdfImageExtractor + { + /// + /// Extracts all images from a PDF file and returns them as Base64‑encoded strings. + /// Each image is extracted into a MemoryStream, converted to a byte array, + /// then encoded with Convert.ToBase64String. + /// + /// Full path to the source PDF. + /// List of Base64 strings, one per extracted image. + public static List ExtractImagesAsBase64(string pdfPath) + { + // Validate input + if (string.IsNullOrEmpty(pdfPath)) + throw new ArgumentException("PDF path must be provided.", nameof(pdfPath)); + if (!File.Exists(pdfPath)) + throw new FileNotFoundException("PDF file not found.", pdfPath); + + var base64Images = new List(); + + // PdfExtractor implements IDisposable – use a using block for deterministic cleanup + using (PdfExtractor extractor = new PdfExtractor()) + { + // Bind the PDF file to the extractor + extractor.BindPdf(pdfPath); + + // Optional: set a higher resolution if higher‑quality images are required + // extractor.Resolution = 300; + + // Tell the extractor to look for images + extractor.ExtractImage(); + + // Iterate over all found images + while (extractor.HasNextImage()) + { + // Store the current image into a memory stream (PNG format for lossless result) + using (MemoryStream imageStream = new MemoryStream()) + { + // GetNextImage overload with ImageFormat allows us to choose PNG + // The ImageFormat type is Windows‑specific; suppress the CA1416 warning for this line. 
// Console sample: extracts the images of "input.pdf" as a numbered PNG sequence and
// asks an external ffmpeg process (expected on PATH) to encode them into "slideshow.mp4".
class Program
{
    static void Main()
    {
        // Paths – adjust as needed
        const string inputPdf = "input.pdf";
        const string imagesDir = "extracted_images";
        const string videoPath = "slideshow.mp4";

        // Verify source PDF exists
        if (!File.Exists(inputPdf))
        {
            Console.Error.WriteLine($"Input PDF not found: {inputPdf}");
            return;
        }

        // Ensure output folder for images exists
        Directory.CreateDirectory(imagesDir);

        // Frames left over from an earlier run would match the ffmpeg input pattern and
        // end up in the new video, so clear this sample's own sequence files first.
        foreach (string staleFrame in Directory.GetFiles(imagesDir, "image_*.png"))
        {
            File.Delete(staleFrame);
        }

        // ---------- Extract images from PDF ----------
        int extractedCount = 0;
        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(inputPdf);
            extractor.ExtractImage();

            int attempt = 0;
            while (extractor.HasNextImage())
            {
                attempt++;

                // Number files by successful extractions so the sequence has no gaps;
                // the "image_%03d.png" input pattern needs consecutive numbers.
                string imagePath = Path.Combine(imagesDir, $"image_{extractedCount + 1:D3}.png");

                if (extractor.GetNextImage(imagePath, ImageFormat.Png))
                {
                    extractedCount++;
                }
                else
                {
                    Console.Error.WriteLine($"Failed to extract image #{attempt}");

                    // Do not leave a partial frame behind for ffmpeg to pick up.
                    if (File.Exists(imagePath))
                    {
                        File.Delete(imagePath);
                    }
                }
            }
        }

        // Verify that at least one image was extracted before invoking FFmpeg
        if (extractedCount == 0)
        {
            Console.Error.WriteLine("No images were extracted – aborting video creation.");
            return;
        }

        // ---------- Generate video slideshow with FFmpeg ----------
        // Assumes ffmpeg is available in the system PATH
        const string ffmpegExe = "ffmpeg";

        // -framerate 1                        → 1 image per second (adjust as required)
        // -i image_%03d.png                   → input pattern matching the extracted files
        // -c:v libx264 -r 30 -pix_fmt yuv420p → common encoding settings
        string framePattern = Path.Combine(imagesDir, "image_%03d.png");
        string ffmpegArgs = $"-y -framerate 1 -i \"{framePattern}\" -c:v libx264 -r 30 -pix_fmt yuv420p \"{videoPath}\"";

        var startInfo = new ProcessStartInfo
        {
            FileName = ffmpegExe,
            Arguments = ffmpegArgs,
            UseShellExecute = false,
            RedirectStandardOutput = true,
            RedirectStandardError = true,
            CreateNoWindow = true
        };

        try
        {
            using (Process ffmpeg = Process.Start(startInfo))
            {
                if (ffmpeg == null)
                {
                    Console.Error.WriteLine("Error executing FFmpeg: the process could not be started.");
                    return;
                }

                // Both pipes are redirected. Draining them one after the other can deadlock:
                // the child blocks once the pipe nobody is reading fills up, while this
                // process waits for the other pipe to close. Read stderr asynchronously
                // while stdout is read synchronously.
                var stdErrTask = ffmpeg.StandardError.ReadToEndAsync();
                string stdOut = ffmpeg.StandardOutput.ReadToEnd();
                ffmpeg.WaitForExit();
                string stdErr = stdErrTask.GetAwaiter().GetResult();

                Console.WriteLine(stdOut);
                if (ffmpeg.ExitCode != 0)
                {
                    Console.Error.WriteLine($"FFmpeg exited with code {ffmpeg.ExitCode}");
                    Console.Error.WriteLine(stdErr);
                }
                else
                {
                    Console.WriteLine($"Video slideshow created successfully at: {videoPath}");
                }
            }
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Error executing FFmpeg: {ex.Message}");
        }
    }
}
htmlBuilder.AppendLine(""); + htmlBuilder.AppendLine(" "); + htmlBuilder.AppendLine(" PDF Image Gallery"); + htmlBuilder.AppendLine(" "); + htmlBuilder.AppendLine(""); + htmlBuilder.AppendLine(""); + htmlBuilder.AppendLine("

Extracted Images

"); + + // Add an tag for each extracted image file + foreach (string filePath in Directory.GetFiles(imagesFolder, "*.png")) + { + string fileName = Path.GetFileName(filePath); + htmlBuilder.AppendLine($" \"{fileName}\""); + } + + htmlBuilder.AppendLine(""); + htmlBuilder.AppendLine(""); + + // Write the HTML content to the output file + File.WriteAllText(htmlPath, htmlBuilder.ToString()); + + Console.WriteLine($"Extraction complete. Images saved to '{imagesFolder}'."); + Console.WriteLine($"HTML gallery generated at '{htmlPath}'."); + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/extract-pdf-images-to-azure-blob.cs b/facades-extract-images-and-text/extract-pdf-images-to-azure-blob.cs new file mode 100644 index 00000000..e850ab7e --- /dev/null +++ b/facades-extract-images-and-text/extract-pdf-images-to-azure-blob.cs @@ -0,0 +1,115 @@ +using System; +using System.IO; +using Aspose.Pdf.Facades; // PdfExtractor +using Aspose.Pdf; // ImageFormat +using Azure.Storage.Blobs; // BlobContainerClient, BlobClient + +// --------------------------------------------------------------------------- +// Minimal stubs for Azure.Storage.Blobs when the real NuGet package is not +// referenced. They allow the sample to compile and run (the Upload method +// simply writes the stream to a local file in the current directory). +// Remove these stubs and add the official Azure.Storage.Blobs package for +// production use. +// --------------------------------------------------------------------------- +namespace Azure.Storage.Blobs +{ + public class BlobContainerClient + { + private readonly string _connectionString; + private readonly string _containerName; + public BlobContainerClient(string connectionString, string containerName) + { + _connectionString = connectionString; + _containerName = containerName; + } + public void CreateIfNotExists() + { + // No‑op stub – in real code this creates the container if missing. 
// Local stand-in for the SDK blob client: remembers a container/blob name pair and
// "uploads" by writing the stream to a file in the current working directory.
public class BlobClient
{
    private readonly string _containerName;
    private readonly string _blobName;

    public BlobClient(string containerName, string blobName)
    {
        _blobName = blobName;
        _containerName = containerName;
    }

    // Copies `content` (from its current position) into <cwd>/<blob name>.
    // overwrite == true  → an existing file is replaced.
    // overwrite == false → FileMode.CreateNew is used, so an existing file makes the
    //                      FileStream constructor throw.
    public void Upload(Stream content, bool overwrite = false)
    {
        FileMode mode = FileMode.CreateNew;
        if (overwrite)
        {
            mode = FileMode.Create;
        }

        string target = Path.Combine(Directory.GetCurrentDirectory(), _blobName);
        using (FileStream sink = new FileStream(target, mode))
        {
            content.CopyTo(sink);
        }
    }
}
// Console sample: binds "input.pdf" to a PdfConverter and saves each image the converter
// yields as a JPEG (quality 85) named image1.jpg, image2.jpg, ... in "ExtractedImages".
// NOTE(review): PdfConverter is normally the page-rendering facade; confirm whether the
// outputs are page renders rather than embedded images.
class Program
{
    static void Main()
    {
        const string sourcePdf = "input.pdf";       // source PDF
        const string jpegFolder = "ExtractedImages"; // folder for JPEGs
        const int jpegQuality = 85;                  // quality (0‑100)

        if (!File.Exists(sourcePdf))
        {
            Console.Error.WriteLine($"File not found: {sourcePdf}");
            return;
        }

        Directory.CreateDirectory(jpegFolder);

        try
        {
            using (PdfConverter pdfConverter = new PdfConverter())
            {
                pdfConverter.BindPdf(sourcePdf);

                // Must run before HasNextImage/GetNextImage are used.
                pdfConverter.DoConvert();

                for (int number = 1; pdfConverter.HasNextImage(); number++)
                {
                    string jpegPath = Path.Combine(jpegFolder, $"image{number}.jpg");

                    // ImageFormat is fully qualified because the file does not import
                    // System.Drawing.Imaging.
                    pdfConverter.GetNextImage(
                        jpegPath,
                        System.Drawing.Imaging.ImageFormat.Jpeg,
                        jpegQuality);
                }
            }

            Console.WriteLine($"Images extracted to folder: {jpegFolder}");
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Error: {ex.Message}");
        }
    }
}
// Console sample: extracts every image of "input.pdf" to a temporary PNG file and uploads
// it to the S3 bucket under "extracted-images/" with AES-256 server-side encryption.
// With the stub SDK declared in this file the upload only prints a line.
class Program
{
    static void Main()
    {
        const string inputPdfPath = "input.pdf";   // Path to source PDF
        const string bucketName = "my-s3-bucket";  // Target S3 bucket
        const string awsRegion = "us-east-1";      // AWS region (adjust as needed)

        if (!File.Exists(inputPdfPath))
        {
            Console.Error.WriteLine($"PDF file not found: {inputPdfPath}");
            return;
        }

        // One client and one transfer utility serve all uploads. The extractor is disposed
        // with the other resources (declared last, so it is released first).
        using (var s3Client = new Amazon.S3.AmazonS3Client(Amazon.RegionEndpoint.GetBySystemName(awsRegion)))
        using (var transferUtility = new Amazon.S3.TransferUtility(s3Client))
        using (var pdfDoc = new Document(inputPdfPath))
        using (var extractor = new PdfExtractor())
        {
            extractor.BindPdf(pdfDoc);
            extractor.ExtractImage();

            int imageIndex = 1;
            while (extractor.HasNextImage())
            {
                // The GUID keeps concurrent runs (or leftovers in the shared temp folder)
                // from colliding on the same file name.
                string tempImagePath = Path.Combine(
                    Path.GetTempPath(),
                    $"pdf_image_{imageIndex}_{Guid.NewGuid():N}.png");

                try
                {
                    // Save the next extracted image to the temporary file
                    extractor.GetNextImage(tempImagePath, ImageFormat.Png);

                    // Prepare the upload request with server‑side encryption (AES‑256)
                    var uploadRequest = new Amazon.S3.TransferUtilityUploadRequest
                    {
                        BucketName = bucketName,
                        FilePath = tempImagePath,
                        Key = $"extracted-images/image_{imageIndex}.png",
                        ServerSideEncryptionMethod = Amazon.S3.ServerSideEncryptionMethod.AES256
                    };

                    transferUtility.Upload(uploadRequest);
                    Console.WriteLine($"Uploaded image {imageIndex} to s3://{bucketName}/{uploadRequest.Key}");
                }
                finally
                {
                    // Remove the temp file even when extraction or upload throws.
                    TryDeleteFile(tempImagePath);
                }

                imageIndex++;
            }
        }

        Console.WriteLine("Image extraction and upload completed.");
    }

    // Best-effort cleanup: a temp file that cannot be removed is reported but never
    // fails the run.
    private static void TryDeleteFile(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException ex)
        {
            Console.Error.WriteLine($"Warning: could not delete '{path}': {ex.Message}");
        }
        catch (UnauthorizedAccessException ex)
        {
            Console.Error.WriteLine($"Warning: could not delete '{path}': {ex.Message}");
        }
    }
}
// Compile-time stand-ins for the two Azure Functions attributes this sample uses.
// They carry no behavior; the constructor arguments are accepted and discarded.
namespace Microsoft.Azure.WebJobs
{
    // Marks a method as a function entry point. The name is not stored by this stub.
    [AttributeUsage(AttributeTargets.Method, AllowMultiple = false, Inherited = true)]
    public sealed class FunctionNameAttribute : Attribute
    {
        public FunctionNameAttribute(string name)
        {
        }
    }

    // Marks a parameter as bound to a queue message. Only Connection is kept.
    [AttributeUsage(AttributeTargets.Parameter, AllowMultiple = false, Inherited = true)]
    public sealed class QueueTriggerAttribute : Attribute
    {
        public string Connection { get; set; }

        public QueueTriggerAttribute(string queueName)
        {
        }
    }
}
// File-system placeholder for the SDK blob client: a blob named B in container C is the
// local file "LocalStorage/C/B" (relative to the current directory).
public class BlobClient
{
    private readonly string _containerName;
    private readonly string _blobName;

    public BlobClient(string containerName, string blobName)
    {
        _containerName = containerName;
        _blobName = blobName;
    }

    // Completes with true when the backing file exists.
    public Task<bool> ExistsAsync() => Task.FromResult(File.Exists(GetLocalPath()));

    // Copies the backing file into `destination`.
    // The returned task faults with FileNotFoundException when the blob file is missing.
    public async Task DownloadToAsync(Stream destination)
    {
        var path = GetLocalPath();
        if (!File.Exists(path))
        {
            throw new FileNotFoundException($"Blob '{_blobName}' not found in container '{_containerName}'.");
        }

        // The copy is awaited inside the using scope, so the file stream is disposed only
        // after the copy has finished.
        using var source = File.OpenRead(path);
        await source.CopyToAsync(destination);
    }

    // Writes `source` (from its current position) to the backing file, creating the
    // container directory on demand. With overwrite == false an existing file makes the
    // returned task fault (FileMode.CreateNew).
    public async Task UploadAsync(Stream source, bool overwrite = false)
    {
        var path = GetLocalPath();
        var directory = Path.GetDirectoryName(path);
        if (!Directory.Exists(directory))
        {
            Directory.CreateDirectory(directory);
        }

        // Awaiting before the using scope ends guarantees all data is written and flushed
        // before the file handle is closed.
        using var file = new FileStream(path, overwrite ? FileMode.Create : FileMode.CreateNew);
        await source.CopyToAsync(file);
    }

    private string GetLocalPath() => Path.Combine("LocalStorage", _containerName, _blobName);
}
+ extractor.BindPdf(pdfStream); + + // Perform text extraction (Unicode encoding is default). + extractor.ExtractText(); + + // Store extracted text into a memory stream. + using (MemoryStream textStream = new MemoryStream()) + { + extractor.GetText(textStream); + textStream.Position = 0; // Reset for upload. + + // Prepare the output blob name (same as input but with .txt extension). + string textBlobName = Path.ChangeExtension(blobName, ".txt"); + BlobClient textBlob = textContainer.GetBlobClient(textBlobName); + + // Upload the extracted text. + await textBlob.UploadAsync(textStream, overwrite: true); + log.LogInformation($"Extracted text from '{blobName}' and uploaded as '{textBlobName}'."); + } + } + } + } + + // ------------------------------------------------------------------ + // Helper Main method to allow local testing without the Azure Functions runtime. + // This is optional and can be removed when the function is deployed. + // ------------------------------------------------------------------ + public static async Task Main(string[] args) + { + // Use the console logger when running locally. + ILogger logger = new Microsoft.Extensions.Logging.ConsoleLogger(); + // Expect a single argument: the blob name to process. 
// Console sample: writes the text of each page of "input.pdf" to
// "ExtractedPages/page_<n>.txt" and prints a single-line progress indicator.
class Program
{
    static void Main()
    {
        const string sourcePdf = "input.pdf";        // source PDF
        const string pagesFolder = "ExtractedPages"; // folder for per‑page text files

        if (!File.Exists(sourcePdf))
        {
            Console.Error.WriteLine($"File not found: {sourcePdf}");
            return;
        }

        Directory.CreateDirectory(pagesFolder);

        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(sourcePdf);

            // Text is extracted with UTF-16 (Encoding.Unicode).
            extractor.ExtractText(Encoding.Unicode);

            // NOTE(review): the page range is assigned after ExtractText has already run;
            // confirm against the Aspose docs whether it should be set before that call.
            int pageCount = extractor.Document.Pages.Count;
            extractor.StartPage = 1;
            extractor.EndPage = pageCount;

            // `page` names the output file and feeds the percentage; the extractor itself
            // decides when the loop ends.
            for (int page = 1; extractor.HasNextPageText(); page++)
            {
                extractor.GetNextPageText(Path.Combine(pagesFolder, $"page_{page}.txt"));

                double percent = (double)page / pageCount * 100;

                // '\r' rewinds to the start of the line so the indicator updates in place.
                Console.Write($"\rProcessing page {page}/{pageCount} ({percent:0.0}%)");
            }

            Console.WriteLine("\nExtraction completed.");
        }
    }
}
// Console sample: writes the text of each page of "input.pdf" to
// "ExtractedPages/Page_<n>.txt" using the PdfExtractor facade.
class Program
{
    static void Main()
    {
        const string inputPdf = "input.pdf";       // source PDF file
        const string outputDir = "ExtractedPages"; // folder for per‑page text files

        if (File.Exists(inputPdf))
        {
            // Safe to call when the folder already exists.
            Directory.CreateDirectory(outputDir);

            WritePageTextFiles(inputPdf, outputDir);

            Console.WriteLine("Text extraction completed.");
        }
        else
        {
            Console.Error.WriteLine($"Input file not found: {inputPdf}");
        }
    }

    // Binds pdfPath, runs text extraction with the extractor's default encoding and saves
    // one text file per page into targetDir, logging each file as it is written.
    private static void WritePageTextFiles(string pdfPath, string targetDir)
    {
        using (PdfExtractor extractor = new PdfExtractor())
        {
            extractor.BindPdf(pdfPath);
            extractor.ExtractText();

            for (int page = 1; extractor.HasNextPageText(); page++)
            {
                string pageFile = Path.Combine(targetDir, $"Page_{page}.txt");
                extractor.GetNextPageText(pageFile);
                Console.WriteLine($"Extracted page {page} to '{pageFile}'");
            }
        }
    }
}
---------------------------------------------------------------------------
+// Minimal stubs for Azure.Data.Tables (used when the Azure SDK package is not
+// referenced). These provide just enough functionality for the sample to
+// compile and run in a test environment. In a real project you should add the
+// NuGet package "Azure.Data.Tables" instead of using these stubs.
+// ---------------------------------------------------------------------------
+namespace Azure.Data.Tables
+{
+    using System.Collections.Generic;
+
+    public class TableEntity
+    {
+        public string PartitionKey { get; set; }
+        public string RowKey { get; set; }
+        private readonly Dictionary<string, object> _properties = new Dictionary<string, object>(StringComparer.OrdinalIgnoreCase);
+
+        public TableEntity(string partitionKey, string rowKey)
+        {
+            PartitionKey = partitionKey;
+            RowKey = rowKey;
+        }
+
+        // Indexer used in the sample: entity["Content"] = extractedText; reading a missing key yields null
+        public object this[string key]
+        {
+            get => _properties.TryGetValue(key, out var value) ? value : null;
+            set => _properties[key] = value;
+        }
+    }
+
+    public class TableClient
+    {
+        private readonly string _connectionString;
+        private readonly string _tableName;
+        // Simple in‑memory store (shared by all TableClient instances) to emulate Azure Table storage for demo purposes.
+        private static readonly Dictionary<(string Table, string PartitionKey, string RowKey), TableEntity> _store
+            = new Dictionary<(string, string, string), TableEntity>();
+
+        public TableClient(string connectionString, string tableName)
+        {
+            _connectionString = connectionString;
+            _tableName = tableName;
+        }
+
+        public void CreateIfNotExists()
+        {
+            // No‑op for the stub – in real SDK this would create the table if missing.
+        }
+
+        public void UpsertEntity(TableEntity entity)
+        {
+            var key = (_tableName, entity.PartitionKey, entity.RowKey);
+            _store[key] = entity; // Insert or replace.
+        }
+    }
+}
+
+class Program
+{
+    static void Main()
+    {
+        // Path to the source PDF
+        const string pdfPath = "input.pdf";
+
+        // Azure Table storage connection details (dummy values for the stub)
+        const string storageConnectionString = "DefaultEndpointsProtocol=https;AccountName=youraccount;AccountKey=yourkey;EndpointSuffix=core.windows.net";
+        const string tableName = "PdfTexts";
+
+        if (!File.Exists(pdfPath))
+        {
+            Console.Error.WriteLine($"PDF not found: {pdfPath}");
+            return;
+        }
+
+        // -------------------------------------------------
+        // Extract text from the PDF using PdfExtractor (Facades API)
+        // -------------------------------------------------
+        string extractedText;
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(pdfPath);
+
+            // Extract text using Unicode encoding (default)
+            extractor.ExtractText();
+
+            // Retrieve the extracted text into a memory stream and decode it with the SAME encoding used for extraction
+            using (MemoryStream ms = new MemoryStream())
+            {
+                extractor.GetText(ms);
+                extractedText = Encoding.Unicode.GetString(ms.ToArray());
+            }
+        }
+
+        // -------------------------------------------------
+        // Store the extracted text in Azure Table storage
+        // -------------------------------------------------
+        // Create a TableClient for the target table
+        var tableClient = new Azure.Data.Tables.TableClient(storageConnectionString, tableName);
+        tableClient.CreateIfNotExists();
+
+        // Use the PDF file name (without extension) as the PartitionKey (document ID)
+        string documentId = Path.GetFileNameWithoutExtension(pdfPath);
+
+        // RowKey must be unique; using a GUID ensures uniqueness
+        var entity = new Azure.Data.Tables.TableEntity(documentId, Guid.NewGuid().ToString());
+        entity["Content"] = extractedText; // store the extracted text
+
+        // Insert or replace the entity in the table
+        tableClient.UpsertEntity(entity);
+
+        Console.WriteLine($"Extracted text stored in Azure Table '{tableName}' with PartitionKey='{documentId}'.");
+    }
+}
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-gzip.cs b/facades-extract-images-and-text/extract-pdf-text-to-gzip.cs
new file mode 100644
index 00000000..6d7b0920
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-to-gzip.cs
@@ -0,0 +1,40 @@
+using System;
+using System.IO;
+using System.IO.Compression;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "input.pdf"; // PDF to read
+        const string outputGzipPath = "output.txt.gz"; // gzip file that receives the text
+
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdfPath}");
+            return;
+        }
+
+        // The facade is disposed when the using block ends
+        using (var pdfExtractor = new PdfExtractor())
+        {
+            // Attach the input document
+            pdfExtractor.BindPdf(inputPdfPath);
+
+            // Parameterless ExtractText(): Unicode encoding (default)
+            pdfExtractor.ExtractText();
+
+            // Destination file on disk (created or overwritten)
+            using (var target = new FileStream(outputGzipPath, FileMode.Create, FileAccess.Write))
+            // Compressing wrapper around the destination file
+            using (var compressed = new GZipStream(target, CompressionLevel.Optimal))
+            {
+                // The extractor writes straight into the compressing stream
+                pdfExtractor.GetText(compressed);
+            }
+        }
+
+        Console.WriteLine($"Text extracted and compressed to '{outputGzipPath}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-postgres.cs b/facades-extract-images-and-text/extract-pdf-text-to-postgres.cs
new file mode 100644
index 00000000..fd187d99
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-to-postgres.cs
@@ -0,0 +1,118 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf.Facades; // PdfExtractor resides here
+
+// ---------------------------------------------------------------------------
+// Minimal stub implementation for Npgsql when the
real NuGet package is not
+// referenced. This allows the sample to compile and run (without actual DB
+// connectivity) in environments where Npgsql is unavailable.
+// ---------------------------------------------------------------------------
+namespace Npgsql
+{
+    // Simple stub for NpgsqlConnection
+    public class NpgsqlConnection : IDisposable
+    {
+        private readonly string _connectionString;
+        public NpgsqlConnection(string connectionString) => _connectionString = connectionString;
+        public void Open() => Console.WriteLine($"[Stub] Opened connection: {_connectionString}");
+        public void Close() => Console.WriteLine("[Stub] Closed connection");
+        public void Dispose() => Close();
+    }
+
+    // Simple stub for NpgsqlCommand
+    public class NpgsqlCommand : IDisposable
+    {
+        private readonly string _commandText;
+        private readonly NpgsqlConnection _connection;
+        private readonly NpgsqlParameterCollection _parameters = new NpgsqlParameterCollection();
+        public NpgsqlCommand(string commandText, NpgsqlConnection connection)
+        {
+            _commandText = commandText;
+            _connection = connection;
+        }
+        public NpgsqlParameterCollection Parameters => _parameters;
+        public int ExecuteNonQuery()
+        {
+            Console.WriteLine("[Stub] Executing SQL: " + _commandText);
+            foreach (var p in _parameters)
+                Console.WriteLine($" Parameter {p.Name} = {p.Value}");
+            // Return a dummy affected‑rows count
+            return 1;
+        }
+        public void Dispose() { /* no resources to free */ }
+    }
+
+    // Simple stub for a parameter collection (a typed list, so foreach above yields NpgsqlParameter)
+    public class NpgsqlParameterCollection : System.Collections.Generic.List<NpgsqlParameter>
+    {
+        public void AddWithValue(string name, object value) => Add(new NpgsqlParameter(name, value));
+    }
+
+    // Simple stub for a parameter object
+    public class NpgsqlParameter
+    {
+        public string Name { get; }
+        public object Value { get; }
+        public NpgsqlParameter(string name, object value)
+        {
+            Name = name;
+            Value = value;
+        }
+    }
+}
+
+class PdfTextToPostgres
+{
+    static void Main()
+    {
+        // Path to the source PDF file
+        const string pdfPath = "input.pdf";
+
+        // PostgreSQL connection string – adjust host, credentials, and database as needed
+        const string connectionString = "Host=localhost;Username=postgres;Password=secret;Database=mydb";
+
+        // Ensure the PDF file exists before processing
+        if (!File.Exists(pdfPath))
+        {
+            Console.Error.WriteLine($"PDF file not found: {pdfPath}");
+            return;
+        }
+
+        // Extract text from the PDF using Aspose.Pdf.Facades.PdfExtractor
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(pdfPath);
+
+            // Perform text extraction (Unicode encoding is default)
+            extractor.ExtractText();
+
+            // Capture the extracted text into a memory stream and decode it with the SAME (Unicode) encoding
+            using (MemoryStream textStream = new MemoryStream())
+            {
+                extractor.GetText(textStream); // Writes extracted text to the stream
+                string extractedText = Encoding.Unicode.GetString(textStream.ToArray());
+
+                // Insert the extracted text into PostgreSQL (or the stub above)
+                using (Npgsql.NpgsqlConnection connection = new Npgsql.NpgsqlConnection(connectionString))
+                {
+                    connection.Open();
+
+                    // Example table: pdf_texts(id SERIAL PRIMARY KEY, file_name TEXT, content TEXT)
+                    const string insertSql = "INSERT INTO pdf_texts (file_name, content) VALUES (@file, @content)";
+
+                    using (Npgsql.NpgsqlCommand command = new Npgsql.NpgsqlCommand(insertSql, connection))
+                    {
+                        command.Parameters.AddWithValue("file", Path.GetFileName(pdfPath));
+                        command.Parameters.AddWithValue("content", extractedText);
+                        command.ExecuteNonQuery();
+                    }
+                }
+
+                Console.WriteLine("Text extraction and database insertion completed successfully.");
+            }
+            // No need to call extractor.Close(); the using block disposes it.
+        }
+    }
+}
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-stringbuilder.cs b/facades-extract-images-and-text/extract-pdf-text-to-stringbuilder.cs
new file mode 100644
index 00000000..139ffec1
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-to-stringbuilder.cs
@@ -0,0 +1,55 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "input.pdf";
+        const string outputTextPath = "extracted.txt";
+
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"File not found: {inputPdfPath}");
+            return;
+        }
+
+        // Accumulates the decoded text so it can be post-processed below
+        var textBuffer = new StringBuilder();
+
+        // Pull the text out of the PDF through the PdfExtractor facade
+        using (var pdfExtractor = new PdfExtractor())
+        {
+            // Attach the input document
+            pdfExtractor.BindPdf(inputPdfPath);
+
+            // Parameterless ExtractText(): Unicode encoding (default)
+            pdfExtractor.ExtractText();
+
+            // Receive the raw text bytes in memory
+            using (var rawBytes = new MemoryStream())
+            {
+                pdfExtractor.GetText(rawBytes);
+                rawBytes.Position = 0; // rewind the stream
+
+                // Decode the captured bytes as Unicode text
+                string decoded = Encoding.Unicode.GetString(rawBytes.ToArray());
+
+                // Hand the text over to the builder
+                textBuffer.Append(decoded);
+            }
+        }
+
+        // Example manipulation: trim, then turn CRLF into LF
+        string unpadded = textBuffer.ToString().Trim();
+        string processed = unpadded
+            .Replace("\r\n", "\n");
+
+        // Persist the result, again as Unicode
+        File.WriteAllText(outputTextPath, processed, Encoding.Unicode);
+
+        Console.WriteLine($"Text extracted and saved to '{outputTextPath}'.");
+    }
+}
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-stringwriter.cs b/facades-extract-images-and-text/extract-pdf-text-to-stringwriter.cs
new file mode 100644
index 00000000..f48dced2
--- /dev/null
+++
b/facades-extract-images-and-text/extract-pdf-text-to-stringwriter.cs
@@ -0,0 +1,49 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string pdfPath = "input.pdf";
+
+        if (!File.Exists(pdfPath))
+        {
+            Console.Error.WriteLine($"File not found: {pdfPath}");
+            return;
+        }
+
+        // Collects the extracted text so it can be logged afterwards
+        using (var captured = new StringWriter())
+        {
+            // Dispose the extractor as soon as the text has been read
+            using (var pdfExtractor = new PdfExtractor())
+            {
+                // Attach the input document
+                pdfExtractor.BindPdf(pdfPath);
+
+                // Request Unicode-encoded text
+                pdfExtractor.ExtractText(Encoding.Unicode);
+
+                // Receive the text bytes in memory
+                using (var rawBytes = new MemoryStream())
+                {
+                    pdfExtractor.GetText(rawBytes);
+                    rawBytes.Position = 0; // rewind before reading back
+
+                    // Decode with the same encoding that was requested above
+                    using (var decoder = new StreamReader(rawBytes, Encoding.Unicode))
+                    {
+                        // Copy everything the reader yields into the StringWriter
+                        captured.Write(decoder.ReadToEnd());
+                    }
+                }
+            }
+
+            // Example output: print the captured text (a logging framework could take it instead)
+            Console.WriteLine(captured.ToString());
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-temp-file-verify.cs b/facades-extract-images-and-text/extract-pdf-text-to-temp-file-verify.cs
new file mode 100644
index 00000000..e0c9621c
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-to-temp-file-verify.cs
@@ -0,0 +1,56 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        // Input PDF file
+        const string inputPdf = "input.pdf";
+
+        // Temporary text file to store extracted content
+        string tempTxt = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString() + ".txt");
+
+        // Ensure the input
file exists
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input PDF not found: {inputPdf}");
+            return;
+        }
+
+        // Run the extraction through the PdfExtractor facade
+        using (var pdfExtractor = new PdfExtractor())
+        {
+            // Attach the input document
+            pdfExtractor.BindPdf(inputPdf);
+
+            // Parameterless ExtractText(): Unicode encoding by default
+            pdfExtractor.ExtractText();
+
+            // Decode the extracted bytes into an in-memory string
+            string inMemoryText;
+            using (var rawBytes = new MemoryStream())
+            {
+                pdfExtractor.GetText(rawBytes); // extractor fills the stream
+                inMemoryText = Encoding.Unicode.GetString(rawBytes.ToArray());
+            }
+
+            // Round trip, step 1: write the string to the temporary file as Unicode
+            File.WriteAllText(tempTxt, inMemoryText, Encoding.Unicode);
+
+            // Round trip, step 2: read the file back with the same encoding
+            string roundTripped = File.ReadAllText(tempTxt, Encoding.Unicode);
+
+            // Ordinal comparison of the in‑memory text against what came back from disk
+            bool identical = string.Equals(inMemoryText, roundTripped, StringComparison.Ordinal);
+            Console.WriteLine(identical
+                ? "Verification succeeded: extracted text matches file content."
+                : "Verification failed: mismatch between extracted text and file content.");
+        }
+
+        // Clean up the temporary file
+        try { File.Delete(tempTxt); } catch { /* ignore cleanup errors */ }
+    }
+}
diff --git a/facades-extract-images-and-text/extract-pdf-text-to-utf8-file.cs b/facades-extract-images-and-text/extract-pdf-text-to-utf8-file.cs
new file mode 100644
index 00000000..e4c5a0c4
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-to-utf8-file.cs
@@ -0,0 +1,42 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "input.pdf"; // PDF to read
+        const string outputTxtPath = "extracted.txt"; // text file to write
+
+        // Nothing to do without the input document
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"File not found: {inputPdfPath}");
+            return;
+        }
+
+        // Extract through the PdfExtractor facade, asking for UTF‑8 output
+        using (var pdfExtractor = new PdfExtractor())
+        {
+            // Attach the input document
+            pdfExtractor.BindPdf(inputPdfPath);
+
+            // Request UTF‑8 encoded text
+            pdfExtractor.ExtractText(Encoding.UTF8);
+
+            // Receive the text bytes in memory
+            using (var rawBytes = new MemoryStream())
+            {
+                pdfExtractor.GetText(rawBytes); // extractor fills the stream
+                byte[] utf8Bytes = rawBytes.ToArray();
+                string decoded = Encoding.UTF8.GetString(utf8Bytes);
+                // Write the decoded text back out as UTF‑8
+                File.WriteAllText(outputTxtPath, decoded, Encoding.UTF8);
+            }
+        }
+
+        Console.WriteLine($"Text extracted to '{outputTxtPath}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-pdf-text-with-cancellation.cs b/facades-extract-images-and-text/extract-pdf-text-with-cancellation.cs
new file mode 100644
index 00000000..9b56ae92
--- /dev/null
+++ b/facades-extract-images-and-text/extract-pdf-text-with-cancellation.cs
@@ -0,0 +1,88 @@
+using System;
+using System.IO;
+using System.Threading;
+using Aspose.Pdf.Facades;
+
+class PdfExtractionWithCancellation
+{
+    // Extracts text from the PDF at pdfPath page‑by‑page and writes it to outputPath.
+    // Throws FileNotFoundException if pdfPath is missing, OperationCanceledException if the token is cancelled between pages.
+    public static void ExtractText(string pdfPath, string outputPath, CancellationToken cancellationToken)
+    {
+        // Validate input.
+        if (!File.Exists(pdfPath))
+            throw new FileNotFoundException($"Input PDF not found: {pdfPath}");
+
+        // Ensure the output directory exists.
+        string outDir = Path.GetDirectoryName(outputPath);
+        if (!string.IsNullOrEmpty(outDir) && !Directory.Exists(outDir))
+            Directory.CreateDirectory(outDir);
+
+        // Use the PdfExtractor facade inside a using block for deterministic disposal.
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the source PDF file.
+            extractor.BindPdf(pdfPath);
+
+            // Configure the page range to cover the whole document.
+            extractor.StartPage = 1;
+            extractor.EndPage = extractor.Document.Pages.Count; // Document property is available after binding.
+
+            // Initiate the extraction process.
+            extractor.ExtractText();
+
+            // Open a stream for the final output text file.
+            using (FileStream outStream = new FileStream(outputPath, FileMode.Create, FileAccess.Write))
+            {
+                // Write one page per iteration until the extractor reports no more text.
+                while (extractor.HasNextPageText())
+                {
+                    // Stop between pages as soon as cancellation is requested.
+                    // Throwing OperationCanceledException (instead of printing a
+                    // message and breaking out of the loop) lets the caller tell a
+                    // cancelled run from a completed one. Pages already written
+                    // remain in the output file, and the enclosing using blocks
+                    // still dispose the stream and the extractor, so no explicit
+                    // extractor.Close() call is required on this path.
+                    cancellationToken.ThrowIfCancellationRequested();
+
+                    // Retrieve the next page's text into the output stream.
+                    // The overload with (Stream) writes the page text directly.
+                    extractor.GetNextPageText(outStream);
+                }
+            }
+
+            // No explicit extractor.Close() here either: leaving the using block
+            // disposes the extractor on the normal path as well as on the
+            // cancellation/exception path.
+        }
+    }
+
+    // Example usage.
+    static void Main()
+    {
+        string inputPdf = "sample.pdf";
+        string outputTxt = "sample_extracted.txt";
+
+        // Create a CancellationTokenSource that can be triggered by the user (e.g., after 5 seconds).
+        using (CancellationTokenSource cts = new CancellationTokenSource())
+        {
+            // For demonstration, cancel after 5 seconds.
+            cts.CancelAfter(TimeSpan.FromSeconds(5));
+
+            try
+            {
+                ExtractText(inputPdf, outputTxt, cts.Token);
+                Console.WriteLine($"Extraction finished. Output saved to '{outputTxt}'.");
+            }
+            catch (OperationCanceledException)
+            {
+                Console.WriteLine("Operation was cancelled.");
+            }
+            catch (Exception ex)
+            {
+                Console.Error.WriteLine($"Error: {ex.Message}");
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-resource-defined-images.cs b/facades-extract-images-and-text/extract-resource-defined-images.cs
new file mode 100644
index 00000000..0ed8d4f8
--- /dev/null
+++ b/facades-extract-images-and-text/extract-resource-defined-images.cs
@@ -0,0 +1,46 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+using Aspose.Pdf;
+using System.Drawing.Imaging;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "sample.pdf";
+        const string outputDir = "ExtractedImages";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // Ensure the output directory exists
+        Directory.CreateDirectory(outputDir);
+
+        // Create a PdfExtractor instance and bind the PDF file
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(inputPdf);
+
+            // Explicitly set the extraction mode to DefinedInResources
+            extractor.ExtractImageMode = ExtractImageMode.DefinedInResources;
+
+            // Extract images from the document
+            extractor.ExtractImage();
+
+            int imageIndex = 1;
+            // Retrieve each extracted image and save it as PNG
+            while (extractor.HasNextImage())
+            {
+                string outPath = Path.Combine(outputDir, $"image_{imageIndex}.png");
+                
extractor.GetNextImage(outPath, ImageFormat.Png);
+                imageIndex++;
+            }
+        }
+
+        Console.WriteLine("Image extraction completed.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-text-and-images-from-pdf.cs b/facades-extract-images-and-text/extract-text-and-images-from-pdf.cs
new file mode 100644
index 00000000..9be3095f
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-and-images-from-pdf.cs
@@ -0,0 +1,60 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+using System.Drawing.Imaging;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "input.pdf";
+        const string outputTextPath = "extracted_text.txt";
+        const string outputImagesFolder = "ExtractedImages";
+
+        // Nothing to do without the input document
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdfPath}");
+            return;
+        }
+
+        // Make sure the image folder is there before writing into it
+        Directory.CreateDirectory(outputImagesFolder);
+
+        // One extractor instance serves both the text and the image pass
+        using (var pdfExtractor = new PdfExtractor())
+        {
+            // Attach the input document
+            pdfExtractor.BindPdf(inputPdfPath);
+
+            // Text mode selector (0 = pure text mode, 1 = raw ordering mode)
+            pdfExtractor.ExtractTextMode = 0;
+
+            // Image extraction runs with the default image mode.
+            // Default mode (ExtractImageMode.DefinedInResources) extracts all images.
+            // Uncomment the next line to extract only actually used images.
+            // pdfExtractor.ExtractImageMode = ExtractImageMode.ActuallyUsed;
+
+            // Run both extraction passes
+            pdfExtractor.ExtractText();
+            pdfExtractor.ExtractImage();
+
+            // All text goes into a single file
+            pdfExtractor.GetText(outputTextPath);
+
+            // Write every extracted image as a PNG; the counter only feeds the
+            // file name and HasNextImage() ends the loop
+            for (int imageNumber = 1; pdfExtractor.HasNextImage(); imageNumber++)
+            {
+                // Target file for this image
+                string pngPath = Path.Combine(outputImagesFolder, $"image_{imageNumber}.png");
+                pdfExtractor.GetNextImage(pngPath, ImageFormat.Png);
+            }
+
+            // Explicit close kept from the original sample (the using block disposes as well)
+            pdfExtractor.Close();
+        }
+
+        Console.WriteLine("Text and images have been extracted successfully.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-text-from-password-protected-pdf.cs b/facades-extract-images-and-text/extract-text-from-password-protected-pdf.cs
new file mode 100644
index 00000000..0f1dbdc3
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-from-password-protected-pdf.cs
@@ -0,0 +1,30 @@
+using System;
+using System.IO;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdf = "protected.pdf";
+        const string outputTxt = "extracted.txt";
+        const string ownerPassword = "ownerpass";
+
+        if (!File.Exists(inputPdf))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdf}");
+            return;
+        }
+
+        // PdfExtractor handles decryption when the correct password is supplied.
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            extractor.Password = ownerPassword; // Owner password for the protected PDF
+            extractor.BindPdf(inputPdf); // Load the PDF file
+            extractor.ExtractText(); // Perform text extraction
+            extractor.GetText(outputTxt); // Save extracted text to a file
+        }
+
+        Console.WriteLine($"Text successfully extracted to '{outputTxt}'.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-text-from-pdf-and-encrypt-aes.cs b/facades-extract-images-and-text/extract-text-from-pdf-and-encrypt-aes.cs
new file mode 100644
index 00000000..899b902d
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-from-pdf-and-encrypt-aes.cs
@@ -0,0 +1,94 @@
+using System;
+using System.IO;
+using System.Security.Cryptography;
+using System.Text;
+using Aspose.Pdf.Facades; // PdfExtractor resides here
+
+class Program
+{
+    static void Main()
+    {
+        const string pdfPath = "input.pdf"; // source PDF
+        const string txtPath = "extracted.txt"; // intermediate plain text
+        const string encryptedPath = "extracted.enc"; // AES‑encrypted output
+        const string password = "StrongPassword123"; // password for encryption
+
+        // Verify that the source PDF exists before proceeding
+        if (!File.Exists(pdfPath))
+        {
+            Console.WriteLine($"Source PDF '{pdfPath}' not found. Operation aborted.");
+            return;
+        }
+
+        // -----------------------------------------------------------------
+        // 1. Extract all text from the PDF using Aspose.Pdf.Facades.PdfExtractor
+        // -----------------------------------------------------------------
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(pdfPath);
+
+            // Extract text using Unicode encoding (default)
+            extractor.ExtractText();
+
+            // Save the extracted text to a temporary .txt file
+            extractor.GetText(txtPath);
+        }
+
+        // ---------------------------------------------------------------
+        // 2. Encrypt the resulting .txt file with AES (CBC, 256‑bit key)
+        // ---------------------------------------------------------------
+        // Derive 48 bytes in ONE PBKDF2 call and split them into key and IV. Two calls with identical
+        // inputs would make the IV equal to the first 16 bytes of the key; a decryptor must derive the same way.
+        byte[] salt = GenerateRandomBytes(16); // random salt stored with the ciphertext
+        const int iterations = 100_000;
+        byte[] derived = Rfc2898DeriveBytes.Pbkdf2(password, salt, iterations, HashAlgorithmName.SHA256, 48); // 32 key bytes + 16 IV bytes
+        byte[] key = derived[..32], iv = derived[32..]; // disjoint slices: 256‑bit key, 128‑bit IV
+
+        // Read the plain‑text bytes
+        byte[] plainBytes = File.ReadAllBytes(txtPath);
+
+        // Perform AES encryption
+        byte[] cipherBytes;
+        using (Aes aes = Aes.Create())
+        {
+            aes.KeySize = 256;
+            aes.BlockSize = 128;
+            aes.Mode = CipherMode.CBC;
+            aes.Padding = PaddingMode.PKCS7;
+            aes.Key = key;
+            aes.IV = iv;
+
+            using (ICryptoTransform encryptor = aes.CreateEncryptor())
+            using (MemoryStream msCipher = new MemoryStream())
+            {
+                // Prepend the salt so it can be used during decryption
+                msCipher.Write(salt, 0, salt.Length);
+                using (CryptoStream cs = new CryptoStream(msCipher, encryptor, CryptoStreamMode.Write))
+                {
+                    cs.Write(plainBytes, 0, plainBytes.Length);
+                }
+                cipherBytes = msCipher.ToArray();
+            }
+        }
+
+        // Write the encrypted data (salt followed by ciphertext) to the output file
+        File.WriteAllBytes(encryptedPath, cipherBytes);
+
+        // Optional: delete the intermediate plain‑text file for security
+        try { File.Delete(txtPath); } catch { /* ignore any error */ }
+
+        Console.WriteLine($"Text extracted from '{pdfPath}' and encrypted to '{encryptedPath}'.");
+    }
+
+    // Helper: generate cryptographically strong random bytes
+    private static byte[] GenerateRandomBytes(int count)
+    {
+        byte[] bytes = new byte[count];
+        using (RandomNumberGenerator rng = RandomNumberGenerator.Create())
+        {
+            rng.GetBytes(bytes);
+        }
+        return bytes;
+    }
+}
diff --git
a/facades-extract-images-and-text/extract-text-from-pdf-byte-array.cs b/facades-extract-images-and-text/extract-text-from-pdf-byte-array.cs
new file mode 100644
index 00000000..11e6f1c1
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-from-pdf-byte-array.cs
@@ -0,0 +1,49 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Text.Json;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main(string[] args)
+    {
+        // Expect a single argument: path to the PDF file to process
+        if (args.Length == 0)
+        {
+            Console.Error.WriteLine("Usage: dotnet run -- <path-to-pdf>");
+            return;
+        }
+
+        string pdfPath = args[0];
+        if (!File.Exists(pdfPath))
+        {
+            Console.Error.WriteLine($"File not found: {pdfPath}");
+            return;
+        }
+
+        byte[] pdfBytes = File.ReadAllBytes(pdfPath);
+        string extractedText = ExtractTextFromPdf(pdfBytes);
+
+        // Return the extracted text as a JSON object ({"text": "..."}) on stdout
+        var result = new { text = extractedText };
+        string json = JsonSerializer.Serialize(result);
+        Console.WriteLine(json);
+    }
+
+    private static string ExtractTextFromPdf(byte[] pdfBytes)
+    {
+        if (pdfBytes == null || pdfBytes.Length == 0)
+            return string.Empty;
+
+        using var pdfStream = new MemoryStream(pdfBytes);
+        using var extractor = new PdfExtractor();
+        extractor.BindPdf(pdfStream); // bind the PDF from the stream
+        extractor.ExtractText(); // perform text extraction (pure mode)
+
+        using var textStream = new MemoryStream();
+        extractor.GetText(textStream);
+        // Aspose returns UTF‑16LE (Unicode) bytes
+        return Encoding.Unicode.GetString(textStream.ToArray());
+    }
+}
diff --git a/facades-extract-images-and-text/extract-text-from-pdf-bytes.cs b/facades-extract-images-and-text/extract-text-from-pdf-bytes.cs
new file mode 100644
index 00000000..91fd0019
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-from-pdf-bytes.cs
@@ -0,0 +1,97 @@
+using System;
+using System.IO;
+using System.Text;
+using System.Runtime.InteropServices;
+using Aspose.Pdf;
+using
Aspose.Pdf.Facades;
+using Aspose.Pdf.Text; // Added namespace for TextFragment
+
+class PdfTextExtractor
+{
+    // Extracts all text from a PDF supplied as a byte array and returns it as a string.
+    // Throws ArgumentException when pdfBytes is null or empty.
+    public static string ExtractTextFromPdfBytes(byte[] pdfBytes)
+    {
+        if (pdfBytes == null || pdfBytes.Length == 0)
+            throw new ArgumentException("PDF byte array is null or empty.", nameof(pdfBytes));
+
+        // Bind the PDF from the in‑memory stream – no file I/O.
+        using (var pdfStream = new MemoryStream(pdfBytes))
+        using (var extractor = new PdfExtractor())
+        {
+            extractor.BindPdf(pdfStream);
+            extractor.ExtractText();
+
+            // Get the extracted text into another memory stream.
+            using (var textStream = new MemoryStream())
+            {
+                extractor.GetText(textStream);
+                // Parameterless ExtractText() uses Unicode (UTF‑16LE), so decode with the same encoding.
+                return Encoding.Unicode.GetString(textStream.ToArray());
+            }
+        }
+    }
+
+    // Helper that walks the inner‑exception chain looking for a DllNotFoundException.
+    private static bool ContainsDllNotFound(Exception? ex)
+    {
+        while (ex != null)
+        {
+            if (ex is DllNotFoundException)
+                return true;
+            ex = ex.InnerException;
+        }
+        return false;
+    }
+
+    // Returns the bytes of a pre‑generated one‑page PDF, decoded from an embedded Base64 string.
+    // Used as a fallback when Document.Save fails because libgdiplus is missing.
+    // NOTE(review): the embedded content stream does not look like valid PDF text operators – verify text can be extracted from it.
+    private static byte[] GetFallbackPdfBytes()
+    {
+        const string base64Pdf = "JVBERi0xLjQKJcfsj6IKMSAwIG9iago8PAovVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIKPj4KZW5kb2JqCjIgMCBvYmoKPDwKL1R5cGUgL1BhZ2VzIC9Db3VudCAxIC9LaWRzIFsgMyAwIFIgXQo+PgplbmRvYmoKMyAwIG9iago8PAovVHlwZSAvUGFnZQovUGFyZW50IDIgMCBSCi9NZWRpYUJveCBbMCAwIDYxMiA3OTJdCi9Db250ZW50cyA0IDAgUgo+PgplbmRvYmoKNCAwIG9iago8PAovTGVuZ3RoIDU2Pj4Kc3RyZWFtCkJUClRleHQgSGVsbG8gQXNwb3NlIFBERiEKRVQKZW5kc3RyZWFtCmVuZG9iagp4cmVmCjAgNQowMDAwMDAwMDAwIDY1NTM1IGYgCjAwMDAwMDAxMTUgMDAwMDAgbiAKMDAwMDAwMDA3OSAwMDAwMCBuIAowMDAwMDAwMTYyIDAwMDAwIG4gCjAwMDAwMDAyMzQgMDAwMDAgbiAKdHJhaWxlcgo8PAovU2l6ZSA1Ci9Sb290IDEgMCBSCj4+CnN0YXJ0eHJlZgowLjAKJSVFT0YK";
+        return Convert.FromBase64String(base64Pdf);
+    }
+
+    // Example usage – creates a PDF completely in memory and extracts its text.
+    static void Main()
+    {
+        byte[] pdfBytes;
+        // Build a tiny PDF document in memory (no disk access).
+        using (var doc = new Document())
+        {
+            var page = doc.Pages.Add();
+            page.Paragraphs.Add(new TextFragment("Hello Aspose PDF!"));
+            using (var ms = new MemoryStream())
+            {
+                // NOTE(review): per the original author, Document.Save can fail on non‑Windows hosts with a
+                // TypeInitializationException when the native libgdiplus is missing – confirm for your Aspose version.
+                // Guard the call and fall back to a pre‑generated PDF if needed.
+                if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+                {
+                    doc.Save(ms);
+                }
+                else
+                {
+                    try
+                    {
+                        doc.Save(ms);
+                    }
+                    catch (TypeInitializationException ex) when (ContainsDllNotFound(ex))
+                    {
+                        Console.WriteLine("GDI+ (libgdiplus) not available – using fallback PDF bytes.");
+                        pdfBytes = GetFallbackPdfBytes();
+                        // Skip the rest of the using block – we already have pdfBytes (leaving the blocks still disposes ms and doc).
+                        goto ExtractAndPrint;
+                    }
+                }
+                pdfBytes = ms.ToArray();
+            }
+        }
+
+        ExtractAndPrint:
+        string text = ExtractTextFromPdfBytes(pdfBytes);
+        Console.WriteLine("Extracted Text:");
+        Console.WriteLine(text);
+    }
+}
diff --git a/facades-extract-images-and-text/extract-text-from-pdf-pages-with-progress.cs b/facades-extract-images-and-text/extract-text-from-pdf-pages-with-progress.cs
new file mode 100644
index 00000000..4004ba86
--- /dev/null
+++ b/facades-extract-images-and-text/extract-text-from-pdf-pages-with-progress.cs
@@ -0,0 +1,51 @@
+using System;
+using System.IO;
+using System.Text;
+using Aspose.Pdf;
+using Aspose.Pdf.Facades;
+
+class Program
+{
+    static void Main()
+    {
+        const string inputPdfPath = "large.pdf";
+        const string outputFolder = "ExtractedPages";
+
+        if (!File.Exists(inputPdfPath))
+        {
+            Console.Error.WriteLine($"Input file not found: {inputPdfPath}");
+            return;
+        }
+
+        // Ensure the output directory exists
+        Directory.CreateDirectory(outputFolder);
+
+        // Use PdfExtractor inside a using block to guarantee disposal
+        using (PdfExtractor extractor = new PdfExtractor())
+        {
+            // Bind the PDF file to the extractor
+            extractor.BindPdf(inputPdfPath);
+
+            // Extract text using Unicode encoding
+            extractor.ExtractText(Encoding.Unicode);
+
+            // Total number of pages in the source PDF
+            int totalPages = extractor.Document.Pages.Count;
+            int currentPage = 1;
+
+            // Iterate through each page's text
+            while (extractor.HasNextPageText())
+            {
+                string outputFile = Path.Combine(outputFolder, $"Page_{currentPage}.txt");
+                extractor.GetNextPageText(outputFile);
+
+                // Report progress to the console
+                Console.WriteLine($"Processed page {currentPage} of {totalPages}");
+
+                currentPage++;
+            }
+        }
+
+        Console.WriteLine("Text extraction completed.");
+    }
+}
\ No newline at end of file
diff --git a/facades-extract-images-and-text/extract-text-from-pdf-to-utf8-txt.cs b/facades-extract-images-and-text/extract-text-from-pdf-to-utf8-txt.cs
new file mode 100644
index
/// <summary>
/// Console sample: reads "input.pdf", asks PdfExtractor for its text in UTF-8
/// and stores the result in "output.txt". Any failure during extraction is
/// reported on standard error.
/// </summary>
class Program
{
    static void Main()
    {
        const string inputPdfPath = "input.pdf";   // source PDF
        const string outputTxtPath = "output.txt"; // destination text file

        // Nothing to do when the source document is missing.
        bool sourceExists = File.Exists(inputPdfPath);
        if (!sourceExists)
        {
            Console.Error.WriteLine($"Error: File not found – {inputPdfPath}");
            return;
        }

        try
        {
            SaveTextAsUtf8(inputPdfPath, outputTxtPath);
            Console.WriteLine($"Text extracted successfully to '{outputTxtPath}'.");
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Extraction failed: {ex.Message}");
        }
    }

    /// <summary>
    /// Binds <paramref name="pdfPath"/> to a PdfExtractor, extracts its text with
    /// UTF-8 requested, and writes it to <paramref name="txtPath"/>.
    /// The extractor is disposed before this method returns.
    /// </summary>
    static void SaveTextAsUtf8(string pdfPath, string txtPath)
    {
        using (var facade = new PdfExtractor())
        {
            facade.BindPdf(pdfPath);            // load
            facade.ExtractText(Encoding.UTF8);  // extract
            facade.GetText(txtPath);            // save
        }
    }
}
/// <summary>
/// Console sample: extracts the text of the PDF named by the single command-line
/// argument and prints it to standard output.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point.
    /// </summary>
    /// <param name="args">Exactly one element: the path of the PDF to read.</param>
    static void Main(string[] args)
    {
        // Expect a single argument: path to the PDF file.
        if (args.Length != 1)
        {
            // The placeholder was missing from the usage text, leaving a dangling space.
            Console.Error.WriteLine("Usage: ExtractText <pdf-path>");
            Environment.ExitCode = 1; // let scripts detect the failure
            return;
        }

        string pdfPath = args[0];

        if (!File.Exists(pdfPath))
        {
            Console.Error.WriteLine($"File not found: {pdfPath}");
            Environment.ExitCode = 1;
            return;
        }

        // One encoding is used for both extraction and decoding so the two
        // steps cannot drift apart.
        Encoding textEncoding = Encoding.Unicode;

        using (PdfExtractor extractor = new PdfExtractor())
        {
            // Bind the PDF file to the extractor.
            extractor.BindPdf(pdfPath);

            // Extract text, stating the encoding instead of relying on the default.
            extractor.ExtractText(textEncoding);

            // Retrieve the extracted text into a memory stream.
            using (MemoryStream ms = new MemoryStream())
            {
                extractor.GetText(ms);

                // Decode the stream bytes with the encoding requested above.
                string extractedText = textEncoding.GetString(ms.ToArray());

                // Output the text to the console.
                Console.WriteLine(extractedText);
            }
        }
    }
}
namespace PdfProcessingService
{
    /// <summary>
    /// Batch sample: converts every "*.pdf" file in an input directory into a
    /// same-named ".txt" file in an output directory using PdfExtractor.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Resolves the input/output directories (environment variables INPUT_DIR
        /// and OUTPUT_DIR, defaulting to /data/input and /data/output), then
        /// processes each PDF found in the input directory.
        /// </summary>
        static void Main(string[] args)
        {
            string configuredInput = Environment.GetEnvironmentVariable("INPUT_DIR");
            string configuredOutput = Environment.GetEnvironmentVariable("OUTPUT_DIR");

            // Input is resolved first, then output, so any fallback messages
            // appear in that order.
            string inputDirectory = EnsureReadableDirectory(configuredInput ?? "/data/input");
            string outputDirectory = EnsureWritableDirectory(configuredOutput ?? "/data/output");

            string[] pdfFiles = Directory.GetFiles(inputDirectory, "*.pdf");
            foreach (string pdfPath in pdfFiles)
            {
                // The text file takes the PDF's base name.
                string baseName = Path.GetFileNameWithoutExtension(pdfPath);
                string txtPath = Path.Combine(outputDirectory, $"{baseName}.txt");

                ExtractToTextFile(pdfPath, txtPath);

                Console.WriteLine($"Extracted text from '{pdfPath}' to '{txtPath}'.");
            }

            Console.WriteLine("PDF processing completed.");
        }

        /// <summary>
        /// Binds one PDF to a PdfExtractor, extracts its text with the extractor's
        /// default settings and saves it to <paramref name="txtPath"/>.
        /// The extractor is disposed before returning.
        /// </summary>
        private static void ExtractToTextFile(string pdfPath, string txtPath)
        {
            using (var extractor = new PdfExtractor())
            {
                extractor.BindPdf(pdfPath);
                extractor.ExtractText();
                extractor.GetText(txtPath);
            }
        }

        /// <summary>
        /// Creates <paramref name="desiredPath"/> if needed and returns it. When
        /// creation fails with an IOException or UnauthorizedAccessException, a
        /// directory under the OS temporary folder is created and returned instead.
        /// </summary>
        private static string EnsureWritableDirectory(string desiredPath)
        {
            try
            {
                Directory.CreateDirectory(desiredPath);
            }
            catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
            {
                // Same fallback for I/O failures and permission problems.
                return CreateTempFallback();
            }

            return desiredPath;
        }

        /// <summary>
        /// Creates <paramref name="desiredPath"/> if needed and returns it. When
        /// creation fails with an IOException or UnauthorizedAccessException, the
        /// temporary fallback directory is used and a warning naming both paths is
        /// printed, so the service can continue instead of crashing.
        /// </summary>
        private static string EnsureReadableDirectory(string desiredPath)
        {
            // Shared handler for both failure kinds: create the fallback first
            // (which prints its own info line), then print the warning.
            string UseFallback(Exception failure)
            {
                string tempPath = CreateTempFallback();
                Console.WriteLine($"Warning: Could not access input directory '{desiredPath}' ({failure.Message}). Falling back to temporary directory '{tempPath}'.");
                return tempPath;
            }

            try
            {
                Directory.CreateDirectory(desiredPath);
            }
            catch (IOException ioFailure)
            {
                return UseFallback(ioFailure);
            }
            catch (UnauthorizedAccessException accessFailure)
            {
                return UseFallback(accessFailure);
            }

            return desiredPath;
        }

        /// <summary>
        /// Creates (if absent) the "pdf_io" directory under the OS temporary
        /// folder, announces it on the console and returns its path.
        /// </summary>
        private static string CreateTempFallback()
        {
            string fallback = Path.Combine(Path.GetTempPath(), "pdf_io");
            Directory.CreateDirectory(fallback);
            Console.WriteLine($"Info: Using temporary directory '{fallback}'.");
            return fallback;
        }
    }
}
// Minimal NUnit stand-ins so the file compiles when the NUnit package is not
// referenced. They mimic only the attribute names and a single assertion.
namespace NUnit.Framework
{
    /// <summary>Marker attribute standing in for NUnit's test-fixture attribute.</summary>
    [AttributeUsage(AttributeTargets.Class)]
    public sealed class TestFixtureAttribute : Attribute { }

    /// <summary>Marker attribute standing in for NUnit's test attribute.</summary>
    [AttributeUsage(AttributeTargets.Method)]
    public sealed class TestAttribute : Attribute { }

    /// <summary>Marker attribute standing in for NUnit's one-time set-up attribute.</summary>
    [AttributeUsage(AttributeTargets.Method)]
    public sealed class OneTimeSetUpAttribute : Attribute { }

    /// <summary>Minimal assertion helper.</summary>
    public static class Assert
    {
        /// <summary>
        /// Verifies that two values are equal according to <see cref="object.Equals(object, object)"/>.
        /// </summary>
        /// <typeparam name="T">Type of the compared values.</typeparam>
        /// <param name="expected">The value the caller expects.</param>
        /// <param name="actual">The value actually produced.</param>
        /// <param name="message">Optional failure message; a default one is built when null.</param>
        /// <exception cref="Exception">Thrown when the values are not equal.</exception>
        // The type parameter <T> was missing from the declaration, so T was an
        // undefined type and the method did not compile.
        public static void AreEqual<T>(T expected, T actual, string? message = null)
        {
            if (!object.Equals(expected, actual))
                throw new Exception(message ?? $"Assert.AreEqual failed. Expected:<{expected}>. Actual:<{actual}>.");
        }
    }
}
/// <summary>
/// Console sample: attempts to bind a (possibly corrupted) PDF to PdfExtractor
/// and reports the kind of failure, if any, on standard error.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point.
    /// </summary>
    /// <param name="args">Optional first element: path to the PDF; defaults to "corrupted.pdf".</param>
    static void Main(string[] args)
    {
        // Path to the PDF file (first argument or default).
        string inputPath = args.Length > 0 ? args[0] : "corrupted.pdf";

        if (!File.Exists(inputPath))
        {
            Console.Error.WriteLine($"File not found: {inputPath}");
            return;
        }

        // The extractor was previously never closed or disposed, on success or on
        // failure. The using block releases it on every path, matching the other
        // samples in this folder.
        using (PdfExtractor extractor = new PdfExtractor())
        {
            try
            {
                // Attempt to bind the PDF file; this may throw if the file is corrupted.
                extractor.BindPdf(inputPath);
                Console.WriteLine("PDF bound successfully.");
                // Extraction operations can be performed here after successful binding.
            }
            catch (InvalidPdfFileFormatException ex)
            {
                // The file is not a valid PDF or its format is corrupted.
                Console.Error.WriteLine("Invalid PDF file format:");
                Console.Error.WriteLine(ex.Message);
            }
            catch (ObjectReferenceCorruptedException ex)
            {
                // An object reference inside the PDF is corrupted.
                Console.Error.WriteLine("Object reference corrupted in PDF:");
                Console.Error.WriteLine(ex.Message);
            }
            catch (PdfException ex)
            {
                // General PDF processing errors.
                Console.Error.WriteLine("PDF processing error:");
                Console.Error.WriteLine(ex.Message);
            }
            catch (Exception ex)
            {
                // Last-resort handler so the sample reports instead of crashing.
                Console.Error.WriteLine("Unexpected error:");
                Console.Error.WriteLine(ex.Message);
            }
        }
    }
}
[ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "check-pdf-text-only-by-detecting-images": { + "title": "Check if PDF Is Text‑Only by Detecting Images", + "filename": "check-pdf-text-only-by-detecting-images.cs", + "description": "The example uses Aspose.Pdf.Facades.PdfExtractor to bind a PDF, extract its images, and then checks HasNextImage to determine whether any images are present, indicating the document is not text‑only.", + "tags": [ + "pdf", + "image-extraction", + "text-only", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "retry-pdf-text-extraction": { + "title": "Retry PDF Text Extraction with PdfExtractor", + "filename": "retry-pdf-text-extraction.cs", + "description": "Demonstrates how to extract text from a PDF using Aspose.Pdf.Facades.PdfExtractor with a retry loop that handles IOException and retries up to three times.", + "tags": [ + "pdf", + "text-extraction", + "retry", + "exception-handling", + "aspose-pdf" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText", + "PdfException" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-and-images-from-pdf": { + "title": "Extract Text and Images from PDF Using PdfExtractor", + "filename": "extract-text-and-images-from-pdf.cs", + "description": "Demonstrates how to use Aspose.Pdf's PdfExtractor to extract both text and images from a PDF file in a single run.", + "tags": [ + "pdf", + "text-extraction", + "image-extraction", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + 
"Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.ExtractImage", + "PdfExtractor.GetText", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-to-jpeg-quality-85": { + "title": "Extract PDF Images to JPEG (Quality 85)", + "filename": "extract-pdf-images-to-jpeg-quality-85.cs", + "description": "Demonstrates extracting all images from a PDF and saving them as JPEG files using Aspose.Pdf.Facades.PdfConverter with a JPEG quality setting of 85 to balance file size and clarity.", + "tags": [ + "pdf", + "image-extraction", + "jpeg", + "quality", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfConverter", + "PdfConverter.BindPdf", + "PdfConverter.DoConvert", + "PdfConverter.HasNextImage", + "PdfConverter.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-encrypted-pdf": { + "title": "Extract Images from Encrypted PDF using PdfExtractor", + "filename": "extract-images-from-encrypted-pdf.cs", + "description": "Shows how to provide a user password to Aspose.Pdf.Facades.PdfExtractor and extract all images from an encrypted PDF document.", + "tags": [ + "pdf", + "image-extraction", + "encryption", + "aspose", + "facade" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.Password", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-and-encrypt-aes": { + "title": "Extract Text from PDF and Encrypt with AES", + "filename": "extract-text-from-pdf-and-encrypt-aes.cs", + "description": "Demonstrates extracting all text from a PDF using Aspose.Pdf.Facades.PdfExtractor and then encrypting the resulting .txt file with 
AES‑256 CBC using a password‑derived key.", + "tags": [ + "pdf", + "text-extraction", + "aes", + "encryption", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText", + "Aspose.Pdf.Facades" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "count-pages-images-attachments-pdf": { + "title": "Count Pages, Images, and Attachments in a PDF", + "filename": "count-pages-images-attachments-pdf.cs", + "description": "Demonstrates how to use Aspose.Pdf to obtain the total number of pages, embedded images, and file attachments in a PDF document.", + "tags": [ + "pdf", + "count", + "images", + "attachments", + "aspose" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Document.Pages.Count", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.ExtractAttachment", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachmentInfo" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-first-pdf-page": { + "title": "Extract Images from First PDF Page to Byte Arrays", + "filename": "extract-images-from-first-pdf-page.cs", + "description": "Demonstrates how to use Aspose.Pdf's PdfExtractor facade to pull all images from the first page of a PDF and store each image directly in a byte array for further processing.", + "tags": [ + "pdf", + "image-extraction", + "byte-array", + "aspose", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.StartPage", + "PdfExtractor.EndPage", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": 
"2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-pages-as-png-thumbnails": { + "title": "Extract PDF Pages as PNG Thumbnails", + "filename": "extract-pdf-pages-as-png-thumbnails.cs", + "description": "Shows how to extract each page of a PDF as a PNG thumbnail with a maximum size of 200 × 200 pixels using Aspose.Pdf.Facades.", + "tags": [ + "pdf", + "thumbnail", + "image-extraction", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfConverter", + "PdfConverter.BindPdf", + "PdfConverter.DoConvert", + "PdfConverter.HasNextImage", + "PdfConverter.GetNextImage", + "PdfConverter.Close" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-pdf-to-word": { + "title": "Extract Images from PDF and Embed into Word Document", + "filename": "extract-images-pdf-to-word.cs", + "description": "Shows how to use Aspose.Pdf's PdfExtractor to pull images from a PDF file and insert them into a DOCX using the Open XML SDK.", + "tags": [ + "pdf", + "image-extraction", + "word", + "openxml", + "aspose" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage", + "PdfExtractor.Close" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-with-guid-filenames": { + "title": "Extract Images from PDF with GUID Filenames", + "filename": "extract-images-from-pdf-with-guid-filenames.cs", + "description": "Demonstrates how to extract all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and save each image with a unique GUID‑based filename to avoid naming collisions.", + "tags": [ + "pdf", + "image-extraction", + "guid", + "aspose.pdf", + "c#" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + 
"Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-pdf-csv-manifest": { + "title": "Extract Images from PDF and Create CSV Manifest", + "filename": "extract-images-pdf-csv-manifest.cs", + "description": "The example opens a PDF with Aspose.Pdf, extracts each image on every page, saves them as PNG files, and generates a CSV file listing the image file name, page number, and dimensions.", + "tags": [ + "image-extraction", + "pdf", + "csv-manifest", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Page", + "Aspose.Pdf.XImage", + "Aspose.Pdf.Page.Resources.Images", + "Aspose.Pdf.XImage.Save" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-attachments-by-extension": { + "title": "Extract PDF Attachments and Organize by File Extension", + "filename": "extract-pdf-attachments-by-extension.cs", + "description": "Demonstrates how to extract embedded attachments from a PDF using Aspose.Pdf.Facades.PdfExtractor and save them into subfolders grouped by their original file extensions.", + "tags": [ + "pdf", + "attachments", + "extraction", + "aspose", + "file-organization" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractAttachment", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachNames", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachment" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "check-pdf-contains-text": { + "title": "Check if PDF Contains Text via MemoryStream", + "filename": "check-pdf-contains-text.cs", + "description": "Demonstrates using Aspose.Pdf's PdfExtractor to extract text into a MemoryStream and determine whether a PDF file has any 
textual content.", + "tags": [ + "pdf", + "text-extraction", + "aspose", + "memorystream", + "validation" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-password-protected-pdf": { + "title": "Extract Text from Password-Protected PDF", + "filename": "extract-text-from-password-protected-pdf.cs", + "description": "Shows how to supply the owner password to PdfExtractor, decrypt a protected PDF, and extract its text to a file.", + "tags": [ + "pdf", + "text-extraction", + "password-protected", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.Password", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-stringbuilder": { + "title": "Extract PDF Text to StringBuilder", + "filename": "extract-pdf-text-to-stringbuilder.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract text from a PDF, store it in a StringBuilder for further manipulation, and then write the processed text to a file.", + "tags": [ + "pdf", + "text-extraction", + "stringbuilder", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-create-sprite-sheet": { + "title": "Extract Images from PDF and Create Sprite Sheet PNG", + "filename": "extract-images-create-sprite-sheet.cs", + "description": "Shows how to extract all images from a PDF using PdfExtractor and merge them into a 
single horizontal sprite sheet PNG with PdfConverter.", + "tags": [ + "pdf", + "image-extraction", + "sprite-sheet", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage", + "Aspose.Pdf.Facades.PdfConverter.MergeImages", + "Aspose.Pdf.Drawing.ImageFormat", + "Aspose.Pdf.Facades.ImageMergeMode" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-to-s3-with-sse": { + "title": "Extract Images from PDF and Upload to S3 with SSE", + "filename": "extract-pdf-images-to-s3-with-sse.cs", + "description": "The example extracts all images from a PDF using Aspose.Pdf's PdfExtractor, saves them as temporary PNG files, and uploads each image to an Amazon S3 bucket with server‑side AES‑256 encryption.", + "tags": [ + "pdf", + "image-extraction", + "aws-s3", + "server-side-encryption", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdfs-using-pdfextractor": { + "title": "Extract Text from PDFs Using Aspose PdfExtractor", + "filename": "extract-text-from-pdfs-using-pdfextractor.cs", + "description": "Demonstrates a .NET console service that extracts text from PDF files with Aspose.Pdf.Facades.PdfExtractor, handling input/output directories via environment variables and fallback temporary folders.", + "tags": [ + "pdf", + "text-extraction", + "aspose", + "csharp", + "docker" + ], + "apis_used": [ + 
"Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf": { + "title": "Extract Text from PDF Using Aspose.Pdf.Facades", + "filename": "extract-text-from-pdf.cs", + "description": "Shows how to bind a PDF file to Aspose.Pdf.Facades.PdfExtractor, extract its text, and write the result to the console.", + "tags": [ + "pdf", + "text-extraction", + "console", + "aspose", + "facades" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "check-pdf-for-text-and-images": { + "title": "Check PDF for Both Text and Images", + "filename": "check-pdf-for-text-and-images.cs", + "description": "Demonstrates how to use Aspose.Pdf's PdfExtractor to determine whether a PDF file contains at least one text element and at least one image.", + "tags": [ + "pdf", + "text-extraction", + "image-extraction", + "content-check", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-to-memorystream-generate-pdf": { + "title": "Extract PDF Text to MemoryStream and Generate New PDF", + "filename": "extract-text-to-memorystream-generate-pdf.cs", + "description": "Shows how to extract all text from a PDF into a MemoryStream using 
PdfExtractor, then read the stream to create a new PDF with Aspose.Pdf core API.", + "tags": [ + "pdf", + "text-extraction", + "memorystream", + "aspose.pdf", + "pdf-generation" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Document", + "Aspose.Pdf.Page", + "Aspose.Pdf.Text.TextFragment", + "Aspose.Pdf.Document.Save" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-stringwriter": { + "title": "Extract PDF Text to StringWriter for Logging", + "filename": "extract-pdf-text-to-stringwriter.cs", + "description": "Demonstrates how to extract text from a PDF using Aspose.Pdf.Facades.PdfExtractor, capture it in a MemoryStream, and write the result to a StringWriter for integration with logging frameworks.", + "tags": [ + "pdf", + "text-extraction", + "logging", + "stringwriter", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-temp-file-verify": { + "title": "Extract PDF Text to Temporary File and Verify Content", + "filename": "extract-pdf-text-to-temp-file-verify.cs", + "description": "The example extracts all text from a PDF using Aspose.Pdf.Facades.PdfExtractor, writes it to a temporary Unicode text file, reads the file back, and verifies that the extracted text matches the saved content.", + "tags": [ + "pdf", + "text-extraction", + "temporary-file", + "verification", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + 
"difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-first-three-pages-summary-pdf": { + "title": "Extract First Three Pages Text and Create Summary PDF", + "filename": "extract-first-three-pages-summary-pdf.cs", + "description": "Shows how to extract text from the first three pages of a PDF using PdfExtractor (Facades API) and generate a new PDF that contains the extracted text as a summary.", + "tags": [ + "pdf-extraction", + "text-extraction", + "summary", + "facades", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Document", + "Aspose.Pdf.Page", + "Aspose.Pdf.Text.TextFragment", + "Aspose.Pdf.Text.FontRepository.FindFont", + "Aspose.Pdf.Color" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-azure-function": { + "title": "Extract PDF Text via Azure Queue Trigger", + "filename": "extract-pdf-text-azure-function.cs", + "description": "Shows an Azure Function that triggers on a storage queue, downloads a PDF blob, extracts its text with Aspose.Pdf.Facades.PdfExtractor, and uploads the extracted text as a .txt blob.", + "tags": [ + "azure-functions", + "pdf-text-extraction", + "aspose-pdf", + "storage-queue", + "blob-storage" + ], + "apis_used": [], + "difficulty": "", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "configurable-pdf-extraction": { + "title": "Configurable PDF Text, Image, and Attachment Extraction", + "filename": "configurable-pdf-extraction.cs", + "description": "Demonstrates reading a JSON configuration to toggle extraction of text, images, and attachments from a PDF using Aspose.Pdf's PdfExtractor without modifying the code.", + "tags": [ + "pdf", + "extraction", + "configuration", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText", 
+ "PdfExtractor.ExtractImage", + "PdfExtractor.GetNextImage", + "PdfExtractor.ExtractAttachment", + "PdfExtractor.GetAttachNames" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-to-tiff": { + "title": "Extract Images from PDF and Save as TIFF", + "filename": "extract-images-to-tiff.cs", + "description": "Demonstrates using Aspose.Pdf.Facades.PdfExtractor to pull all images from a PDF and save each one as a lossless TIFF file for archival storage.", + "tags": [ + "pdf", + "image-extraction", + "tiff", + "archival", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-bytes": { + "title": "Extract Text from PDF Bytes Using PdfExtractor", + "filename": "extract-text-from-pdf-bytes.cs", + "description": "Demonstrates binding a PDF stored in a byte array to Aspose.Pdf.Facades.PdfExtractor, extracting its text entirely in memory, and handling platform‑specific GDI+ issues with a fallback PDF.", + "tags": [ + "pdf", + "text-extraction", + "memory-stream", + "aspose-pdf", + "fallback" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Document", + "Aspose.Pdf.Document.Save", + "Aspose.Pdf.Page", + "Aspose.Pdf.Text.TextFragment" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-validate-signature": { + "title": "Extract Images from PDF and Validate File Signatures", + "filename": "extract-images-validate-signature.cs", + 
"description": "Demonstrates extracting images from a PDF using Aspose.Pdf.Facades.PdfExtractor, saving them to disk, and verifying their integrity by checking magic numbers (file signatures).", + "tags": [ + "pdf", + "image-extraction", + "validation", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-as-bmp": { + "title": "Extract PDF Images and Save as BMP", + "filename": "extract-pdf-images-as-bmp.cs", + "description": "Demonstrates extracting all images from a PDF with Aspose.Pdf.Facades.PdfExtractor and saving them as BMP files while preserving the original resolution and color depth.", + "tags": [ + "pdf", + "image-extraction", + "bmp", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-to-azure-blob": { + "title": "Extract Images from PDF and Upload to Azure Blob Storage", + "filename": "extract-pdf-images-to-azure-blob.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to pull images from a PDF and then upload each image to an Azure Blob container.", + "tags": [ + "pdf", + "image-extraction", + "azure-blob", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", 
+ "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-using-pdfextractor": { + "title": "Extract Images from PDF Using PdfExtractor with Automatic Disposal", + "filename": "extract-images-from-pdf-using-pdfextractor.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor inside a using block to extract all images from a PDF and save them as JPEG files.", + "tags": [ + "pdf", + "image-extraction", + "pdfextractor", + "csharp", + "using" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-attachments": { + "title": "Extract Attachments from PDF", + "filename": "extract-pdf-attachments.cs", + "description": "Shows how to extract embedded file attachments from a PDF document and write each one to a specified output folder using Aspose.Pdf.", + "tags": [ + "pdf", + "attachments", + "extraction", + "Aspose", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractAttachment", + "PdfExtractor.GetAttachNames", + "PdfExtractor.GetAttachment" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-to-utf8-txt": { + "title": "Extract Text from PDF to UTF-8 Text File", + "filename": "extract-text-from-pdf-to-utf8-txt.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to read a PDF, extract its plain text with UTF‑8 encoding, and save the result to a .txt file.", + "tags": [ + "pdf", + "text-extraction", + "utf-8", + "aspose", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "beginner", + 
"generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-page-by-page": { + "title": "Extract PDF Text Page by Page", + "filename": "extract-pdf-text-page-by-page.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract the text of each PDF page and save it to separate .txt files.", + "tags": [ + "pdf", + "text-extraction", + "page-splitting", + "aspose", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.HasNextPageText", + "Aspose.Pdf.Facades.PdfExtractor.GetNextPageText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "batch-extract-text-from-pdfs-azure-blob": { + "title": "Batch Extract Text from PDFs in Azure Blob Storage", + "filename": "batch-extract-text-from-pdfs-azure-blob.cs", + "description": "Demonstrates how to enumerate PDF files in an Azure Blob container, extract their text using Aspose.Pdf.Facades.PdfExtractor, and upload the resulting .txt files back to another container.", + "tags": [ + "pdf", + "text-extraction", + "azure-blob", + "batch-processing", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-resource-defined-images": { + "title": "Extract Images Defined in Resources from PDF", + "filename": "extract-resource-defined-images.cs", + "description": "Demonstrates how to set the ImageExtractionMode to DefinedInResources and use PdfExtractor to retrieve and save images embedded as resources in a PDF document.", + "tags": [ + "pdf", + "image-extraction", + "resources", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + 
"PdfExtractor.BindPdf", + "PdfExtractor.ExtractImageMode", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "pdf-extractor-text-extraction-unit-test": { + "title": "PdfExtractor Text Extraction Unit Test", + "filename": "pdf-extractor-text-extraction-unit-test.cs", + "description": "Shows how to create a temporary PDF with known text, extract that text using Aspose.Pdf.Facades.PdfExtractor, and verify the result with an NUnit assertion.", + "tags": [ + "pdf", + "text-extraction", + "unit-test", + "aspose-pdf", + "nunit" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Page", + "Aspose.Pdf.Text.TextFragment", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-to-unc-share": { + "title": "Extract Images from PDF to UNC Network Share", + "filename": "extract-images-to-unc-share.cs", + "description": "Demonstrates extracting all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and saving them to a UNC network share directory.", + "tags": [ + "pdf", + "image-extraction", + "unc", + "network-share", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-byte-array": { + "title": "Extract Text from PDF Byte Array with Aspose.Pdf", + "filename": "extract-text-from-pdf-byte-array.cs", + "description": "Demonstrates how to read a PDF file into a byte array, bind it to a MemoryStream, and use Aspose.Pdf.Facades.PdfExtractor to extract the 
document text and return it as JSON.", + "tags": [ + "pdf", + "text-extraction", + "aspose", + "byte-array", + "json" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-original-format": { + "title": "Extract Images from PDF in Original Format", + "filename": "extract-images-original-format.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract all images from a PDF while preserving their original file format.", + "tags": [ + "pdf", + "image-extraction", + "aspose", + "original-format", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.ExtractImageMode", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-with-cancellation": { + "title": "Extract PDF Text with Cancellation Support", + "filename": "extract-pdf-text-with-cancellation.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract text from a PDF page‑by‑page while honoring a CancellationToken to abort the operation.", + "tags": [ + "pdf", + "text-extraction", + "cancellation", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.HasNextPageText", + "Aspose.Pdf.Facades.PdfExtractor.GetNextPageText", + "Aspose.Pdf.Facades.PdfExtractor.Close", + "Aspose.Pdf.Facades.PdfExtractor.Document.Pages.Count" + ], + "difficulty": 
"intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-using-pdfextractor": { + "title": "Extract Text from PDF using PdfExtractor", + "filename": "extract-text-from-pdf-using-pdfextractor.cs", + "description": "Demonstrates how to bind a PDF file with Aspose.Pdf.Facades.PdfExtractor, extract its text, and output the result to the console.", + "tags": [ + "pdf", + "text-extraction", + "aspose-pdf", + "command-line", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Facades.PdfExtractor.Close" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-add-watermark": { + "title": "Extract PDF Images and Add Watermark", + "filename": "extract-pdf-images-add-watermark.cs", + "description": "Demonstrates how to extract all images from a PDF using Aspose.Pdf.Facades.PdfExtractor, apply a semi‑transparent text watermark to each image, and save the results as PNG files.", + "tags": [ + "pdf", + "image-extraction", + "watermark", + "aspose-pdf", + "facades" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf": { + "title": "Extract Images from PDF to Temporary Folder", + "filename": "extract-images-from-pdf.cs", + "description": "Demonstrates how to use Aspose.Pdf's PdfExtractor to pull all images from a PDF and save them into a uniquely created temporary directory.", + "tags": [ + "pdf", + "image-extraction", + "aspose-pdf", + "temporary-folder", + 
"facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage", + "Aspose.Pdf.Document" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-attachments-sha256": { + "title": "Extract PDF Attachments and Compute SHA-256 Hashes", + "filename": "extract-pdf-attachments-sha256.cs", + "description": "Extracts all file attachments from a PDF using Aspose.Pdf.Facades, saves each attachment to disk, and calculates a SHA-256 hash to verify its integrity.", + "tags": [ + "pdf", + "attachments", + "extraction", + "hash", + "integrity" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractAttachment", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachNames", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachment" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-pdf-upload-s3": { + "title": "Extract Images from PDF and Upload to Amazon S3", + "filename": "extract-images-pdf-upload-s3.cs", + "description": "Demonstrates extracting images from a PDF using Aspose.Pdf.Facades.PdfExtractor and uploading each image to an Amazon S3 bucket via the AWS SDK.", + "tags": [ + "pdf", + "image-extraction", + "aws-s3", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-by-keyword": { + "title": "Extract Images from Pages Containing a Keyword", + "filename": "extract-images-by-keyword.cs", + "description": 
"Shows how to scan each PDF page for a specific keyword and, when the keyword is found, extract all images from that page using Aspose.Pdf's PdfExtractor.", + "tags": [ + "pdf", + "image-extraction", + "keyword-search", + "aspose-pdf", + "pdfextractor" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-postgres": { + "title": "Extract PDF Text and Store in PostgreSQL", + "filename": "extract-pdf-text-to-postgres.cs", + "description": "Demonstrates how to extract Unicode text from a PDF using Aspose.Pdf.Facades.PdfExtractor and insert the extracted content into a PostgreSQL table for searchable indexing.", + "tags": [ + "pdf", + "text-extraction", + "postgresql", + "aspose", + "database" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Facades.PdfExtractor.Dispose" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-csv": { + "title": "Extract Images from PDF and Generate CSV Report", + "filename": "extract-images-from-pdf-csv.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract images from each PDF page, save them as PNG files, and produce a CSV file with image filenames, page numbers, and dimensions.", + "tags": [ + "pdf", + "image-extraction", + "csv", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + 
"Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage", + "Aspose.Pdf.Facades.PdfExtractor.StartPage", + "Aspose.Pdf.Facades.PdfExtractor.EndPage", + "Aspose.Pdf.Facades.PdfExtractor.Document" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-only-from-pdf": { + "title": "Extract Text Only from PDF Using PdfExtractor", + "filename": "extract-text-only-from-pdf.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract only the textual content from a PDF while ignoring images by omitting the ExtractImage call.", + "tags": [ + "pdf", + "text-extraction", + "aspose", + "pdfextractor", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText", + "PdfExtractor.Close" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "list-embedded-attachment-names": { + "title": "List Embedded Attachment Names from PDF", + "filename": "list-embedded-attachment-names.cs", + "description": "Demonstrates how to bind a PDF with PdfExtractor, extract its attachments, and retrieve the names of all embedded files without saving them.", + "tags": [ + "pdf", + "attachments", + "list", + "aspose-pdf", + "extractor" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractAttachment", + "Aspose.Pdf.Facades.PdfExtractor.GetAttachNames" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-text-from-pdf-pages-with-progress": { + "title": "Extract Text from PDF Pages with Progress Reporting", + "filename": "extract-text-from-pdf-pages-with-progress.cs", + "description": 
"Demonstrates using Aspose.Pdf.Facades.PdfExtractor to extract text from each page of a PDF while reporting progress to the console.", + "tags": [ + "pdf", + "text-extraction", + "progress", + "aspose-pdf", + "console" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.HasNextPageText", + "PdfExtractor.GetNextPageText", + "PdfExtractor.Document" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-azure-table": { + "title": "Extract PDF Text and Store in Azure Table", + "filename": "extract-pdf-text-to-azure-table.cs", + "description": "Shows how to extract text from a PDF using Aspose.Pdf.Facades.PdfExtractor and save the extracted content as an Azure Table entity with the document ID as the partition key.", + "tags": [ + "pdf", + "text-extraction", + "azure-table", + "aspose", + "facades" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText", + "Aspose.Pdf.Facades.PdfExtractor.Dispose" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "async-pdf-extraction": { + "title": "Asynchronous PDF Text, Image, and Attachment Extraction", + "filename": "async-pdf-extraction.cs", + "description": "Demonstrates how to extract text, images, and embedded attachments from a PDF file asynchronously using Aspose.Pdf's PdfExtractor to keep UI threads responsive.", + "tags": [ + "async", + "pdf", + "extraction", + "images", + "text" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage", + "PdfExtractor.ExtractAttachment" + ], + "difficulty": "intermediate", 
+ "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-to-zip": { + "title": "Extract Images from PDF to ZIP Archive", + "filename": "extract-images-from-pdf-to-zip.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to pull all images out of a PDF and then package those images into a ZIP file.", + "tags": [ + "pdf", + "image-extraction", + "zip", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-specific-pdf-page": { + "title": "Extract Images from a Specific PDF Page", + "filename": "extract-images-from-specific-pdf-page.cs", + "description": "Shows how to extract all images from a single PDF page by setting the PdfExtractor's StartPage and EndPage to the same value.", + "tags": [ + "pdf", + "image-extraction", + "single-page", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.StartPage", + "PdfExtractor.EndPage", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-base64": { + "title": "Extract PDF Images and Return as Base64 JSON", + "filename": "extract-pdf-images-base64.cs", + "description": "Demonstrates extracting all images from a PDF using Aspose.Pdf.Facades.PdfExtractor, converting each image to a Base64 string, and serializing the collection to a JSON array.", + "tags": [ + "pdf", + "image-extraction", + "base64", + "json", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + 
"PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-markdown-gallery": { + "title": "Extract Images from PDF and Generate Markdown Gallery", + "filename": "extract-images-markdown-gallery.cs", + "description": "Demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to pull images from a PDF, save them as PNG files, and create a Markdown file that displays the images as a gallery.", + "tags": [ + "pdf", + "image-extraction", + "markdown", + "gallery", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "handle-errors-binding-corrupted-pdf": { + "title": "Handle Errors When Binding a Corrupted PDF with PdfExtractor", + "filename": "handle-errors-binding-corrupted-pdf.cs", + "description": "Demonstrates how to catch specific Aspose.Pdf exceptions thrown when PdfExtractor fails to bind a corrupted PDF file, providing detailed error messages.", + "tags": [ + "pdf", + "error-handling", + "aspose", + "pdfextractor", + "exceptions" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.InvalidPdfFileFormatException", + "Aspose.Pdf.ObjectReferenceCorruptedException", + "Aspose.Pdf.PdfException" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "batch-extract-text-from-pdfs": { + "title": "Batch Extract Text from PDFs", + "filename": "batch-extract-text-from-pdfs.cs", + "description": "Shows how to loop through a folder of PDF files, use Aspose.Pdf.Facades.PdfExtractor to extract text from each document, and save the output as .txt files 
with matching filenames.", + "tags": [ + "pdf", + "text-extraction", + "batch-processing", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-to-html-report": { + "title": "Extract Images from PDF and Embed in HTML Report", + "filename": "extract-images-to-html-report.cs", + "description": "Shows how to extract all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and embed them as base64 data URIs in an HTML file.", + "tags": [ + "pdf", + "image-extraction", + "html", + "base64", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-to-unc-share__v2": { + "title": "Extract Images from PDF to UNC Network Share", + "filename": "extract-images-to-unc-share__v2.cs", + "description": "Demonstrates extracting all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and writing them to a network‑mapped UNC folder.", + "tags": [ + "pdf", + "image-extraction", + "unc", + "network-share", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-all-images-from-pdf": { + "title": "Extract All Images from PDF Using PdfExtractor", + "filename": "extract-all-images-from-pdf.cs", + 
"description": "Demonstrates how to extract every image from a PDF document by setting the page range to all pages and iterating through the extracted images with Aspose.Pdf.Facades.PdfExtractor.", + "tags": [ + "pdf", + "image-extraction", + "aspose", + "facade", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.StartPage", + "PdfExtractor.EndPage", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "batch-extract-text-from-pdfs__v2": { + "title": "Batch Extract Text from PDFs using PdfExtractor", + "filename": "batch-extract-text-from-pdfs__v2.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract text from every PDF in a directory and write each result to a corresponding .txt file.", + "tags": [ + "pdf", + "text-extraction", + "batch-processing", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-ocr-openai": { + "title": "Extract Images from PDF and Perform OCR with OpenAI", + "filename": "extract-images-ocr-openai.cs", + "description": "Demonstrates extracting all images from a PDF using PdfExtractor and then applying Aspose.Pdf.AI OCR via OpenAI to retrieve hidden text from each image.", + "tags": [ + "pdf", + "image-extraction", + "ocr", + "openai", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage", + "Aspose.Pdf.Document", + 
"Aspose.Pdf.AI.OpenAIClient.CreateWithApiKey", + "Aspose.Pdf.AI.OpenAIOcrCopilotOptions.WithDocument", + "Aspose.Pdf.AI.AICopilotFactory.CreateOcrCopilot" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "verify-image-extraction-mode-count": { + "title": "Verify ImageExtractionMode Affects Extracted Image Count", + "filename": "verify-image-extraction-mode-count.cs", + "description": "Creates a PDF with two images and uses PdfExtractor with different ExtractImageMode settings to ensure the extraction mode influences the number of images returned, demonstrated via an NUnit unit test.", + "tags": [ + "image-extraction", + "pdf", + "unit-test", + "aspose-pdf", + "extract-image-mode" + ], + "apis_used": [ + "Aspose.Pdf.Document", + "Aspose.Pdf.Page", + "Aspose.Pdf.Image", + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.ExtractImageMode", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-to-gzip": { + "title": "Extract PDF Text and Compress to GZip", + "filename": "extract-pdf-text-to-gzip.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract all text from a PDF and write it directly into a GZipStream, creating a compressed .txt.gz file.", + "tags": [ + "pdf", + "text-extraction", + "gzip", + "compression", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractText", + "Aspose.Pdf.Facades.PdfExtractor.GetText" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-pages-png": { + "title": "Extract Images from Specific PDF Pages as PNG", + "filename": "extract-images-from-pdf-pages-png.cs", + "description": 
"Demonstrates how to configure PdfExtractor to extract only images from pages 5‑10 of a PDF and save each extracted image as a PNG file.", + "tags": [ + "pdf", + "image-extraction", + "pages", + "png", + "aspose" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.StartPage", + "Aspose.Pdf.Facades.PdfExtractor.EndPage", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage", + "Aspose.Pdf.Facades.PdfExtractor.Close" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-create-portfolio-pdf": { + "title": "Extract Images from PDF and Create Portfolio PDF", + "filename": "extract-images-create-portfolio-pdf.cs", + "description": "Demonstrates extracting all images from a source PDF using Aspose.Pdf's PdfExtractor and assembling them into a new PDF portfolio where each image occupies its own page.", + "tags": [ + "pdf", + "image-extraction", + "portfolio", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.GetNextImage", + "Document", + "Page", + "Image", + "Document.Save" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "parallel-extraction-of-images-from-multiple-pdfs": { + "title": "Parallel Extraction of Images from Multiple PDFs", + "filename": "parallel-extraction-of-images-from-multiple-pdfs.cs", + "description": "Demonstrates how to extract images from several PDF files concurrently using Aspose.Pdf.Facades.PdfExtractor and Task.WhenAll for improved throughput.", + "tags": [ + "pdf", + "image-extraction", + "parallel", + "aspose-pdf", + "task" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + 
"PdfExtractor.HasNextImage", + "PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-create-pdfa2b": { + "title": "Extract Images and Create PDF/A‑2b Document", + "filename": "extract-images-create-pdfa2b.cs", + "description": "Shows how to extract images from a PDF using PdfExtractor, embed each image as an XObject on a new page, and convert the resulting document to PDF/A‑2b compliance.", + "tags": [ + "pdf-extraction", + "pdfa", + "image-embedding", + "aspose-pdf", + "facade" + ], + "apis_used": [ + "PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractImage", + "PdfExtractor.GetNextImage", + "Document", + "Document.Convert", + "Page.AddImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-and-compress-png": { + "title": "Extract Images from PDF and Compress PNGs", + "filename": "extract-images-from-pdf-and-compress-png.cs", + "description": "The example demonstrates how to use Aspose.Pdf.Facades.PdfExtractor to extract images from a PDF, save them as PNG files, and then compress each PNG using GZip for lossless compression.", + "tags": [ + "pdf", + "image-extraction", + "png", + "compression", + "aspose" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-from-pdf-to-gcs": { + "title": "Extract Images from PDF and Upload to Google Cloud Storage", + "filename": "extract-images-from-pdf-to-gcs.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to pull images from a PDF and then upload each image to a Google Cloud Storage bucket 
with public read permissions.", + "tags": [ + "pdf", + "image-extraction", + "google-cloud-storage", + "aspose", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-attachments-rename-timestamp": { + "title": "Extract PDF Attachments and Rename with Timestamp", + "filename": "extract-pdf-attachments-rename-timestamp.cs", + "description": "Demonstrates how to extract all attachments from a PDF using Aspose.Pdf.Facades, then save each file with a unique timestamp‑based name.", + "tags": [ + "pdf", + "attachments", + "extraction", + "timestamp", + "facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractAttachment", + "PdfExtractor.GetAttachNames", + "PdfExtractor.GetAttachment" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-images-pdf-to-base64-json": { + "title": "Extract Images from PDF to Base64 JSON", + "filename": "extract-images-pdf-to-base64-json.cs", + "description": "Demonstrates how to extract all images from a PDF using Aspose.Pdf.Facades, convert them to Base64 strings, and save the data with metadata in a JSON file.", + "tags": [ + "pdf", + "image-extraction", + "base64", + "json", + "aspose" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "beginner", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-create-video-slideshow": { + 
"title": "Extract PDF Images and Build Video Slideshow", + "filename": "extract-pdf-images-create-video-slideshow.cs", + "description": "The example extracts all images from a PDF using Aspose.Pdf's PdfExtractor, saves them as PNG files, and then assembles them into a video slideshow with FFmpeg.", + "tags": [ + "pdf", + "image-extraction", + "ffmpeg", + "slideshow", + "aspose-pdf" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "create-contact-sheet-pdf": { + "title": "Create Contact Sheet PDF from Extracted Images", + "filename": "create-contact-sheet-pdf.cs", + "description": "Extracts all images from a source PDF and generates a new PDF that displays those images as thumbnails arranged in a grid layout.", + "tags": [ + "image-extraction", + "contact-sheet", + "thumbnail", + "pdf-generation", + "grid-layout" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage", + "Aspose.Pdf.Document", + "Aspose.Pdf.Page.AddImage", + "Aspose.Pdf.Rectangle" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-text-page-by-page-progress": { + "title": "Extract PDF Text Page‑by‑Page with Progress Indicator", + "filename": "extract-pdf-text-page-by-page-progress.cs", + "description": "Shows how to use Aspose.Pdf.Facades.PdfExtractor to extract each PDF page's text into separate files while displaying a real‑time progress percentage.", + "tags": [ + "pdf", + "text-extraction", + "page-by-page", + "progress", + 
"facade" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "PdfExtractor.BindPdf", + "PdfExtractor.ExtractText", + "PdfExtractor.HasNextPageText", + "PdfExtractor.GetNextPageText", + "PdfExtractor.StartPage", + "PdfExtractor.EndPage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + }, + "extract-pdf-images-html-gallery": { + "title": "Extract PDF Images and Build HTML Gallery", + "filename": "extract-pdf-images-html-gallery.cs", + "description": "Demonstrates how to extract all images from a PDF using Aspose.Pdf.Facades.PdfExtractor and generate a simple HTML page that displays the extracted images.", + "tags": [ + "pdf", + "image-extraction", + "html-gallery", + "aspose-pdf", + "csharp" + ], + "apis_used": [ + "Aspose.Pdf.Facades.PdfExtractor", + "Aspose.Pdf.Facades.PdfExtractor.BindPdf", + "Aspose.Pdf.Facades.PdfExtractor.ExtractImage", + "Aspose.Pdf.Facades.PdfExtractor.HasNextImage", + "Aspose.Pdf.Facades.PdfExtractor.GetNextImage" + ], + "difficulty": "intermediate", + "generated_at": "2026-05-08T09:44:37Z", + "status": "verified" + } + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/list-embedded-attachment-names.cs b/facades-extract-images-and-text/list-embedded-attachment-names.cs new file mode 100644 index 00000000..96b1d937 --- /dev/null +++ b/facades-extract-images-and-text/list-embedded-attachment-names.cs @@ -0,0 +1,30 @@ +using System; +using System.Collections.Generic; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPath = "input.pdf"; + + if (!System.IO.File.Exists(inputPath)) + { + Console.Error.WriteLine($"File not found: {inputPath}"); + return; + } + + // Create PdfExtractor, bind the PDF, extract attachments, then list their names. 
+ using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(inputPath); + extractor.ExtractAttachment(); // Must be called before GetAttachNames() + IList<string> attachmentNames = extractor.GetAttachNames(); + + foreach (string name in attachmentNames) + { + Console.WriteLine(name); + } + } + } +} \ No newline at end of file diff --git a/facades-extract-images-and-text/parallel-extraction-of-images-from-multiple-pdfs.cs b/facades-extract-images-and-text/parallel-extraction-of-images-from-multiple-pdfs.cs new file mode 100644 index 00000000..fb67c43b --- /dev/null +++ b/facades-extract-images-and-text/parallel-extraction-of-images-from-multiple-pdfs.cs @@ -0,0 +1,59 @@ +using System; +using System.IO; +using System.Linq; +using System.Threading.Tasks; +using Aspose.Pdf.Facades; + +class Program +{ + // Entry point – runs the parallel extraction + static async Task Main(string[] args) + { + // Example PDF files – replace with your own paths or pass via args + string[] pdfFiles = { "doc1.pdf", "doc2.pdf", "doc3.pdf" }; + string outputRoot = "ExtractedImages"; + + await ExtractImagesFromMultiplePdfsAsync(pdfFiles, outputRoot); + } + + // Extracts images from each PDF in parallel using Task.WhenAll + static async Task ExtractImagesFromMultiplePdfsAsync(string[] pdfPaths, string outputRoot) + { + // Ensure the root output directory exists + Directory.CreateDirectory(outputRoot); + + // Create a task for each PDF file + Task[] extractionTasks = pdfPaths.Select(pdfPath => Task.Run(() => ExtractImagesFromPdf(pdfPath, outputRoot))).ToArray(); + + // Await completion of all extraction tasks + await Task.WhenAll(extractionTasks); + } + + // Helper that extracts images from a single PDF file + private static void ExtractImagesFromPdf(string pdfPath, string outputRoot) + { + if (string.IsNullOrWhiteSpace(pdfPath) || !File.Exists(pdfPath)) + return; // skip invalid entries + + // Create a subfolder for each PDF to avoid filename collisions + string pdfName =
Path.GetFileNameWithoutExtension(pdfPath); + string pdfOutputDir = Path.Combine(outputRoot, pdfName); + Directory.CreateDirectory(pdfOutputDir); + + // Use PdfExtractor (Facade) to extract images + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(pdfPath); // Bind the source PDF + extractor.ExtractImage(); // Prepare image extraction + + int imageIndex = 1; + while (extractor.HasNextImage()) + { + // Save each extracted image; the extension determines the format + string imageFile = Path.Combine(pdfOutputDir, $"image-{imageIndex}.png"); + extractor.GetNextImage(imageFile); + imageIndex++; + } + } + } +} diff --git a/facades-extract-images-and-text/pdf-extractor-text-extraction-unit-test.cs b/facades-extract-images-and-text/pdf-extractor-text-extraction-unit-test.cs new file mode 100644 index 00000000..60a599ba --- /dev/null +++ b/facades-extract-images-and-text/pdf-extractor-text-extraction-unit-test.cs @@ -0,0 +1,103 @@ +using System; +using System.IO; +using System.Text; +using Aspose.Pdf; // Document, Page, TextFragment +using Aspose.Pdf.Text; // TextFragment +using Aspose.Pdf.Facades; // PdfExtractor + +// Minimal NUnit stubs to allow compilation without the real NUnit package. +namespace NUnit.Framework +{ + [AttributeUsage(AttributeTargets.Class)] + public sealed class TestFixtureAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class TestAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class SetUpAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class TearDownAttribute : Attribute { } + + public static class Assert + { + public static void AreEqual<T>(T expected, T actual, string message = null) + { + if (!object.Equals(expected, actual)) + throw new Exception(message ?? $"Assert.AreEqual failed. Expected:<{expected}>.
Actual:<{actual}>."); + } + } +} + +namespace AsposePdfTests +{ + [NUnit.Framework.TestFixture] + public class PdfExtractorTests + { + // Initialise with a non‑null default to satisfy nullable analysis. + private string _tempPdfPath = string.Empty; + private const string SampleText = "Hello World from Aspose.Pdf!"; + + [NUnit.Framework.SetUp] + public void SetUp() + { + // Create a temporary PDF file with known text content. + _tempPdfPath = Path.Combine(Path.GetTempPath(), $"sample_{Guid.NewGuid()}.pdf"); + + // Use the recommended Document creation pattern (wrapped in using). + using (Document doc = new Document()) + { + // Add a page and a text fragment. + Page page = doc.Pages.Add(); + TextFragment tf = new TextFragment(SampleText); + page.Paragraphs.Add(tf); + + // Save the PDF to the temporary location. + doc.Save(_tempPdfPath); + } + } + + [NUnit.Framework.TearDown] + public void TearDown() + { + // Clean up the temporary PDF file. + if (File.Exists(_tempPdfPath)) + { + File.Delete(_tempPdfPath); + } + } + + [NUnit.Framework.Test] + public void ExtractText_ShouldReturnExactContent() + { + // Initialize the extractor and bind the PDF file. + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(_tempPdfPath); + + // Perform text extraction. + extractor.ExtractText(); + + // Retrieve the extracted text into a memory stream. + using (MemoryStream ms = new MemoryStream()) + { + extractor.GetText(ms); + string extracted = Encoding.Unicode.GetString(ms.ToArray()); + + // Verify that the extracted text matches the original content. + NUnit.Framework.Assert.AreEqual(SampleText, extracted.Trim()); + } + + // Explicitly close the facade (optional, as using will dispose). + extractor.Close(); + } + } + } + + // Dummy entry point to satisfy the compiler when the project is built as an executable. 
+ public static class Program + { + public static void Main() { /* No‑op – tests are executed by the test runner */ } + } +} diff --git a/facades-extract-images-and-text/retry-pdf-text-extraction.cs b/facades-extract-images-and-text/retry-pdf-text-extraction.cs new file mode 100644 index 00000000..f9a84756 --- /dev/null +++ b/facades-extract-images-and-text/retry-pdf-text-extraction.cs @@ -0,0 +1,58 @@ +using System; +using System.IO; +using System.Threading; +using Aspose.Pdf; +using Aspose.Pdf.Facades; + +class Program +{ + static void Main() + { + const string inputPdf = "input.pdf"; + const string outputTxt = "output.txt"; + + if (!File.Exists(inputPdf)) + { + Console.Error.WriteLine($"Input file not found: {inputPdf}"); + return; + } + + bool extracted = false; + + // Retry up to three times if an IOException occurs + for (int attempt = 1; attempt <= 3 && !extracted; attempt++) + { + try + { + // Create and bind the extractor (lifecycle: create → bind → extract → save) + using (PdfExtractor extractor = new PdfExtractor()) + { + extractor.BindPdf(inputPdf); // load PDF + extractor.ExtractText(); // perform extraction + extractor.GetText(outputTxt); // save extracted text + } + + extracted = true; + Console.WriteLine($"Extraction succeeded on attempt {attempt}."); + } + catch (IOException ioEx) // retry only on I/O errors + { + Console.Error.WriteLine($"IO exception on attempt {attempt}: {ioEx.Message}"); + if (attempt == 3) + { + Console.Error.WriteLine("All retry attempts failed."); + } + else + { + // Optional short delay before next attempt + Thread.Sleep(500); + } + } + catch (PdfException pdfEx) // non‑IO PDF errors are not retried + { + Console.Error.WriteLine($"PDF processing error: {pdfEx.Message}"); + break; + } + } + } +} diff --git a/facades-extract-images-and-text/verify-image-extraction-mode-count.cs b/facades-extract-images-and-text/verify-image-extraction-mode-count.cs new file mode 100644 index 00000000..bfa7ea15 --- /dev/null +++ 
b/facades-extract-images-and-text/verify-image-extraction-mode-count.cs @@ -0,0 +1,161 @@ +using System; +using System.IO; +using System.Collections.Generic; +using Aspose.Pdf; +using Aspose.Pdf.Facades; +using NUnit.Framework; // Added reference to NUnit stubs + +// Minimal NUnit stubs – used when the real NUnit package is not referenced. +namespace NUnit.Framework +{ + [AttributeUsage(AttributeTargets.Class)] + public sealed class TestFixtureAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class TestAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class OneTimeSetUpAttribute : Attribute { } + + [AttributeUsage(AttributeTargets.Method)] + public sealed class OneTimeTearDownAttribute : Attribute { } + + public static class Assert + { + public static void AreEqual<T>(T expected, T actual, string? message = null) + { + if (!object.Equals(expected, actual)) + throw new Exception(message ?? $"Assert.AreEqual failed. Expected:<{expected}>. Actual:<{actual}>."); + } + + // Overload for int to match typical usage without generic type inference. + public static void AreEqual(int expected, int actual, string? message = null) => AreEqual<int>(expected, actual, message); + + public static void Greater(int actual, int expected, string? message = null) + { + if (!(actual > expected)) + throw new Exception(message ?? $"Assert.Greater failed.
Expected > {expected}, but was {actual}."); + } + } +} + +namespace AsposePdfTests +{ + [TestFixture] + public class ImageExtractionModeTests + { + private string _pdfPath; + private string _imagePath; + + // Minimal 1x1 PNG (transparent) encoded in base64 + private const string Base64Png = + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/5+BAQAE/wJ" + + "ZcKcAAAAASUVORK5CYII="; + + [OneTimeSetUp] + public void SetUp() + { + // Create temporary folder + string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString()); + Directory.CreateDirectory(tempDir); + + // Write PNG file + _imagePath = Path.Combine(tempDir, "sample.png"); + byte[] pngBytes = Convert.FromBase64String(Base64Png); + File.WriteAllBytes(_imagePath, pngBytes); + + // Create PDF with two images on the first page + _pdfPath = Path.Combine(tempDir, "sample.pdf"); + using (Document doc = new Document()) + { + Page page = doc.Pages.Add(); + + // First image + Aspose.Pdf.Image img1 = new Aspose.Pdf.Image + { + File = _imagePath, + // Position the image + HorizontalAlignment = HorizontalAlignment.Center, + VerticalAlignment = VerticalAlignment.Center + }; + page.Paragraphs.Add(img1); + + // Second image (same file, different position) + Aspose.Pdf.Image img2 = new Aspose.Pdf.Image + { + File = _imagePath, + HorizontalAlignment = HorizontalAlignment.Right, + VerticalAlignment = VerticalAlignment.Bottom + }; + page.Paragraphs.Add(img2); + + doc.Save(_pdfPath); + } + } + + [OneTimeTearDown] + public void TearDown() + { + try + { + if (File.Exists(_pdfPath)) File.Delete(_pdfPath); + if (File.Exists(_imagePath)) File.Delete(_imagePath); + string dir = Path.GetDirectoryName(_pdfPath); + if (Directory.Exists(dir)) Directory.Delete(dir, true); + } + catch + { + // Ignored – cleanup failure should not affect test results + } + } + + private int CountExtractedImages(ExtractImageMode mode) + { + int count = 0; + using (PdfExtractor extractor = new PdfExtractor()) + { + 
extractor.BindPdf(_pdfPath); + extractor.ExtractImageMode = mode; + extractor.ExtractImage(); + + while (extractor.HasNextImage()) + { + // Use a dummy stream; we only need to advance the iterator + using (MemoryStream ms = new MemoryStream()) + { + extractor.GetNextImage(ms); + } + count++; + } + } + return count; + } + + [Test] + public void Verify_ImageExtractionMode_Influences_Count() + { + // Count images when extracting all defined resources + int definedInResourcesCount = CountExtractedImages(ExtractImageMode.DefinedInResources); + + // Count images when extracting only actually used images + int actuallyUsedCount = CountExtractedImages(ExtractImageMode.ActuallyUsed); + + // Both counts should be greater than zero (images are present) + Assert.Greater(definedInResourcesCount, 0, "No images were extracted with DefinedInResources mode."); + Assert.Greater(actuallyUsedCount, 0, "No images were extracted with ActuallyUsed mode."); + + // For this simple PDF the counts are expected to be equal + Assert.AreEqual(definedInResourcesCount, actuallyUsedCount, + "Image counts differ between extraction modes for a PDF where all images are used."); + } + } +} + +// Dummy entry point to satisfy the compiler when the project is built as an executable. +public static class Program +{ + public static void Main(string[] args) + { + // No operation – tests are executed via the test runner. + } +}