forked from ShreyashTailor/quiz-php
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_extractor.php
More file actions
107 lines (88 loc) · 3.82 KB
/
Copy pathpdf_extractor.php
File metadata and controls
107 lines (88 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
<?php
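/**
* Simple PDF Text Extractor
* This is a basic, best-effort implementation for demonstration purposes.
* For production use, prefer a dedicated text-extraction tool such as the
* smalot/pdfparser library or the pdftotext utility (Poppler). Note that
* TCPDF and FPDF generate PDFs; they do not extract text from existing ones.
*/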
/**
 * Best-effort, dependency-free text and metadata extraction from PDF files.
 *
 * The extraction is heuristic: it scans the raw file bytes with regular
 * expressions instead of parsing the PDF object structure, so it only works
 * for simple documents. When nothing useful is found, extractText() returns a
 * human-readable placeholder sentence instead of failing.
 */
class SimplePDFExtractor {
    /**
     * Minimum length, after clean-up, for extracted text to be considered
     * meaningful rather than replaced by the placeholder message.
     */
    const MIN_TEXT_LENGTH = 50;

    /**
     * Extract printable text from a PDF using regex heuristics.
     *
     * Strategy:
     *  1. Collect every "( ... )" literal string found in the raw bytes.
     *  2. If that yields less than MIN_TEXT_LENGTH printable characters,
     *     inflate the "stream ... endstream" sections and use their content
     *     when it is longer.
     *  3. If the result is still too short, return a placeholder sentence that
     *     names the file.
     *
     * @param string $pdfPath Path to a local PDF file.
     * @return string Extracted text, or a placeholder / failure message.
     *                This method never returns an empty string.
     */
    public static function extractText($pdfPath) {
        try {
            $content = self::readPdf($pdfPath);
            if ($content === null) {
                // Missing or unreadable file: report it explicitly instead of
                // regex-scanning a boolean false.
                return self::unreadableMessage($pdfPath);
            }

            // Pass 1: literal strings, written in PDF as "(text)".
            $text = '';
            if (preg_match_all('/\(([^)]+)\)/', $content, $matches)) {
                $text = implode(' ', $matches[1]);
            }
            // Clean BEFORE deciding whether a fallback is needed: binary data
            // can contain stray "(...)" pairs that are non-empty but hold no
            // printable text.
            $text = self::cleanText($text);

            // Pass 2: compressed streams, only when pass 1 was not good enough.
            if (strlen($text) < self::MIN_TEXT_LENGTH) {
                $streamText = self::cleanText(self::inflateStreams($content));
                if (strlen($streamText) > strlen($text)) {
                    $text = $streamText;
                }
            }

            // Still nothing meaningful: return a placeholder with the filename.
            if (strlen($text) < self::MIN_TEXT_LENGTH) {
                $filename = basename($pdfPath);
                $text = "This PDF document ($filename) contains content that requires advanced text extraction. ";
                $text .= "The document appears to contain educational material suitable for quiz generation. ";
                $text .= "Please create questions based on typical topics that might be covered in such a document.";
            }

            return $text;
        } catch (Exception $e) {
            // Reached only when something converts PHP warnings into
            // exceptions (e.g. a custom error handler); keep the generic
            // failure message in that case.
            return self::unreadableMessage($pdfPath);
        }
    }

    /**
     * Alternative method using the pdftotext command-line tool if available.
     *
     * Requires pdftotext to be installed on the server. Falls back to
     * extractText() when shell_exec() is unavailable, the command produces no
     * output, or the output is 50 characters or shorter after trimming.
     *
     * @param string $pdfPath Path to a local PDF file.
     * @return string Extracted text, or the result of extractText().
     */
    public static function extractTextWithPdfToText($pdfPath) {
        // shell_exec() may be listed in disable_functions; guard the call so
        // that configuration degrades to the regex extractor.
        if (function_exists('shell_exec')) {
            // escapeshellarg() quotes the path so that quotes or shell
            // metacharacters in a file name cannot inject extra commands.
            $command = 'pdftotext ' . escapeshellarg($pdfPath) . ' -';
            $output = shell_exec($command);

            if (is_string($output)) {
                $output = trim($output);
                if (strlen($output) > self::MIN_TEXT_LENGTH) {
                    return $output;
                }
            }
        }

        // Fallback to simple extraction
        return self::extractText($pdfPath);
    }

    /**
     * Get PDF metadata.
     *
     * Title, author and subject are only detected when they are stored as
     * plain "(...)" literal strings in the raw file bytes.
     *
     * @param string $pdfPath Path to a local PDF file.
     * @return array Always contains 'filesize' (int, or false when the file
     *               does not exist) and 'filename'; contains 'title',
     *               'author' and 'subject' only when found.
     */
    public static function getPDFInfo($pdfPath) {
        $info = [];

        $content = self::readPdf($pdfPath);
        if ($content !== null) {
            // Info-dictionary key => key used in the returned array.
            $fields = [
                'Title'   => 'title',
                'Author'  => 'author',
                'Subject' => 'subject',
            ];
            foreach ($fields as $pdfKey => $infoKey) {
                if (preg_match('/\/' . $pdfKey . '\s*\(([^)]+)\)/', $content, $matches)) {
                    $info[$infoKey] = $matches[1];
                }
            }
        }

        // filesize() would emit a warning for a missing file; report false,
        // which is the value it returns on failure.
        $info['filesize'] = is_file($pdfPath) ? filesize($pdfPath) : false;
        $info['filename'] = basename($pdfPath);

        return $info;
    }

    /**
     * Read the raw bytes of a PDF file.
     *
     * @param string $pdfPath Path to a local file.
     * @return string|null File contents, or null when the path is not a
     *                     readable regular file or the read fails.
     */
    private static function readPdf($pdfPath) {
        if (!is_file($pdfPath) || !is_readable($pdfPath)) {
            return null;
        }

        $content = file_get_contents($pdfPath);

        return $content === false ? null : $content;
    }

    /**
     * Inflate every "stream ... endstream" section that is zlib-compressed.
     *
     * The inflated bytes are returned as-is; no further parsing of their
     * content is attempted. Sections that cannot be inflated are skipped.
     *
     * @param string $content Raw PDF bytes.
     * @return string Inflated stream contents joined by spaces ('' if none).
     */
    private static function inflateStreams($content) {
        // Without the zlib extension there is nothing to inflate with.
        if (!function_exists('gzuncompress')) {
            return '';
        }

        $text = '';
        if (preg_match_all('/stream\s*\n(.*?)\nendstream/s', $content, $matches)) {
            foreach ($matches[1] as $stream) {
                // "@": gzuncompress() raises a warning for every section that
                // is not zlib data, which is an expected case here.
                $decoded = @gzuncompress($stream);

                // With CRLF line endings the capture keeps the "\r" that
                // precedes "\nendstream"; retry without it in case the
                // trailing byte is what made inflation fail.
                if ($decoded === false && substr($stream, -1) === "\r") {
                    $decoded = @gzuncompress(substr($stream, 0, -1));
                }

                if ($decoded !== false) {
                    $text .= $decoded . ' ';
                }
            }
        }

        return $text;
    }

    /**
     * Strip non-printable bytes and collapse whitespace.
     *
     * @param string $text Raw extracted text.
     * @return string Text limited to printable ASCII, single-spaced, trimmed.
     */
    private static function cleanText($text) {
        // preg_replace() returns null on a PCRE error; cast so the following
        // calls always receive a string.
        $text = (string) preg_replace('/[^\x20-\x7E\n\r\t]/', '', $text);
        $text = (string) preg_replace('/\s+/', ' ', $text);

        return trim($text);
    }

    /**
     * Build the generic message returned when a PDF cannot be read.
     *
     * @param string $pdfPath Path that was requested.
     * @return string Failure message naming the file.
     */
    private static function unreadableMessage($pdfPath) {
        $filename = basename($pdfPath);

        return "Unable to extract text from PDF ($filename). Please create general quiz questions based on the document title and common educational topics.";
    }
}
?>