|
3 | 3 | * Handles text file detection, reading, and validation
|
4 | 4 | */
|
5 | 5 |
|
| 6 | +import { |
| 7 | + DEFAULT_BINARY_DETECTION_OPTIONS, |
| 8 | + type BinaryDetectionOptions |
| 9 | +} from '$lib/constants/binary-detection'; |
6 | 10 | import { FileExtensionText } from '$lib/enums/files';
|
7 | 11 |
|
8 | 12 | /**
|
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
|
/**
 * Heuristic check to determine if content is likely from a text file.
 *
 * Scans a prefix of the content and counts null bytes, a subset of ASCII
 * control characters, and Unicode replacement characters (U+FFFD). The content
 * is rejected when the null-byte count exceeds an absolute limit, or when the
 * remaining suspicious characters make up too large a share of the prefix.
 *
 * @param content - The file content to analyze
 * @param options - Optional overrides for the detection parameters. Any field
 *   that is omitted or explicitly `undefined` falls back to the value in
 *   `DEFAULT_BINARY_DETECTION_OPTIONS`.
 * @returns True if the content appears to be text-based (empty content counts as text)
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) return true;

	// Resolve each field with `??` instead of object spread: spreading would let
	// an explicit `undefined` in `options` overwrite the default and silently
	// disable the corresponding check.
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;

	// Only the leading prefix is inspected
	const sample = content.substring(0, prefixLength);

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		if (charCode === 0) {
			// Null bytes are strong indicators of binary files; they are tracked
			// separately and do not contribute to the suspicious-character ratio
			nullCount++;
		} else if (charCode < 8 || (charCode > 13 && charCode < 27)) {
			// Control characters 1-7 and 14-26. Codes 8-13 (which include tab,
			// newline and carriage return) and 27-31 are tolerated.
			suspiciousControlCount++;
		} else if (charCode === 0xfffd) {
			// Replacement characters indicate encoding issues
			suspiciousControlCount++;
		}
	}

	// Reject if too many null bytes
	if (nullCount > maxAbsoluteNullBytes) return false;

	// Reject if too many suspicious characters relative to the inspected prefix.
	// An empty sample yields NaN here, which compares false and is accepted.
	if (suspiciousControlCount / sample.length > suspiciousCharThresholdRatio) return false;

	return true;
}
|
0 commit comments