Skip to content

Commit 807e8c6

Browse files
authored
Enhance text file detection logic for file attachments (#16199)
* feat: Enhances text file detection logic
* chore: Build static `webui` output
* chore: update webui build output
1 parent 1a18927 commit 807e8c6

File tree

5 files changed

+56
-14
lines changed

5 files changed

+56
-14
lines changed

tools/server/public/index.html.gz

487 Bytes
Binary file not shown.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/** Tunable thresholds for the text-vs-binary heuristic applied by `isLikelyTextFile` */
export interface BinaryDetectionOptions {
2+
/** Number of leading characters (UTF-16 code units of the content string, not bytes) to check */
3+
prefixLength: number;
4+
/** Maximum ratio of suspicious characters allowed in the checked prefix (0.0 to 1.0); content is rejected only when the ratio is strictly greater */
5+
suspiciousCharThresholdRatio: number;
6+
/** Maximum absolute number of null characters (char code 0) allowed in the checked prefix; content is rejected only when the count is strictly greater */
7+
maxAbsoluteNullBytes: number;
8+
}
9+
10+
/** Default thresholds; `isLikelyTextFile` spreads caller-supplied overrides on top of these */
export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
11+
prefixLength: 1024 * 10, // Check the first 10,240 characters (UTF-16 code units, not bytes) of the string
12+
suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
13+
maxAbsoluteNullBytes: 2 // Allow at most 2 null characters in the checked prefix
14+
};

tools/server/webui/src/lib/constants/supported-file-types.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = {
176176
[FileTypeText.SVELTE]: {
177177
extensions: [FileExtensionText.SVELTE],
178178
mimeTypes: [MimeTypeText.SVELTE]
179+
},
180+
[FileTypeText.LATEX]: {
181+
extensions: [FileExtensionText.TEX],
182+
mimeTypes: [MimeTypeText.LATEX]
183+
},
184+
[FileTypeText.BIBTEX]: {
185+
extensions: [FileExtensionText.BIB],
186+
mimeTypes: [MimeTypeText.BIBTEX]
179187
}
180188
} as const;

tools/server/webui/src/lib/enums/files.ts

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,9 @@ export enum FileTypeText {
5959
SWIFT = 'swift',
6060
DART = 'dart',
6161
VUE = 'vue',
62-
SVELTE = 'svelte'
62+
SVELTE = 'svelte',
63+
LATEX = 'latex',
64+
BIBTEX = 'bibtex'
6365
}
6466

6567
// File extension enums
@@ -115,7 +117,9 @@ export enum FileExtensionText {
115117
SWIFT = '.swift',
116118
DART = '.dart',
117119
VUE = '.vue',
118-
SVELTE = '.svelte'
120+
SVELTE = '.svelte',
121+
TEX = '.tex',
122+
BIB = '.bib'
119123
}
120124

121125
// MIME type enums
@@ -174,5 +178,7 @@ export enum MimeTypeText {
174178
SWIFT = 'text/x-swift',
175179
DART = 'text/x-dart',
176180
VUE = 'text/x-vue',
177-
SVELTE = 'text/x-svelte'
181+
SVELTE = 'text/x-svelte',
182+
LATEX = 'text/x-tex',
183+
BIBTEX = 'text/x-bibtex'
178184
}

tools/server/webui/src/lib/utils/text-files.ts

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
* Handles text file detection, reading, and validation
44
*/
55

6+
import {
7+
DEFAULT_BINARY_DETECTION_OPTIONS,
8+
type BinaryDetectionOptions
9+
} from '$lib/constants/binary-detection';
610
import { FileExtensionText } from '$lib/enums/files';
711

812
/**
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
4347
* Heuristic check to determine if content is likely from a text file
4448
* Detects binary files by counting suspicious characters and null bytes
4549
* @param content - The file content to analyze
50+
* @param options - Optional configuration for detection parameters
4651
* @returns True if the content appears to be text-based
4752
*/
48-
export function isLikelyTextFile(content: string): boolean {
53+
export function isLikelyTextFile(
54+
content: string,
55+
options: Partial<BinaryDetectionOptions> = {}
56+
): boolean {
4957
if (!content) return true;
5058

51-
const sample = content.substring(0, 1000);
59+
const config = { ...DEFAULT_BINARY_DETECTION_OPTIONS, ...options };
60+
const sample = content.substring(0, config.prefixLength);
5261

53-
let suspiciousCount = 0;
5462
let nullCount = 0;
63+
let suspiciousControlCount = 0;
5564

5665
for (let i = 0; i < sample.length; i++) {
5766
const charCode = sample.charCodeAt(i);
5867

59-
// Count null bytes
68+
// Count null bytes - these are strong indicators of binary files; they are tallied separately and do not feed the suspicious-character ratio
6069
if (charCode === 0) {
6170
nullCount++;
62-
suspiciousCount++;
6371

6472
continue;
6573
}
6674

67-
// Count suspicious control characters (excluding common ones like tab, newline, carriage return)
75+
// Count suspicious control characters
76+
// Allow common whitespace characters: tab (9), newline (10), carriage return (13)
6877
if (charCode < 32 && charCode !== 9 && charCode !== 10 && charCode !== 13) {
69-
suspiciousCount++;
78+
// Only codes 1-7 and 14-26 are counted; codes 8, 11, 12 and 27-31 reach this branch but are tolerated
79+
if (charCode < 8 || (charCode > 13 && charCode < 27)) {
80+
suspiciousControlCount++;
81+
}
7082
}
7183

7284
// Count replacement characters (U+FFFD, indicates encoding issues); these share the tally with control characters
7385
if (charCode === 0xfffd) {
74-
suspiciousCount++;
86+
suspiciousControlCount++;
7587
}
7688
}
7789

78-
// Reject if too many null bytes or suspicious characters
79-
if (nullCount > 2) return false;
80-
if (suspiciousCount / sample.length > 0.1) return false;
90+
// Reject if the null count is strictly greater than the configured maximum
91+
if (nullCount > config.maxAbsoluteNullBytes) return false;
92+
93+
// Reject if the share of suspicious characters in the sample is strictly greater than the configured ratio
// NOTE(review): a prefixLength of 0 or less yields an empty sample, so the ratio is NaN and this check never rejects - confirm that is intended
94+
if (suspiciousControlCount / sample.length > config.suspiciousCharThresholdRatio) return false;
8195

8296
return true;
8397
}

0 commit comments

Comments
 (0)