Skip to content

Commit

Permalink
feat: support upload document (#136)
Browse files Browse the repository at this point in the history
* support upload file

* fix

* fix accept

* fix

* fix pdfjs

* fix pdfjs

* fix pdf loader
  • Loading branch information
634750802 authored May 16, 2024
1 parent 9eebb0d commit 490551d
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 99 deletions.
12 changes: 8 additions & 4 deletions extensions/pdf-loader/PdfLoader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,28 @@ import pdfLoaderMeta, { type PdfLoaderOptions } from './meta';

export default class PdfLoader extends rag.Loader<PdfLoaderOptions, {}> {
async load (buffer: Buffer) {
const Pdf = await import('pdfjs-dist');
const Pdf = await import('pdfjs-dist/build/pdf.mjs');
await import('pdfjs-dist/build/pdf.worker.mjs');

const document = await Pdf.getDocument(buffer.buffer).promise;
const content: string[] = [];
let content: string = '';

for (let i = 1; i <= document.numPages; i++) {
const page = await document.getPage(i);
const textContent = await page.getTextContent();

for (let item of textContent.items) {
if ('str' in item) {
content.push(item.str);
content += item.str;
if (item.hasEOL) {
content += '\n';
}
}
}
}

return {
content,
content: [content],
hash: md5(buffer),
metadata: {},
};
Expand Down
Original file line number Diff line number Diff line change
@@ -1,40 +1,50 @@
import {
ImportDocumentsFromUrlsOptions,
ImportDocumentsFromUrlsOptionsSchema
} from "@/app/api/v1/documents/import/from/urls/schema";
import {importDocuments} from '@/client/operations/documents';
import {Alert, AlertDescription, AlertTitle} from "@/components/ui/alert";
import {Button} from "@/components/ui/button";
import {
Dialog,
DialogContent,
DialogDescription,
DialogFooter,
DialogHeader,
DialogTrigger
} from "@/components/ui/dialog";
import {Form, FormField, FormItem, FormMessage} from "@/components/ui/form";
import {Textarea} from "@/components/ui/textarea";
import {getErrorMessage} from "@/lib/errors";
import {zodResolver} from "@hookform/resolvers/zod";
import {AlertTriangleIcon} from "lucide-react";
import {ReactNode, useState} from "react";
import {useForm} from "react-hook-form";
import { ImportDocumentsFromUrlsOptions, ImportDocumentsFromUrlsOptionsSchema } from '@/app/api/v1/documents/import/from/urls/schema';
import { importDocumentFromFile, importDocumentsFromUrls } from '@/client/operations/documents';
import { Alert, AlertDescription, AlertTitle } from '@/components/ui/alert';
import { Button } from '@/components/ui/button';
import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTrigger } from '@/components/ui/dialog';
import { Form, FormField, FormItem, FormMessage } from '@/components/ui/form';
import { Input } from '@/components/ui/input';
import { Tabs, TabsList, TabsTrigger } from '@/components/ui/tabs';
import { Textarea } from '@/components/ui/textarea';
import { getErrorMessage } from '@/lib/errors';
import { zodResolver } from '@hookform/resolvers/zod';
import { AlertTriangleIcon } from 'lucide-react';
import { ReactNode, useEffect, useMemo, useState } from 'react';
import { useForm, UseFormReturn } from 'react-hook-form';
import z, { type ZodType } from 'zod';

/**
 * Props accepted by `ImportDocumentsDialog`.
 */
export interface ImportDocumentsDialogProps {
// Caller-supplied node; presumably rendered as the element that opens the
// dialog (the rendering site is not visible in this view — confirm).
trigger: ReactNode;
}

/**
 * Describes one way of importing documents that the dialog can offer.
 *
 * `T` is the shape of the form values this method collects.
 */
type ImportMethod<T extends {}> = {
  /** Unique key of the method; used as the tab value and to look the method up. */
  type: string;
  /** Zod schema handed to the form resolver to validate the collected values. */
  schema: ZodType<T>;
  /** Label shown on the method's tab. */
  title: string;
  /** Explanatory text shown in the dialog description while the method is selected. */
  description: ReactNode;
  /** Renders the method-specific form fields for the given form instance. */
  form: (formApi: UseFormReturn<T>) => ReactNode;
  /** Performs the import with the validated form values. */
  handler: (values: T) => Promise<void>;
};

export function ImportDocumentsDialog (props: ImportDocumentsDialogProps) {
const { trigger } = props;
const [open, setOpen] = useState(false);
const [methodType, setMethodType] = useState(importMethods[0].type);
const method: ImportMethod<any> = useMemo(() => {
return importMethods.find(method => method.type === methodType)!;
}, [methodType]);

// Form instance.
const form = useForm<ImportDocumentsFromUrlsOptions>({
const form = useForm<any>({
defaultValues: {},
resolver: zodResolver(ImportDocumentsFromUrlsOptionsSchema),
resolver: zodResolver(method.schema),
});

useEffect(() => {
form.reset();
}, [method]);

// UI state.
const [loading, setLoading] = useState(false);
const [error, setError] = useState<unknown>();
Expand All @@ -43,13 +53,10 @@ export function ImportDocumentsDialog (props: ImportDocumentsDialogProps) {
const handleSubmit = form.handleSubmit(async (value) => {
try {
setLoading(true);
await importDocuments({
...value,
urls: value.urls.map(url => url.trim()).filter(Boolean),
});
await method.handler(value);
setOpen(false);
} catch (e) {
console.log(e)
console.log(e);
setError(e);
} finally {
setLoading(false);
Expand All @@ -63,27 +70,25 @@ export function ImportDocumentsDialog (props: ImportDocumentsDialogProps) {
}
<DialogContent className="max-h-[80vh] overflow-x-hidden overflow-y-auto">
<DialogHeader>Import documents</DialogHeader>
<DialogDescription>Import documents from urls (one URL per line)</DialogDescription>
<Tabs value={methodType} onValueChange={setMethodType}>
<TabsList>
{importMethods.map((method) => (
<TabsTrigger key={method.type} value={method.type}>{method.title}</TabsTrigger>
))}
</TabsList>
</Tabs>
<DialogDescription>
{method.description}
</DialogDescription>
<Form {...form}>
<form id="import-document-form" className="space-y-4" onSubmit={handleSubmit}>
<FormField
control={form.control}
name="urls"
render={({ field }) => {
return <FormItem>
<Textarea {...field} value={(field.value ?? []).join('\n')} onChange={(e) => {
return field.onChange((e.target.value ?? '').split('\n'))
}} />
<FormMessage />
</FormItem>
}}
/>
{method.form(form)}
</form>
</Form>

{!!error && (
<Alert variant="destructive">
<AlertTriangleIcon className="h-4 w-4"/>
<AlertTriangleIcon className="h-4 w-4" />
<AlertTitle>
Failed to operate
</AlertTitle>
Expand All @@ -100,3 +105,62 @@ export function ImportDocumentsDialog (props: ImportDocumentsDialogProps) {
</Dialog>
);
}

/**
 * Import methods offered by `ImportDocumentsDialog`, in tab order.
 *
 * Each entry supplies the tab key (`type`), the validation `schema` passed to
 * the form resolver, the tab `title`, the dialog `description`, a `form`
 * renderer for its fields and the `handler` invoked on submit. The first entry
 * is the method selected by default.
 */
const importMethods = [
  {
    type: 'urls',
    schema: ImportDocumentsFromUrlsOptionsSchema,
    title: 'Import from URLs',
    description: 'Import documents from urls (one URL per line)',
    // Surrounding whitespace is trimmed and blank lines are dropped before the request is sent.
    handler: async value => {
      await importDocumentsFromUrls({
        ...value,
        urls: value.urls.map(url => url.trim()).filter(Boolean),
      });
    },
    // The form value is an array of URLs; the textarea shows it as one URL per line.
    form: form => (
      <FormField
        control={form.control}
        name="urls"
        render={({ field }) => {
          return <FormItem>
            <Textarea {...field} value={(field.value ?? []).join('\n')} onChange={(e) => {
              return field.onChange((e.target.value ?? '').split('\n'));
            }} />
            <FormMessage />
          </FormItem>;
        }}
      />
    ),
  } satisfies ImportMethod<ImportDocumentsFromUrlsOptions>,
  {
    type: 'content',
    schema: z.object({
      file: z.instanceof(File, { message: 'Select a file' }),
    }),
    title: 'Upload',
    description: 'Upload txt, Markdown, HTML or PDF',
    handler: async value => importDocumentFromFile(value),
    form: form => (
      <FormField
        control={form.control}
        name="file"
        render={({ field }) => {
          // `accept` lists file extensions in addition to MIME types: many
          // browser/OS combinations have no MIME mapping for `.md`, so a
          // MIME-only list can hide Markdown files in the file picker.
          //
          // `value` is forced to undefined because a file input cannot be
          // controlled; the selected File is pushed into the form manually.
          return <FormItem>
            <Input
              type="file"
              accept="text/plain, text/html, text/markdown, application/pdf, .txt, .html, .md, .markdown, .pdf"
              {...field}
              value={undefined}
              onChange={event => {
                field.onChange(event.target.files?.[0]);
              }}
            />
            <FormMessage />
          </FormItem>;
        }}
      />
    ),
  } satisfies ImportMethod<{ file: File }>,
];

1 change: 1 addition & 0 deletions src/app/(main)/(admin)/index-tasks/page.client.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ const datetime = (cell: CellContext<any, any>) => <time>{cell.getValue() ? forma
const helper = createColumnHelper<DocumentIndexTask>();
const columns = [
helper.accessor('id', { header: 'Task ID' }),
helper.accessor('index_id', { header: 'Index ID' }),
helper.accessor('status', { cell: taskStatusCell }),
helper.accessor('type', {}),
helper.accessor('document_id', {}),
Expand Down
7 changes: 7 additions & 0 deletions src/app/(main)/(admin)/index-tasks/page.env.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

// Ambient declaration for the ESM build entry of pdfjs-dist: it is typed as
// re-exporting everything the 'pdfjs-dist' package itself exports.
declare module 'pdfjs-dist/build/pdf.mjs' {
export * from 'pdfjs-dist'
}

// Ambient declaration for the pdf.js worker build. It declares no exports, so
// the module can only be imported for its side effects.
declare module 'pdfjs-dist/build/pdf.worker.mjs' {
}
9 changes: 9 additions & 0 deletions src/app/api/test/pdf-loader/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { type NextRequest, NextResponse } from 'next/server';
import PdfLoader from '../../../../../extensions/pdf-loader/PdfLoader';

/**
 * Test endpoint: runs `PdfLoader` on the uploaded `file` form field and
 * returns the loader result as JSON.
 *
 * Responds with HTTP 400 when the `file` field is missing or is not a file,
 * instead of failing later with a TypeError on `arrayBuffer()`.
 *
 * NOTE(review): no authentication check is performed in this handler —
 * confirm this route is not reachable in production.
 *
 * @param req Incoming multipart/form-data request.
 * @returns JSON response with the loader output, or a 400 error payload.
 */
export async function POST (req: NextRequest) {
  const form = await req.formData();
  const file = form.get('file');

  // FormData entries are either strings or File objects; only a File can be parsed.
  if (!file || typeof file === 'string') {
    return NextResponse.json({ message: 'file needed' }, { status: 400 });
  }

  const pdfLoader = new PdfLoader({});
  return NextResponse.json(await pdfLoader.load(Buffer.from(await file.arrayBuffer())));
}
30 changes: 30 additions & 0 deletions src/app/api/v1/documents/import/from/file/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import { DefaultDocumentImportService, DocumentImportService } from '@/core/services/importing';
import { defineHandler } from '@/lib/next/handler';
import { baseRegistry } from '@/rag-spec/base';
import { getFlow } from '@/rag-spec/createFlow';

/**
 * Admin-only endpoint that imports a single uploaded document.
 *
 * Reads the `file` form field, stores its bytes under `uploads/<name>` in the
 * flow's storage, creates import tasks for the resulting URI and runs them,
 * returning whatever `runTasks` returns.
 *
 * @throws Error('file needed') when the `file` field is missing or not a file.
 */
export const POST = defineHandler({
  auth: 'admin',
}, async ({ request }) => {
  const form = await request.formData();
  const file = form.get('file');
  // FormData entries are either strings or File objects; reject anything that is not a File.
  if (!file || typeof file !== 'object') {
    throw new Error('file needed');
  }

  // Build the flow once and share it between the import service and the
  // storage lookup, rather than awaiting getFlow() twice.
  const flow = await getFlow(baseRegistry);
  const service = new DefaultDocumentImportService({ flow });
  const storage = flow.getStorage();

  // NOTE(review): `file.name` is client-supplied and used verbatim in the
  // storage key — confirm the storage layer rejects path separators / `..`
  // and decide how same-named uploads should be handled.
  const uri = await storage.put(`uploads/${file.name}`, Buffer.from(await file.arrayBuffer()), false);

  const taskIds = await DocumentImportService.createTasksByURLs([uri], 'file');
  console.log('Create document import tasks: ', taskIds);

  // NOTE(review): the meaning of `10` (batch size? concurrency?) is not visible here — confirm.
  return await service.runTasks(10, taskIds);
});

// NOTE(review): presumably a Next.js route segment config export that opts this
// route out of static rendering/caching — confirm against the Next.js version in use.
export const dynamic = 'force-dynamic';

// NOTE(review): presumably the Next.js/hosting execution time limit for this
// route, in seconds; uploads are imported synchronously in the handler, so the
// limit must cover the whole import — confirm the unit and platform cap.
export const maxDuration = 150;
5 changes: 3 additions & 2 deletions src/app/api/v1/documents/import/from/urls/schema.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {z} from "zod";
import { z } from 'zod';

export const ImportDocumentsFromUrlsOptionsSchema = z.object({
urls: z.string()
Expand All @@ -7,4 +7,5 @@ export const ImportDocumentsFromUrlsOptionsSchema = z.object({
.min(1, 'Must provide at least one URL for importing.')
});

export type ImportDocumentsFromUrlsOptions = z.infer<typeof ImportDocumentsFromUrlsOptionsSchema>;

export type ImportDocumentsFromUrlsOptions = z.infer<typeof ImportDocumentsFromUrlsOptionsSchema>;
31 changes: 21 additions & 10 deletions src/client/operations/documents.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import {ImportDocumentsFromUrlsOptions} from "@/app/api/v1/documents/import/from/urls/schema";
import {BuildDocumentIndexOptions} from "@/app/api/v1/documents/index/schema";
import { ImportDocumentsFromUrlsOptions } from '@/app/api/v1/documents/import/from/urls/schema';
import { BuildDocumentIndexOptions } from '@/app/api/v1/documents/index/schema';
import { handleErrors } from '@/lib/fetch';
import { withToast } from '@/lib/toast';

Expand All @@ -16,15 +16,14 @@ export const importWebsite = withToast(
'Content-Type': 'text/uri-list',
},
}).then(handleErrors);
}
},
);

/**
 * Argument shape of the file-upload operation that follows this type.
 */
export type UploadFileFormValues = {
// The file whose contents are sent in the request's form data.
file: File;
// NOTE(review): presumably the URI the uploaded file should be recorded
// under; its use is not visible in this view — confirm.
sourceUri: string;
};

export const uploadFile = withToast(
withToast(
async ({ file, sourceUri }: UploadFileFormValues) => {
const formData = new FormData();
formData.set('file', file);
Expand All @@ -34,7 +33,7 @@ export const uploadFile = withToast(
method: 'put',
body: formData,
}).then(handleErrors);
}
},
);

export const buildDocumentIndex = withToast(
Expand All @@ -49,20 +48,32 @@ export const buildDocumentIndex = withToast(
indexName,
}),
}).then(handleErrors);
}
},
);

export const importDocuments = withToast(
export const importDocumentsFromUrls = withToast(
async ({ urls }: ImportDocumentsFromUrlsOptions) => {
await fetch('/api/v1/documents/import/from/urls', {
method: 'post',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
urls
urls,
}),
}).then(handleErrors);
}
},
);

/**
 * Uploads a single file to the file-import endpoint as multipart form data.
 *
 * The response is passed through `handleErrors`; the resolved value is
 * discarded, so the wrapped function resolves to `undefined`.
 */
export const importDocumentFromFile = withToast(
  async ({ file }: { file: File }) => {
    const body = new FormData();
    body.set('file', file);

    const response = await fetch('/api/v1/documents/import/from/file', {
      method: 'post',
      body,
    });
    await handleErrors(response);
  },
);

Loading

0 comments on commit 490551d

Please sign in to comment.