Skip to content

Commit b58249f

Browse files
authored
4.6.4-alpha (#582)
1 parent 54d52d8 commit b58249f

File tree

66 files changed

+964
-529
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

66 files changed

+964
-529
lines changed

files/deploy/fastgpt/docker-compose.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# 非 host 版本, 不使用本机代理
2+
# (不懂 Docker 的,只需要关心 OPENAI_BASE_URL 和 CHAT_API_KEY 即可!)
23
version: '3.3'
34
services:
45
pg:
@@ -47,7 +48,7 @@ services:
4748
environment:
4849
# root 密码,用户名为: root
4950
- DEFAULT_ROOT_PSW=1234
50-
# 中转地址,如果是用官方号,不需要管
51+
# 中转地址,如果是用官方号,不需要管。务必加 /v1
5152
- OPENAI_BASE_URL=https://api.openai.com/v1
5253
- CHAT_API_KEY=sk-xxxx
5354
- DB_MAX_LINK=5 # database max link
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import { ErrType } from '../errorCode';
2+
3+
/* common: 507000 */
4+
const startCode = 507000;
5+
export enum CommonErrEnum {
6+
fileNotFound = 'fileNotFound'
7+
}
8+
const datasetErr = [
9+
{
10+
statusText: CommonErrEnum.fileNotFound,
11+
message: 'error.fileNotFound'
12+
}
13+
];
14+
export default datasetErr.reduce((acc, cur, index) => {
15+
return {
16+
...acc,
17+
[cur.statusText]: {
18+
code: startCode + index,
19+
statusText: cur.statusText,
20+
message: cur.message,
21+
data: null
22+
}
23+
};
24+
}, {} as ErrType<`${CommonErrEnum}`>);

packages/global/common/error/code/dataset.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,23 +13,23 @@ export enum DatasetErrEnum {
1313
const datasetErr = [
1414
{
1515
statusText: DatasetErrEnum.unAuthDataset,
16-
message: '无权操作该知识库'
16+
message: 'core.dataset.error.unAuthDataset'
1717
},
1818
{
1919
statusText: DatasetErrEnum.unAuthDatasetCollection,
20-
message: '无权操作该数据集'
20+
message: 'core.dataset.error.unAuthDatasetCollection'
2121
},
2222
{
2323
statusText: DatasetErrEnum.unAuthDatasetData,
24-
message: '无权操作该数据'
24+
message: 'core.dataset.error.unAuthDatasetData'
2525
},
2626
{
2727
statusText: DatasetErrEnum.unAuthDatasetFile,
28-
message: '无权操作该文件'
28+
message: 'core.dataset.error.unAuthDatasetFile'
2929
},
3030
{
3131
statusText: DatasetErrEnum.unCreateCollection,
32-
message: '无权创建数据集'
32+
message: 'core.dataset.error.unCreateCollection'
3333
},
3434
{
3535
statusText: DatasetErrEnum.unLinkCollection,

packages/global/common/error/errorCode.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import pluginErr from './code/plugin';
66
import outLinkErr from './code/outLink';
77
import teamErr from './code/team';
88
import userErr from './code/user';
9+
import commonErr from './code/common';
910

1011
export const ERROR_CODE: { [key: number]: string } = {
1112
400: '请求失败',
@@ -96,5 +97,6 @@ export const ERROR_RESPONSE: Record<
9697
...outLinkErr,
9798
...teamErr,
9899
...userErr,
99-
...pluginErr
100+
...pluginErr,
101+
...commonErr
100102
};

packages/global/common/file/api.d.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
export type UploadImgProps = {
2+
base64Img: string;
3+
expiredTime?: Date;
4+
metadata?: Record<string, any>;
5+
shareId?: string;
6+
};
7+
18
export type UrlFetchParams = {
29
urlList: string[];
310
selector?: string;

packages/global/common/file/tools.ts

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,14 @@ export const cheerioToHtml = ({
4949
}
5050
});
5151

52-
return $(selector || 'body').html();
52+
const html = $(selector || 'body')
53+
.map((item, dom) => {
54+
return $(dom).html();
55+
})
56+
.get()
57+
.join('\n');
58+
59+
return html;
5360
};
5461
export const urlsFetch = async ({
5562
urlList,

packages/global/common/string/markdown.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,14 @@ export const simpleMarkdownText = (rawText: string) => {
2626
rawText = rawText.replace(/\\\\n/g, '\\n');
2727

2828
// Remove headings and code blocks front spaces
29-
['####', '###', '##', '#', '```', '~~~'].forEach((item) => {
29+
['####', '###', '##', '#', '```', '~~~'].forEach((item, i) => {
30+
const isMarkdown = i <= 3;
3031
const reg = new RegExp(`\\n\\s*${item}`, 'g');
3132
if (reg.test(rawText)) {
32-
rawText = rawText.replace(new RegExp(`\\n\\s*(${item})`, 'g'), '\n$1');
33+
rawText = rawText.replace(
34+
new RegExp(`(\\n)\\s*(${item})`, 'g'),
35+
isMarkdown ? '\n$1$2' : '$1$2'
36+
);
3337
}
3438
});
3539

packages/global/common/string/textSplitter.ts

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,13 @@ export const splitText2Chunks = (props: {
1212
text: string;
1313
chunkLen: number;
1414
overlapRatio?: number;
15+
customReg?: string[];
1516
}): {
1617
chunks: string[];
1718
tokens: number;
1819
overlapRatio?: number;
1920
} => {
20-
let { text = '', chunkLen, overlapRatio = 0.2 } = props;
21+
let { text = '', chunkLen, overlapRatio = 0.2, customReg = [] } = props;
2122
const splitMarker = 'SPLIT_HERE_SPLIT_HERE';
2223
const codeBlockMarker = 'CODE_BLOCK_LINE_MARKER';
2324
const overlapLen = Math.round(chunkLen * overlapRatio);
@@ -29,22 +30,29 @@ export const splitText2Chunks = (props: {
2930

3031
// The larger maxLen is, the next sentence is less likely to trigger splitting
3132
const stepReges: { reg: RegExp; maxLen: number }[] = [
32-
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
33-
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
34-
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
35-
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.4 },
36-
37-
{ reg: /([\n](`))/g, maxLen: chunkLen * 4 }, // code block
38-
{ reg: /([\n](?![\*\-|>0-9]))/g, maxLen: chunkLen * 1.8 }, // (?![\*\-|>`0-9]): markdown special char
39-
{ reg: /([\n])/g, maxLen: chunkLen * 1.4 },
40-
41-
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.4 },
42-
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.4 },
43-
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.6 },
44-
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.8 },
33+
...customReg.map((text) => ({ reg: new RegExp(`([${text}])`, 'g'), maxLen: chunkLen * 1.4 })),
34+
{ reg: /^(#\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
35+
{ reg: /^(##\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
36+
{ reg: /^(###\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
37+
{ reg: /^(####\s[^\n]+)\n/gm, maxLen: chunkLen * 1.2 },
38+
39+
{ reg: /([\n]([`~]))/g, maxLen: chunkLen * 4 }, // code block
40+
{ reg: /([\n](?!\s*[\*\-|>0-9]))/g, maxLen: chunkLen * 2 }, // (?!\s*[\*\-|>0-9]): do not split before a line that starts with a markdown special char
41+
{ reg: /([\n])/g, maxLen: chunkLen * 1.2 },
42+
43+
{ reg: /([。]|([a-zA-Z])\.\s)/g, maxLen: chunkLen * 1.2 },
44+
{ reg: /([！]|!\s)/g, maxLen: chunkLen * 1.2 },
45+
{ reg: /([？]|\?\s)/g, maxLen: chunkLen * 1.4 },
46+
{ reg: /([；]|;\s)/g, maxLen: chunkLen * 1.6 },
4547
{ reg: /([，]|,\s)/g, maxLen: chunkLen * 2 }
4648
];
4749

50+
const customRegLen = customReg.length;
51+
const checkIsCustomStep = (step: number) => step < customRegLen;
52+
const checkIsMarkdownSplit = (step: number) => step >= customRegLen && step <= 3 + customRegLen;
53+
const checkIndependentChunk = (step: number) => step >= customRegLen && step <= 4 + customRegLen;
54+
const checkForbidOverlap = (step: number) => step <= 6 + customRegLen;
55+
4856
// If splitting by markdown title, record the title separately
4957
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
5058
if (step >= stepReges.length) {
@@ -55,11 +63,13 @@ export const splitText2Chunks = (props: {
5563
}
5664
];
5765
}
58-
const isMarkdownSplit = step <= 3;
66+
const isMarkdownSplit = checkIsMarkdownSplit(step);
67+
const independentChunk = checkIndependentChunk(step);
68+
5969
const { reg } = stepReges[step];
6070

6171
const splitTexts = text
62-
.replace(reg, isMarkdownSplit ? `${splitMarker}$1` : `$1${splitMarker}`)
72+
.replace(reg, independentChunk ? `${splitMarker}$1` : `$1${splitMarker}`)
6373
.split(`${splitMarker}`)
6474
.filter((part) => part.trim());
6575

@@ -76,7 +86,7 @@ export const splitText2Chunks = (props: {
7686
};
7787

7888
const getOneTextOverlapText = ({ text, step }: { text: string; step: number }): string => {
79-
const forbidOverlap = step <= 6;
89+
const forbidOverlap = checkForbidOverlap(step);
8090
const maxOverlapLen = chunkLen * 0.4;
8191

8292
// step >= stepReges.length: Do not overlap incomplete sentences
@@ -114,7 +124,8 @@ export const splitText2Chunks = (props: {
114124
lastText: string;
115125
mdTitle: string;
116126
}): string[] => {
117-
const isMarkdownSplit = step <= 3;
127+
const independentChunk = checkIndependentChunk(step);
128+
const isCustomStep = checkIsCustomStep(step);
118129

119130
// mini text
120131
if (text.length <= chunkLen) {
@@ -134,12 +145,13 @@ export const splitText2Chunks = (props: {
134145
return chunks;
135146
}
136147

137-
const { maxLen } = stepReges[step];
138-
const minChunkLen = chunkLen * 0.7;
139-
140148
// split text by special char
141149
const splitTexts = getSplitTexts({ text, step });
142150

151+
const maxLen = splitTexts.length > 1 ? stepReges[step].maxLen : chunkLen;
152+
const minChunkLen = chunkLen * 0.7;
153+
const miniChunkLen = 30;
154+
143155
const chunks: string[] = [];
144156
for (let i = 0; i < splitTexts.length; i++) {
145157
const item = splitTexts[i];
@@ -170,8 +182,8 @@ export const splitText2Chunks = (props: {
170182
mdTitle: currentTitle
171183
});
172184
const lastChunk = innerChunks[innerChunks.length - 1];
173-
// last chunk is too small, concat it to lastText
174-
if (!isMarkdownSplit && lastChunk.length < minChunkLen) {
185+
// last chunk is too small, concat it to lastText(next chunk start)
186+
if (!independentChunk && lastChunk.length < minChunkLen) {
175187
chunks.push(...innerChunks.slice(0, -1));
176188
lastText = lastChunk;
177189
} else {
@@ -189,10 +201,14 @@ export const splitText2Chunks = (props: {
189201
lastText = newText;
190202

191203
// markdown paragraph block: Direct addition; If the chunk size reaches, add a chunk
192-
if (isMarkdownSplit || newTextLen >= chunkLen) {
204+
if (
205+
isCustomStep ||
206+
(independentChunk && newTextLen > miniChunkLen) ||
207+
newTextLen >= chunkLen
208+
) {
193209
chunks.push(`${currentTitle}${lastText}`);
194210

195-
lastText = isMarkdownSplit ? '' : getOneTextOverlapText({ text: lastText, step });
211+
lastText = getOneTextOverlapText({ text: lastText, step });
196212
}
197213
}
198214

packages/global/core/app/utils.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ export const getDefaultAppForm = (templateId = 'fastgpt-universal'): AppSimpleEd
2424
dataset: {
2525
datasets: [],
2626
similarity: 0.4,
27-
limit: 5,
27+
limit: 1500,
2828
searchEmptyText: '',
2929
searchMode: DatasetSearchModeEnum.embedding
3030
},

packages/global/core/chat/constants.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,5 @@ export const LOGO_ICON = `/icon/logo.svg`;
5555

5656
export const IMG_BLOCK_KEY = 'img-block';
5757
export const FILE_BLOCK_KEY = 'file-block';
58+
59+
export const MARKDOWN_QUOTE_SIGN = 'QUOTE SIGN';

packages/global/core/module/template/system/datasetSearch.ts

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,10 @@ export const DatasetSearchModule: FlowModuleTemplateType = {
5454
{
5555
key: ModuleInputKeyEnum.datasetLimit,
5656
type: FlowNodeInputTypeEnum.hidden,
57-
label: '单次搜索上限',
58-
description: '最多取 n 条记录作为本次问题引用',
59-
value: 5,
57+
label: '引用上限',
58+
description: '单次搜索最大的 Tokens 数量,中文约1字=1.7Tokens,英文约1字=1Tokens',
59+
value: 1500,
6060
valueType: ModuleDataTypeEnum.number,
61-
min: 1,
62-
max: 20,
63-
step: 1,
64-
markList: [
65-
{ label: '1', value: 1 },
66-
{ label: '20', value: 20 }
67-
],
6861
showTargetInApp: false,
6962
showTargetInPlugin: false
7063
},

packages/service/common/file/gridfs/controller.ts

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
33
import fsp from 'fs/promises';
44
import fs from 'fs';
55
import { DatasetFileSchema } from '@fastgpt/global/core/dataset/type';
6+
import { delImgByFileIdList } from '../image/controller';
67

78
export function getGFSCollection(bucket: `${BucketNameEnum}`) {
89
return connectionMongo.connection.db.collection(`${bucket}.files`);
@@ -69,24 +70,65 @@ export async function getFileById({
6970
_id: new Types.ObjectId(fileId)
7071
});
7172

72-
if (!file) {
73-
return Promise.reject('File not found');
74-
}
73+
// if (!file) {
74+
// return Promise.reject('File not found');
75+
// }
7576

76-
return file;
77+
return file || undefined;
7778
}
7879

79-
export async function delFileById({
80+
export async function delFileByFileIdList({
8081
bucketName,
81-
fileId
82+
fileIdList,
83+
retry = 3
8284
}: {
8385
bucketName: `${BucketNameEnum}`;
84-
fileId: string;
86+
fileIdList: string[];
87+
retry?: number;
88+
}): Promise<any> {
89+
try {
90+
const bucket = getGridBucket(bucketName);
91+
92+
await Promise.all(fileIdList.map((id) => bucket.delete(new Types.ObjectId(id))));
93+
} catch (error) {
94+
if (retry > 0) {
95+
return delFileByFileIdList({ bucketName, fileIdList, retry: retry - 1 });
96+
}
97+
}
98+
}
99+
// delete file by metadata(datasetId)
100+
export async function delFileByMetadata({
101+
bucketName,
102+
datasetId
103+
}: {
104+
bucketName: `${BucketNameEnum}`;
105+
datasetId?: string;
85106
}) {
86107
const bucket = getGridBucket(bucketName);
87108

88-
await bucket.delete(new Types.ObjectId(fileId));
89-
return true;
109+
const files = await bucket
110+
.find(
111+
{
112+
...(datasetId && { 'metadata.datasetId': datasetId })
113+
},
114+
{
115+
projection: {
116+
_id: 1
117+
}
118+
}
119+
)
120+
.toArray();
121+
122+
const idList = files.map((item) => String(item._id));
123+
124+
// delete img
125+
await delImgByFileIdList(idList);
126+
127+
// delete file
128+
await delFileByFileIdList({
129+
bucketName,
130+
fileIdList: idList
131+
});
90132
}
91133

92134
export async function getDownloadStream({

0 commit comments

Comments
 (0)