Skip to content

Commit c25cd48

Browse files
authored
perf: chunk trigger and paragraph split (#4893)
* perf: chunk trigger and paragraph split * update max size computed * perf: i18n * remove table
1 parent 874300a commit c25cd48

File tree

23 files changed

+860
-165
lines changed

23 files changed

+860
-165
lines changed

docSite/content/zh-cn/docs/development/upgrading/4910.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@ weight: 790
1111
## 🚀 新增内容
1212

1313
1. 支持 PG 设置`systemEnv.hnswMaxScanTuples`参数,提高迭代搜索的数据总量。
14-
2. 工作流调整为单向接入和接出,支持快速的添加下一步节点。
15-
3. 开放飞书和语雀知识库到开源版。
16-
4. gemini 和 claude 最新模型预设。
14+
2. 知识库预处理参数增加 “分块条件”,可控制某些情况下不进行分块处理。
15+
3. 知识库预处理参数增加 “段落优先” 模式,可控制最大段落深度。原“长度优先”模式,不再内嵌段落优先逻辑。
16+
4. 工作流调整为单向接入和接出,支持快速的添加下一步节点。
17+
5. 开放飞书和语雀知识库到开源版。
18+
6. gemini 和 claude 最新模型预设。
1719

1820
## ⚙️ 优化
1921

@@ -31,4 +33,5 @@ weight: 790
3133
3. 工具调用模式,未保存思考输出。
3234
4. 知识库 indexSize 参数未生效。
3335
5. 工作流嵌套 2 层后,获取预览引用、上下文不正确。
34-
6. xlsx 转成 Markdown 时候,前面会多出一个空格。
36+
6. xlsx 转成 Markdown 时候,前面会多出一个空格。
37+
7. 读取 Markdown 文件时,Base64 图片未进行额外转换保存。

packages/global/common/string/textSplitter.ts

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ export const CUSTOM_SPLIT_SIGN = '-----CUSTOM_SPLIT_SIGN-----';
77
type SplitProps = {
88
text: string;
99
chunkSize: number;
10+
11+
paragraphChunkDeep?: number; // Paragraph deep
12+
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
13+
1014
maxSize?: number;
1115
overlapRatio?: number;
1216
customReg?: string[];
@@ -108,6 +112,8 @@ const commonSplit = (props: SplitProps): SplitResponse => {
108112
let {
109113
text = '',
110114
chunkSize,
115+
paragraphChunkDeep = 5,
116+
paragraphChunkMinSize = 100,
111117
maxSize = defaultMaxChunkSize,
112118
overlapRatio = 0.15,
113119
customReg = []
@@ -123,7 +129,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
123129
text = text.replace(/(```[\s\S]*?```|~~~[\s\S]*?~~~)/g, function (match) {
124130
return match.replace(/\n/g, codeBlockMarker);
125131
});
126-
// 2. 表格处理 - 单独提取表格出来,进行表头合并
132+
// 2. Markdown 表格处理 - 单独提取表格出来,进行表头合并
127133
const tableReg =
128134
/(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n?)*)(?:\n|$)/g;
129135
const tableDataList = text.match(tableReg);
@@ -143,25 +149,40 @@ const commonSplit = (props: SplitProps): SplitResponse => {
143149
text = text.replace(/(\r?\n|\r){3,}/g, '\n\n\n');
144150

145151
// The larger maxLen is, the next sentence is less likely to trigger splitting
146-
const markdownIndex = 4;
147-
const forbidOverlapIndex = 8;
152+
const customRegLen = customReg.length;
153+
const markdownIndex = paragraphChunkDeep - 1;
154+
const forbidOverlapIndex = customRegLen + markdownIndex + 4;
155+
156+
const markdownHeaderRules = ((deep?: number): { reg: RegExp; maxLen: number }[] => {
157+
if (!deep || deep === 0) return [];
158+
159+
const maxDeep = Math.min(deep, 8); // Maximum 8 levels
160+
const rules: { reg: RegExp; maxLen: number }[] = [];
161+
162+
for (let i = 1; i <= maxDeep; i++) {
163+
const hashSymbols = '#'.repeat(i);
164+
rules.push({
165+
reg: new RegExp(`^(${hashSymbols}\\s[^\\n]+\\n)`, 'gm'),
166+
maxLen: chunkSize
167+
});
168+
}
169+
170+
return rules;
171+
})(paragraphChunkDeep);
148172

149173
const stepReges: { reg: RegExp | string; maxLen: number }[] = [
150174
...customReg.map((text) => ({
151175
reg: text.replaceAll('\\n', '\n'),
152176
maxLen: chunkSize
153177
})),
154-
{ reg: /^(#\s[^\n]+\n)/gm, maxLen: chunkSize },
155-
{ reg: /^(##\s[^\n]+\n)/gm, maxLen: chunkSize },
156-
{ reg: /^(###\s[^\n]+\n)/gm, maxLen: chunkSize },
157-
{ reg: /^(####\s[^\n]+\n)/gm, maxLen: chunkSize },
158-
{ reg: /^(#####\s[^\n]+\n)/gm, maxLen: chunkSize },
178+
...markdownHeaderRules,
159179

160180
{ reg: /([\n](```[\s\S]*?```|~~~[\s\S]*?~~~))/g, maxLen: maxSize }, // code block
181+
// HTML Table tag 尽可能保障完整
161182
{
162183
reg: /(\n\|(?:(?:[^\n|]+\|){1,})\n\|(?:[:\-\s]+\|){1,}\n(?:\|(?:[^\n|]+\|)*\n)*)/g,
163-
maxLen: Math.min(chunkSize * 1.5, maxSize)
164-
}, // Table 尽可能保证完整性
184+
maxLen: chunkSize
185+
}, // Markdown Table 尽可能保证完整性
165186
{ reg: /(\n{2,})/g, maxLen: chunkSize },
166187
{ reg: /([\n])/g, maxLen: chunkSize },
167188
// ------ There's no overlap on the top
@@ -172,12 +193,10 @@ const commonSplit = (props: SplitProps): SplitResponse => {
172193
{ reg: /([]|,\s)/g, maxLen: chunkSize }
173194
];
174195

175-
const customRegLen = customReg.length;
176196
const checkIsCustomStep = (step: number) => step < customRegLen;
177197
const checkIsMarkdownSplit = (step: number) =>
178198
step >= customRegLen && step <= markdownIndex + customRegLen;
179-
180-
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex + customRegLen;
199+
const checkForbidOverlap = (step: number) => step <= forbidOverlapIndex;
181200

182201
// if use markdown title split, Separate record title
183202
const getSplitTexts = ({ text, step }: { text: string; step: number }) => {
@@ -301,6 +320,7 @@ const commonSplit = (props: SplitProps): SplitResponse => {
301320
const splitTexts = getSplitTexts({ text, step });
302321

303322
const chunks: string[] = [];
323+
304324
for (let i = 0; i < splitTexts.length; i++) {
305325
const item = splitTexts[i];
306326

@@ -443,7 +463,6 @@ const commonSplit = (props: SplitProps): SplitResponse => {
443463
*/
444464
export const splitText2Chunks = (props: SplitProps): SplitResponse => {
445465
let { text = '' } = props;
446-
const start = Date.now();
447466
const splitWithCustomSign = text.split(CUSTOM_SPLIT_SIGN);
448467

449468
const splitResult = splitWithCustomSign.map((item) => {

packages/global/core/dataset/training/utils.ts

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@ export const computeChunkSize = (params: {
120120

121121
return Math.min(params.chunkSize ?? chunkAutoChunkSize, getLLMMaxChunkSize(params.llmModel));
122122
};
123-
124123
export const computeChunkSplitter = (params: {
125124
chunkSettingMode?: ChunkSettingModeEnum;
126125
chunkSplitMode?: DataChunkSplitModeEnum;
@@ -129,8 +128,21 @@ export const computeChunkSplitter = (params: {
129128
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
130129
return undefined;
131130
}
132-
if (params.chunkSplitMode === DataChunkSplitModeEnum.size) {
131+
if (params.chunkSplitMode !== DataChunkSplitModeEnum.char) {
133132
return undefined;
134133
}
135134
return params.chunkSplitter;
136135
};
136+
export const computeParagraphChunkDeep = (params: {
137+
chunkSettingMode?: ChunkSettingModeEnum;
138+
chunkSplitMode?: DataChunkSplitModeEnum;
139+
paragraphChunkDeep?: number;
140+
}) => {
141+
if (params.chunkSettingMode === ChunkSettingModeEnum.auto) {
142+
return 5;
143+
}
144+
if (params.chunkSplitMode === DataChunkSplitModeEnum.paragraph) {
145+
return params.paragraphChunkDeep;
146+
}
147+
return 0;
148+
};

packages/global/core/dataset/type.d.ts

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ import type {
99
DatasetTypeEnum,
1010
SearchScoreTypeEnum,
1111
TrainingModeEnum,
12-
ChunkSettingModeEnum
12+
ChunkSettingModeEnum,
13+
ChunkTriggerConfigTypeEnum
1314
} from './constants';
1415
import type { DatasetPermission } from '../../support/permission/dataset/controller';
1516
import type { APIFileServer, FeishuServer, YuqueServer } from './apiDataset';
@@ -37,11 +38,10 @@ export type ChunkSettingsType = {
3738
paragraphChunkAIMode?: ParagraphChunkAIModeEnum;
3839
paragraphChunkDeep?: number; // Paragraph deep
3940
paragraphChunkMinSize?: number; // Paragraph min size, if too small, it will merge
40-
paragraphChunkMaxSize?: number; // Paragraph max size, if too large, it will split
4141
// Size split
42-
chunkSize?: number;
42+
chunkSize?: number; // chunk/qa chunk size, Paragraph max chunk size.
4343
// Char split
44-
chunkSplitter?: string;
44+
chunkSplitter?: string; // chunk/qa chunk splitter
4545
indexSize?: number;
4646

4747
qaPrompt?: string;

packages/service/core/dataset/collection/controller.ts

Lines changed: 23 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import { getTrainingModeByCollection } from './utils';
3434
import {
3535
computeChunkSize,
3636
computeChunkSplitter,
37+
computeParagraphChunkDeep,
3738
getLLMMaxChunkSize
3839
} from '@fastgpt/global/core/dataset/training/utils';
3940
import { DatasetDataIndexTypeEnum } from '@fastgpt/global/core/dataset/data/constants';
@@ -74,6 +75,8 @@ export const createCollectionAndInsertData = async ({
7475
llmModel: getLLMModel(dataset.agentModel)
7576
});
7677
const chunkSplitter = computeChunkSplitter(createCollectionParams);
78+
const paragraphChunkDeep = computeParagraphChunkDeep(createCollectionParams);
79+
7780
if (trainingType === DatasetCollectionDataProcessModeEnum.qa) {
7881
delete createCollectionParams.chunkTriggerType;
7982
delete createCollectionParams.chunkTriggerMinSize;
@@ -87,7 +90,11 @@ export const createCollectionAndInsertData = async ({
8790
// 1. split chunks
8891
const chunks = rawText2Chunks({
8992
rawText,
93+
chunkTriggerType: createCollectionParams.chunkTriggerType,
94+
chunkTriggerMinSize: createCollectionParams.chunkTriggerMinSize,
9095
chunkSize,
96+
paragraphChunkDeep,
97+
paragraphChunkMinSize: createCollectionParams.paragraphChunkMinSize,
9198
maxSize: getLLMMaxChunkSize(getLLMModel(dataset.agentModel)),
9299
overlapRatio: trainingType === DatasetCollectionDataProcessModeEnum.chunk ? 0.2 : 0,
93100
customReg: chunkSplitter ? [chunkSplitter] : [],
@@ -112,6 +119,7 @@ export const createCollectionAndInsertData = async ({
112119
const { _id: collectionId } = await createOneCollection({
113120
...createCollectionParams,
114121
trainingType,
122+
paragraphChunkDeep,
115123
chunkSize,
116124
chunkSplitter,
117125

@@ -212,88 +220,38 @@ export type CreateOneCollectionParams = CreateDatasetCollectionParams & {
212220
tmbId: string;
213221
session?: ClientSession;
214222
};
215-
export async function createOneCollection({
216-
teamId,
217-
tmbId,
218-
name,
219-
parentId,
220-
datasetId,
221-
type,
222-
223-
createTime,
224-
updateTime,
225-
226-
hashRawText,
227-
rawTextLength,
228-
metadata = {},
229-
tags,
230-
231-
nextSyncTime,
232-
233-
fileId,
234-
rawLink,
235-
externalFileId,
236-
externalFileUrl,
237-
apiFileId,
238-
239-
// Parse settings
240-
customPdfParse,
241-
imageIndex,
242-
autoIndexes,
243-
244-
// Chunk settings
245-
trainingType,
246-
chunkSettingMode,
247-
chunkSplitMode,
248-
chunkSize,
249-
indexSize,
250-
chunkSplitter,
251-
qaPrompt,
252-
253-
session
254-
}: CreateOneCollectionParams) {
223+
export async function createOneCollection({ session, ...props }: CreateOneCollectionParams) {
224+
const {
225+
teamId,
226+
parentId,
227+
datasetId,
228+
tags,
229+
230+
fileId,
231+
rawLink,
232+
externalFileId,
233+
externalFileUrl,
234+
apiFileId
235+
} = props;
255236
// Create collection tags
256237
const collectionTags = await createOrGetCollectionTags({ tags, teamId, datasetId, session });
257238

258239
// Create collection
259240
const [collection] = await MongoDatasetCollection.create(
260241
[
261242
{
243+
...props,
262244
teamId,
263-
tmbId,
264245
parentId: parentId || null,
265246
datasetId,
266-
name,
267-
type,
268247

269-
rawTextLength,
270-
hashRawText,
271248
tags: collectionTags,
272-
metadata,
273-
274-
createTime,
275-
updateTime,
276-
nextSyncTime,
277249

278250
...(fileId ? { fileId } : {}),
279251
...(rawLink ? { rawLink } : {}),
280252
...(externalFileId ? { externalFileId } : {}),
281253
...(externalFileUrl ? { externalFileUrl } : {}),
282-
...(apiFileId ? { apiFileId } : {}),
283-
284-
// Parse settings
285-
customPdfParse,
286-
imageIndex,
287-
autoIndexes,
288-
289-
// Chunk settings
290-
trainingType,
291-
chunkSettingMode,
292-
chunkSplitMode,
293-
chunkSize,
294-
indexSize,
295-
chunkSplitter,
296-
qaPrompt
254+
...(apiFileId ? { apiFileId } : {})
297255
}
298256
],
299257
{ session, ordered: true }

packages/service/core/dataset/read.ts

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
import { BucketNameEnum } from '@fastgpt/global/common/file/constants';
2-
import { DatasetSourceReadTypeEnum } from '@fastgpt/global/core/dataset/constants';
2+
import {
3+
ChunkTriggerConfigTypeEnum,
4+
DatasetSourceReadTypeEnum
5+
} from '@fastgpt/global/core/dataset/constants';
36
import { readFileContentFromMongo } from '../../common/file/gridfs/controller';
47
import { urlsFetch } from '../../common/string/cheerio';
58
import { type TextSplitProps, splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
@@ -179,11 +182,17 @@ export const readApiServerFileContent = async ({
179182

180183
export const rawText2Chunks = ({
181184
rawText,
185+
chunkTriggerType = ChunkTriggerConfigTypeEnum.minSize,
186+
chunkTriggerMinSize = 1000,
182187
backupParse,
183188
chunkSize = 512,
184189
...splitProps
185190
}: {
186191
rawText: string;
192+
193+
chunkTriggerType?: ChunkTriggerConfigTypeEnum;
194+
chunkTriggerMinSize?: number; // maxSize from agent model, not store
195+
187196
backupParse?: boolean;
188197
tableParse?: boolean;
189198
} & TextSplitProps): {
@@ -209,6 +218,28 @@ export const rawText2Chunks = ({
209218
};
210219
};
211220

221+
// Chunk condition
222+
// 1. 选择最大值条件,只有超过了最大值(默认为模型的最大值*0.7),才会触发分块
223+
if (chunkTriggerType === ChunkTriggerConfigTypeEnum.maxSize) {
224+
const textLength = rawText.trim().length;
225+
const maxSize = splitProps.maxSize ? splitProps.maxSize * 0.7 : 16000;
226+
if (textLength < maxSize) {
227+
return [
228+
{
229+
q: rawText,
230+
a: ''
231+
}
232+
];
233+
}
234+
}
235+
// 2. 选择最小值条件,只有超过最小值(手动决定)才会触发分块
236+
if (chunkTriggerType !== ChunkTriggerConfigTypeEnum.forceChunk) {
237+
const textLength = rawText.trim().length;
238+
if (textLength < chunkTriggerMinSize) {
239+
return [{ q: rawText, a: '' }];
240+
}
241+
}
242+
212243
if (backupParse) {
213244
return parseDatasetBackup2Chunks(rawText).chunks;
214245
}

packages/service/core/dataset/schema.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ export const ChunkSettings = {
4747
},
4848
paragraphChunkDeep: Number,
4949
paragraphChunkMinSize: Number,
50-
paragraphChunkMaxSize: Number,
5150
chunkSize: Number,
5251
chunkSplitter: String,
5352

0 commit comments

Comments
 (0)