From 118f097c4e85015e98b948d78a9f92165ddd7381 Mon Sep 17 00:00:00 2001 From: Dan Lew Date: Wed, 21 Jan 2026 11:06:40 -0600 Subject: [PATCH 1/9] feat: Added "document parser" OCR strategy The document parser uses libraries to parse the text out of known document types. This lets LibreChat handle some complex document types without having to use a secondary service (like Mistral or standing up a RAG API server). To enable the document parser, set the ocr strategy to "document_parser" in librechat.yaml. We now support: - PDFs using pdfjs - DOCX using mammoth - XLS/XLSX using SheetJS (The associated packages were also added to the project.) --- api/package.json | 3 + .../Documents/__tests__/documents.spec.js | 62 +++ .../Files/Documents/__tests__/empty.docx | Bin 0 -> 6514 bytes .../Files/Documents/__tests__/sample.docx | Bin 0 -> 6553 bytes .../Files/Documents/__tests__/sample.xlsx | Bin 0 -> 6111 bytes api/server/services/Files/Documents/crud.js | 99 ++++ api/server/services/Files/strategies.js | 23 + package-lock.json | 423 +++++++++++++++++- packages/data-provider/src/config.ts | 1 + packages/data-provider/src/types/files.ts | 1 + 10 files changed, 606 insertions(+), 6 deletions(-) create mode 100644 api/server/services/Files/Documents/__tests__/documents.spec.js create mode 100644 api/server/services/Files/Documents/__tests__/empty.docx create mode 100644 api/server/services/Files/Documents/__tests__/sample.docx create mode 100644 api/server/services/Files/Documents/__tests__/sample.xlsx create mode 100644 api/server/services/Files/Documents/crud.js diff --git a/api/package.json b/api/package.json index 4542e25745aa..794457921a66 100644 --- a/api/package.json +++ b/api/package.json @@ -80,6 +80,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -102,6 +103,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -110,6 +112,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { diff --git a/api/server/services/Files/Documents/__tests__/documents.spec.js b/api/server/services/Files/Documents/__tests__/documents.spec.js new file mode 100644 index 000000000000..55d926c53816 --- /dev/null +++ b/api/server/services/Files/Documents/__tests__/documents.spec.js @@ -0,0 +1,62 @@ +const path = require('path'); +const { parseDocument } = require('~/server/services/Files/Documents/crud'); + +describe('Document Parser', () => { + test('parseDocument() parses text from docx', async () => { + const file = { + filename: 'sample.docx', + path: path.join(__dirname, 'sample.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + }; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 116, + filename: 'sample.docx', + filepath: 'document_parser', + images: [], + text: 'This is a sample DOCX file.\n\n', + }); + }); + + test('parseDocument() parses text from xlsx', async () => { + const file = { + filename: 'sample.xlsx', + path: path.join(__dirname, 'sample.xlsx'), + mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 264, + filename: 'sample.xlsx', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', + }); + }); + + test('parseDocument() throws error for unhandled document type', async () => { + const file = { + filename: 'nonexistent.file', + path: path.join(__dirname, 'nonexistent.file'), + mimetype: 'application/invalid', + }; + + await expect(parseDocument({ file })).rejects.toThrow( + 'Unsupported file type in document parser: application/invalid', + ); + }); + + test('parseDocument() throws error for empty document', async () => { + const file = { + filename: 'empty.docx', + path: path.join(__dirname, 'empty.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + }; + + await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); + }); +}); diff --git a/api/server/services/Files/Documents/__tests__/empty.docx b/api/server/services/Files/Documents/__tests__/empty.docx new file mode 100644 index 0000000000000000000000000000000000000000..c089246167a1de579f2a12abd7a4ae9fd789d725 GIT binary patch literal 6514 zcmaJ_1yqz<*B)Z%PNf^^?vRiMr9(hqq?w_+L!<>^Xr!b|x>FjY8$@DAkw!rB1Mm0U z>-GQd-LuxLnOU>XdC%VaJp0*?suBVs0RROB1(55+Zvgnk@a{gl**ThW*t$SKW{y_2 z7VPd2a4uBI_9G|G>t~}PRQuF$HKDSIL^8X1O|{4mz;U|gxfPwTQx__1G%dn$ExhPk zyPmiu9wQ1U**8I96KBPq_emJ_d9P95<*w}E!_Gg2~n2v$|Ihsh_{(4TCWiax%~Z zzvf6`#AjNj&7G^qCJo7|z7Vqpy7qe|lwhzC%-le^q}Zh~Wt$9ibtW{8o}+B=nV!v1 zyDCm!p4KYyzb?o2(LPGEw2RT>g^Cbs>uefJll795=+cnLB;xsQl(48#IR`Nx-zYic@o%%X5(EW*76sxhc{WcyUk~w5Y%arIb8i{tMTp%46gN*q z6uhsrO9j0o?MW74@JJa}jZj0^d?;jx`?8~4O&el} zl3cu-U^0Kxg~CH;hksOLPha5d($l57HG65jFS|F_GOU*OsDlIbFi$QM(4hL~aRXx5Ub1!>Rz0Ykk zLa9pWJQP_rTv=+MbD@mFIu<)udz(cks8*3>3Sn*(x4y^|{rWLrluh$AcdRJ@3xthLSYr*0oDM?+qm}!YgT7w3#P$*9gJ$W z-g$aYi*O8N3oYVDmlLN^@q>b8FsXe(3Vh40uIT19K~zS$u`BX3R*lr;JsUZ)aW0|3 z2^ghyGM4noY-s&Y&b9~z>G$=HzY3&QrSQ+Bik-51XxuE555(VyX9OWQ#6Q(-C*pdP2R-l`ZpFhK$4HufsFO*3#=Wse zwEK#Lk-IEdM*~eL#J6APCj{|KAFJ=L(GeTrW;0I;K5)be)bh_lE)YUAn|pCUX^h30 zl$;p%&0A&_PZ3cpWpb&es`zApy^X~lloOxv{`!f2)7T!ilO-{u{rB(U?H5Sp4cG_n z=*FuMiXho_dby)Z!PEfSOJ7b*oc4VmrE^)28Q?SNPTYFPz`o z>*9JWZ!#RY$epZg=Xh^pYGFvZh)vQl z^SUrCckHtQbd_f&UM)~PW+sxhNQ789C(~Jjb&;Tp8K#4)B=m?36gV~gU5vw?_4Uv@ z<G1zl8xDwo_n&ZxG~V1 z`hanFpimfp^9Z=YKk7>)lmH)+6xv{n8>_CrA2A#*c;bm-i4!lEfpfaaN~-`jf%a1U zJzUX7>$H>pnB$}^wFn#CHopAWcVGy7S1ma(Ez4pt1NK7iGnFMrd>AwJh~yJg6fO8Z zA;gc_gyH-CWm$?F?sF{fNOK-^s!KE~se6nMW3|XOUMc#&Cx`stotNw2ZdE!v6smp> z#A3{cyCR|z#N>n^(44qabWWwaV7++;LnK5_+?0Im58n{``k*IjmnkW=2+xvo;2qiZ z)Hw-se;eUp_LYRI?a6L-$C)MEL$z_G#+W0Di5waeV~r~DAat;%X@Yj7E659D=UYH3+^<^@-fsK8zaJ$6p`zxg{R&t4l6|*7Y3$i zOw2PHl*t_0!8xJN{TifSc?V~A*Sn6L^^6{WvdKyl@kM08dtLyemC7SfijKy0PbTZG zzYJ*5YA-YN)u+Jvm|2bAn;z&e2r?mb_2jWil}J)dY@s9H zorNid`yBM2JgEo@0wLe#&(?FXwb~#ZJ*$Pmv0RsdTOK&=RDB6CwxfLN*Ptw%lP0P# zV5CrR5Xm z4`|ZRdad-qI`c0L2QZ6k3a&6v$O`2RR)Ks- z8cXnONKm}+;f;prkDDmVS{ICCIpoqKz?Aobl&p^oxW_PCkd?z08d;m5siC+i!9qy` z@#Br@c^tV}72l{AqKPV|iZbe|G_oBsMLuJd&P#|Od=51EC^X5g50g+{MFnDncDz(X zvwL}(Y{^^WH)-3!}>+E?kQ@ zrnK~nKx2B~62c@sqS>tM!H4Sk@vLMIuEKZG@&@DZ1r({1?`o^niJ7Ad0&c6*X6z&(IljUNr(ZJci4qZ zrW$5O`*zk9JF-!boy5>f(HJHhEYl{*@vd$eAr)+(Qt1jb@h3F49ed;l?Gr(5 z!<@lQ?O;i(%!QMl$BCkkgtT)GqHZkws6D?XCRc18x7#Daq{O~36}MZ$lxOvlErG+< zyr|8tJy($Kv4OR8#L?iuL z6DRm?9txeCshprx3`1o$&eYc7}nIxE zD2yhefQzU8vY^1TruZZLI@`7T+s8kNDx2j9O~u_gnD1E2KcDmt@ctN;oh{8EW*k4? zx$bdoS5L`qfuF1md)AH0+0JZE`zxYL_-w^OsR}8bZ3u-`Si-Dow#2xCH|adgAFi{} zB!qAl5@MnsKs1|s$vY!Mpp^Q`Y;R<54n!rix~##SLZqCA01X)B8|*CM2gdy2sQ;R6=*^PAkM zcyq#x04Z5m3(%Qlgqxe*v}=x!wi$RZ%?oG#NPn;_Ok?rGI{T2chI=eQntaWS* z?kQlVI}!`10$k<`tYQ2S%SfD{t|#vq$WR3FNi!}8eYmz>MQJ4N75HZFu~_5n;8+l{ z+?gN0s$r)?rbf&8O+*`|Pxy`J(PBi1QSzaB@9oWmMceg7?!yA-<>L>Ow}&T($9Cek z#~~p|=datP$;C5q-)=DoNr;HTcVfWzsisANa6m;G;I4o!uBTQQTj^*lTdDY;%Hd|H zwXnb7j>0Rq7sI>=gWaULv?l@06taZNM4Kg@!DD$lg{+I{FhNdq;+N!gL5Hmhu5RqD zEe`Mv$||f))P)5NLsV;mR9tG0e0t7MxMCJ_t5;{(zM``Kkk4u&aN*=@G?fS$PX?&wC-9Nw85 zD3AQgVe&W(%8;TDQdc_@uEuhjt(&DQq6yCSVpzlc)bQr>n%5$ZT$q{#Nr><%Z38k> z_!z&AEWLY;)VNR=T(456PY9k%?R=|B&}bUfc3cH#JmSifXIeQfgTvSgON(&!p+l5N zU~AM!lY?}z_s#D6TeZP20LCP1^Hw#E&Y`+Zc4Bd#%cf%nirxi8v{fG`-=(yr?v3V> z^pQ6Dk;d{;%4#0|v2l&FQ104dhRDw`dbbJ2=}RHq;8@)FkSn@ZCe&qt=!nBT^7h|R zwN`9H@o`|Lc<_4VSagc@uZn|fpl+^@bB H=c6D`}bH!T>fyBZUZNPEA^;cB%`1m zR>6GiE$N($$m!}Su$0_}Ud1XtJE=vgv{prX?XL}%@&unkGcLp|I57uFG2UuSfCxYt z4-)t;(|Sk`^Sl`7hkN^LfQ$8B^S$%O17k~u{E6%_#~W$3 zZ$Cz3$3oq$`v<_!MPV;xx@@;7GpR`<$T2(xj=vohzItku@f2~*wMxF(dWEQl>vkZo z;NhoO9OAhztS0~httE=7mIUc9W#4}Wwpdglhb?DbKsTeN8-)M zl>#fCJmx>bm5mdyYh}z2VlLxAeWqXV9Oabtg(KYhP}FYiNv4IOiEdAzsXv~wQ`NN4 zfzy|@f$&a zCor+F6K1IGKEO{;A*2cKmhWdMc{5)HHM~W>Z>-eMvtO~^4aW=Q|JH8s@7j&N^qnSh z$DWNeJ?zb#4DXvw4OHdnPb$(*PGhGAO8`n0SP@Gkwr7*I*D#g_->xM?PfwM{`SG;g z;>4(!ZVok{MX`AZFL~&IsozIs;l(+!qM-$A(cWlN}cJR(NW1Ytkgs4Cue z%mARC!eMf5k$jyLKGRfs7(>!F84E5Eqi4lMumQHfvAd;xb)}ccl*Ce`S1R} zYF_t4&|e0-+wrIWuQKfK?!Rg$f8Rdv?qPqr|4k|SB{%RKH1?vL&N)LVXc z{*^2L{n>TU{_FfJGydKESH5^psQ)q~%>TOo1HJxzrC<5jJ-__RB(eU+Hh*{g^=94E ywZDwzE*d|N|2uj6eTiR-s5j04 literal 0 HcmV?d00001 diff --git a/api/server/services/Files/Documents/__tests__/sample.docx b/api/server/services/Files/Documents/__tests__/sample.docx new file mode 100644 index 0000000000000000000000000000000000000000..c7e1c02b65495f5241283a648ef6f9828fa3a6d0 GIT binary patch literal 6553 zcmaJ_1yqz<*B)YE=nm-t>6T6j>5y&&N7|vgq(m4(5Qav&OG3J(LApVtOG+9=-~;dX z-Rt%L@7=T3teIJ}&w0<@`#k&EkD3w^G9ds20s;7(_>BO+82;^NcY7x@PCHjyh`E!s zoh65dEi@OdWcPs!_toPO5t;)qLUpJtGO^5FUSka^DKu8^BDbPr^~{w92VI+JOdCJy z#=bjtk;jAzPO&Q}{M1FUJ3A4xF7FlE+uXICV^={sq)kblXxU>pMLpd>SBFpfmQ<?CEIAEr#r4?VgTCUGdrIK zyD3gxp4BMvzbeP^)j3MFvX9p1g^Q5r=x&-yQ}j@h>CuwOB;dc;C}CBnaS38xa0J>o zMM>=!Su<*RN7s<3x-qX&c#%9u7X7S)Aoks}ur@0;{2fSg9RrFviz?|XVS;2v9mwl4 zFpIH8$!lSu&nRN@Cea^LvYNrw{c6_H$2=j5g3aD~_QuNI3VnX=ssd+k1?6@6K9aH> zVEhtLWe8Xgc@e?H#3>ZG(38YZp*|dB2f(r0QT*WCS;XvNU{lt%r*<7j@I=WLmfZQ2 z21yLd44=IPK}`u2^wMGYUN!;%kcq+XOrR%y9g?)7))_(=!g* zwy1pMGnff_l~WYFM~x(@Oy9gLmCS!*2C_HltAS#_eVwT*f3yFB_1!dUnu$1&datlT zxewUSz)YqNBYIk?ml1V3e}NWRf~4Tjsz9>EaT5fYC8#my5@ z1vZXCfQWjsqWvcpy0&kn%=icvy4&13QbEtiyOTs1Jd=mi!qqXf?g`oBJ!>yl*ReGQ zB^B?*Kb^bo1o7OrCpao{pf7N7?e0|DAI`{OPpWytu%GMs>>l{iPptFF*Dd!E=}J5# zR1>TAlt-xMq5SBY&zrx`k%~a#v6)OIT=1k{JEZZXJp#m1GVt~Q3kJjv>*up33$7)< z8|aCz(L`}a-8o{C0JL)A!3aDCv#%G1&3G!AIpGJw{QHlok!M@VZ_4HgVt z8E!2!cqqXLqXLD^+k+O{GL0pQBk%KxZsablwQ?`r{F*}Cj(`^9^MG$erf%d*+hqRR zo4yg9a%oPa^^rU~f^0)UPXVZImPpHnQgh6DN(+n*V(^>(!vS1!HP0`o50<=WvU6Ka zKvl^dhayYH%Zv5*U8y6nkHs$3-(=nwRIkW1vt?-zxA~qY`sG8w2)ov&(x+8UigK&d zqP8CgWhUO~4CYQr=Jj&YgC2ew+AndJvEtHxg~jP9pOm`3WWFjp4o_ePFvtZQg+ zJZ5RFj1_$nJ9^)v^DQDl`UCwVsD5cMT7Cp!a2TK7D(kk-<8fzeZoSj!4!KxCtI3<` zI_6nHDS~sUV&|-GT6e3Yui~%8(}R#4<5cz9h`E{a;9vcQTJW*PFcYWl>1Ke>A8hOs z@4X~rd{7pwtBEcY@}f`oCj{|K9cvt{-6t`@%VL=jBz3|L)b`IrEf7LBpMCO`+7z2B zF)1N-*GFarUlCa>d1A4;s`#Xzqm|VGk`tGnef7w&adaQY*@}eG;q+9z?K?_&JuQ1hRdw2-i8H^)Wz z>YeGi|52+7D4LsB*tPTwyr*R7j=%^56Th~d#fBSS?RKJp(U{b7s-OG$NBL4ww~F`h z+b@dcnU%{*L1Z_FzRo$LgvzpYy9l|zc<57?ER0m>-z`BZ<7^q8bG*MXIX|de#4hQS zaaEX_JNiihzQQvdryi&gJsm+;BtoK`li{Mtwm{g)vZ{-xB*a7k37nid72|YZdo}o0 zc_c~3C|3|)xia_x2TzUO5@&v=<5LWBvTG011}C0cLsVh zUnt%V9KOomGz_iqkNg|~CnT^<46QfCi_tJV2p$Av?AMQ1k%>?fHYFeWBi6^f`q~}2$DEi_gl|RN|CZuv z@`4P0u#I#t>xYD!-N{~7`?(duJ@ql(E}W++HF17ErgPrqNdXaDX(F|VK{{?c;bx@) z5=DlvObbQ3!jkSr6#5&hW3m(GwaeUfReoFKt;+Xc8)z9;x9zLml+x5b@CTtlQ)kHc zcSt?_FDxFB(`bH#)OSru+9@Ucz@GwDug@7v%147YHim_%sUp@>3eS9&9hZwZzZ;pM zGqX% z-DN<%c3YY83qvaG52?1Iv#|vU!te1nw0L=_F# zRoMjN-sq4+0=j?5SjIzVgxE&CjiJTo@+L^yf@vz3EHY0sqdC?x)1Oms1Z@t@9KX(o zYQJraA@kmgVS!d33CU^QWE5P71E1 z`?ZwHd(X2dp*0qwb1a*Hq-bGsOCnXh1Ui@vGuq;d}xYw@3n4T3y= z0-sq`3iJKick-wrC+zEDgx_O8MK+yd8g`gh^albs$adba89bI zLcfWAf$(IBa9VW+-3^-z=YjIv~BzK&{`fW~eCK3R^ zPV{e}{wL=!vw!AlYi{TAU(Ruh>6j9&rZ=Wr&5jf~#+JO>(m~irZfJAZGpuw2HMo8F zO!U@gr`ZSA8icAAq|pjtA@D~^4UK-2EHmgXV}v}!k=<;#KmizA<-N7M9XJL1Uyida1%4AfsdZ8#qQ>hT zurFurRl*fTJzUt(08%d8ByX4Aj7>;Xc(byPECfIn*AO7Jo&( z$C@0uH{k1qwzq^6Q8tXEtO^j=M6i2|VAS-a8`O6kO&5DCC_pJ!bJ@=2k6P;|ec5y;+q?PD3RN z;g*>Ed*lK*&$TczRpSOUCC%GIwa2k;k}vaO*A%Z6jt*(~~vot5qM8g@@P{ouVK zXsuY&xW(P9Un+y~JHFsX<_h~gvkVwVfvO7z7`AV4*1I4y;|h-Gz)TN!mLEqe(#z&n zy~A@YqwhxBf1WXzcB4^}YKJskJI=xfesXM2e2i~!3R^-I%&*hJWOF?<+~d@*Y%DccZjx;^*tQHu8rvCPb( z>c-72b4CTV5Y>?;+)|a5f2ArE|La*MPUcW&&R=Vr9EhIc+na`>N%im$(PHr7OoGr8V3No~OO$AHTW@ZZLY_qyNx2)fza$^vP==12wD72y3%F*q% zM%i4h`;4HVFGz235ZxY#Cfs84STNkkZOlxH7U&3Thda+`Ca=BJ$>~RRF@NygN`}LH zf*0h8N4A|;R?FK$-u6&=j1{Fq7c2TH+C|h}hphiCQHQ51m*j5fT0o2TxwpunrT z_ygiP`<2I=hd=o&yVWpl#qBv*ZpqC*pY#?N|Cp#51scu~i9uUsmFG(Qe+9CxwrPd{SZ5Vauw;qF-gZ-Tok+#8V35!M{TL z0X>hl+&2Kn-@bF#tF#6pm)K1l=7a~!jc{Bi+Id0RiM3~>LO&x;gqx6S_H#PFrPL-_iT}J|c z*yN8p>at#RV=fxC)!MKQMtT7kQe9#ybN{cST?WRe(!; zfz^!PVi-x{HT31Z0vU>IeN#;fLhr4uS5cdYdk4PWe<;>)GcX#2DtGS3uV&ogn4#Hx zaUI@D?HhLOb+iy3Vv=;I(Q|V>ZrOVEJ@;OL%hK^X>YKxp!()5#o8yoWl#5qw(v;#E zcyG2Cgd{{n5j!vud(~1SA-IqtO=xF8C%39K=2jXy>sAWE$8v;eur|(Tyb(m@w&GQ9 zqF{GvZk-806O}B{67gn9NAPIgP9fU@#;PC}2FWwZ+MvT01vht&mS#u9dSw;1MsQ(4 z{UFVnAPu)VlW+Gqh&y^A_x;K=`xi8hZ}ORqgsxnC4Q3J{{mL$pvbYH_1<6qNE&)S# zwGbYHOO;9!87Xa33y6YVSETNt-g0jN_v^&hO##|g8cy!CO<^6mf%2#?9Vd=qaE4?< zh=%&P@Oy0MncA8AMYO?L-VAG4AM0O#TJv7Om4m5Ul7$GL(bc0ehmG><%F=sOON|M2 zBJ?PA_=e!S*37l21dXK9Y{yn`#UU?GdZm`*F*uGcvo;H79Xdve1hzyDH#$le`&{p3 z->46K1~4YtShT2fb_~{Ta*&AoUN#;xQ1#3sqp$cnzgSFN?AmB5NgHmZA8sfwrGC%D zKRTv)9{QlBm?7d*wEj)JY1(2)7c>ShF677kmrucEff&d`-SQ5nXxhtmp#-?AX84Hu z<=FQX>s}TISHs=i9_E@l*KDYA#`$;KgkOGhl5T~@Lo4-ZTqPsn?bgA39L;H5jHqcE z$*ak^^*xGJeD+ccG^s6$4mw}zt>g(mhNgcPv*f}WAjf>8ISwI&q?5+;U8Z)EALe;8 z&=2+WRRbC4!zVjezF<_`__@nhRs4X1(xJTkLbkp%ras z9U(sVq;?`aShNkn09SrLm5NDjC(qT69nWdUH&Rn8hWzoYQKxHZ_T6tIF{7a#Hhuk2 z1JTuIGM#o?)EQv%a7s)sf#cny!k4Nh>8i+UZdLM4Hp|4-+&BGs1@}J2;F8RK9)qNJ zR5=z(^N&wdC2U=X!6b^Uq1*zXSkWC3i5#OA^Q@*Kk&{Q|yho&m)Em@|>D;PaY zs>@0Ie5kmxOKb^gvxsr_p<1t$iBE$Z?$qrP_NN$IeEdA#uuB-=1xzUHST)x1=;x=W z64F9+&-XKyyq>Fq8{eSbHCC`e)=ReA8G4@b-`WkqZM!j)zLidHNw$fWr-Qk(@m;g2 zhO4Olq#|vUwD#(&@j$5pYZ8fswk(RaYR1ywo3$kP*_rYfKfd-G+-McE&B3PgNOn)* zMNeHQI6E>EKh}v29VARfJ{9L-$RX>VC85#IM1rIR!C~5{D&BTV2VmG?>%p+p8xrtO zQJOwHU;vd&aC^%%4L(nTztDXiEwFl6I>C*LOs2jAwiuTy&|WR@5d2|Z%9^R+q{9l~ z=(EW3n5*z;gPtFu`p%mUnaWm_m7);1uGZ&96gNngp;Vd1a)L*3CtD+hh72tnjIr#X zA9jv~^t$+$gE?sok^GX@=qzHLCj5KdH;rKuoyE_-6!)K$h|c%dXrxUXdBk_njC^i) z2fXFC@)>-hRPE%Dl#zfM42m?7+&DE4eL_W;rQ1}w{6OC1D2k%EyZ9p@5(55`(0^54?*y#B+y52RfA{}Y8oQg7{<78E9e?`& zs@DGQ{;S0D_w563ANHsF-_*3<{eK|`=U%B$%pIsOIzs|oh z?{qsRY$ivAorP~x)52YafLc0XKKIUM|NUH)i>(+(>CoUjaq9WRW z86yvR)Fm9e0Oy?A6}DI2B2lGVs#dh#oC`%Xg!kx*S4}yAE6&)bJW{>`7JkhzlgLKo zZZXrv{I2Tk^UGdl*+ta%LmQ0MZ^+3^h*HeGE2f-g$3)BIl6GFbd!zHk!sV}uB#B;s zmZx$}F1#T~B$!$|&>HYBy-2pHlyY`Qio>7YYKL&scH(6&@hd$!-913lLtn9A{6L8~ zzVb!hK{e%lSyHJ3=JnB*Yc*|s<}IV(W&o>BgFuDKy_Ud2F0+~#IL!U?qcVE})<^9v zJut!Xl7GE*{1KjMto zE-Mz>nB&7^V^^#SH5eNp%?dY6iznBjWUh^E;`?+50eas@e^W$)_iC9Q>IxlgVlruv zX#*6_E44)F47Pvx_-15BM$HS=I=-{q`OyO|Y~!!i%qZ3#$#*R6dTc(-luEy| z2U>^|v@0|VSq;kUx5Thi$VSPsMmbC9Xia5xf3YPP;5mCRPntO|>I>Su=D`ry1|m$% zMlG-~)DogTx63d-U(FV2wp=srNUX6nZz69N^PH;IQ~2nus_Bcb-mmuEahgqUiwrUR z<+E*GZ4#1frGBLfpg^>dOl}VDAYtadf%qw8oPq$hWF7>5%YbHfR=Z zbxCLGSx^$MQqR1MFV@D=D6rPAjtigL#yrI1Z0|SXmhbpSWwa|hbi4Ghj4q$~T^Vuy zV5AeVH+O|Qp3DHDqwir|8f5iQDxwKmk3gtbOn<-=@nx7lGV7d$8V2Ne4zk)aoD`Nm zQ9sks<=tR@5hR9X36PTJM^6(%i`wQ}fi#_?cW?ysw8mjrEW{E`qJ_)T7PoGbyp!y0 z!&InOA0ItVPTi+Q#OeOrQTrYP1xEI)|oNpgmA^X6aWVNyyEny;;P&kyfSFC$1wqF*av zvBKkW@|C4(Z1;G5;~W~4;5{?`(5=_!1cT3q-y_KJR5do#Y;APJ)kNEyt#I$#><)4%m3ZUCx@LC2ZtF zW{mysMVmZcl@ygb&VLylbfGW*28Q6Bn&jRTEp0cvMQ$YbOXucORx&A_!W9;V)Z{-t zIkHl8ZIMqv9nC^QFl$Tg9v?qz4l$B{1zlpL&PcPk;NZVcImssMICFc^#eHgVrF;8} zMD-|eD{4zqpjDsJG>usMd9UT_+!NdBN}{Per*m25f2gC>W#7`Wq}GMcq7fj|F5^g)=fAkv2OePnvVL&6N)*kAjc5WaL?P82B6ieE}S&z~2=>DLJ*n zP#DRmx?EkPCff%tj-ZIKF-I`DkV)rWhvXi7KbiY3HyaEAnU*Dh!Hnb*&FZt{oHpx} zRWt4sJ%$K?(_Tr;Byfe!b5UfF3vh6$F5En5>!a3!ly7-+%-Q z43mC$QY;ZQ%DAuvo2?sa>`^FPuGH)4$s5+>I`H5qZoa}7r}jjLWPCvUr_h%lX{o>5UYn;Wuu&GpkG(|8U;Q{Jwvmj3nn#SD!BZ)I zhki!3;hL6bsB$M6h;(uGDp@I8|JGoC+Tg5cq3B{NDrhoq zSfD=@@5Cp_*SP3jmDtBVUsckcf?j{bnI2Hf)(l?*o`agIXnbcMGD#;=oS`ZksAV=J zsP~$^AfG+SGI`IN*~d@uvWUsB8~wr9i$bCtW?+SLf!x6QB@e%G*4q^ogT|&0${M{0 zsuxX#z03SIjjI)C79H&dS*ynvQ)^%XS%LOOgK@5*fPr^d4i`;6Zc2no=%^fS5|iP zAmuODMNLYq@+v94ijTv~bKH+!((WrrYP<;;v5Yc+?&k(8Xk7T{gGm@#v}NK!lni!` z?37Q&^*6VlU0_I1r6!&B&q~R2rfZ!Wko0Z8`oJI^bajj5^IX*TIzGLF5g}SejIzWa zg+9{pvqT7PF;702Q&@K22YuOhG2F`R6P-x}i!|)4v4aAicwBvz$HVYfyG8mdy~+0M zYpu7I_7khoZLEef5o3R}2dsNP^YBuEW(qcCRZR6_?lm*9ooRB#231SP^m$uL&aD{R zRP(RD+|(IvE!rkNDl~Y;BTJsci8UYiyH*4KQD|=VmacFceU$5w3OJfr;SnOp1}Md4 zo$gB{Bi7PP(dm(~Xr+Fj6NtS9B~X`c5+&3-Y?zxpH0AYY1M%DK=dks#0BbJtT?}CW z*p*@4_T$=Ar591-WbT?ZSMe7=3cF34_{)8!vGOX|LhFRX~L4+O^gU zy4_{j)tT4PP7yNOYH7;PuL`^mOPCx8Eg(;=V+!wlQsuD<3lZCS6VhHRabG|Hu`O8^ z=bW^`s)9jUe{Zg=S$uHJE=JlQ`J?_S@Z0`M`PZRQo{sS2)0P;#`4_J|Z>$Q=<0@jE znap{%5t;vG0Ormn_MnS@}-$S zgBoJx+p!)W@kT>bKg%HNT7$Jky90OhnI>(f-Gt$gmI>v{yNrQeO}=wkGnNE?s5!j} z!3g^u0&>4+)#MH_7aa}3sR=VfJsmCzNau?3$wFfflC(*f4?HcF{%hiV_^61`4K6RO zHJD7??z1TuzhZ)j%05`LLuYV*7Cr+3wF{~+y^1iW{Yz4{tKY1_gi>K8*rJa1p*{bQ zc;g=A!ySINiHv~}h<&FuAphE@ilJt6p2rw?q?QgLPC41h$xF* zZocy|2C^hMd@6sa%5r~hDrWZGVBEG}o8T7kibLO6FP%C%gqHkilWg|tcCL`}mCLRj z1FzDh0)tSdR`B*-RwrviKagL-Igtx?a7?Q%!JvBOtgF$Uru(MIs2)!2QRVcDn9Qg~ z#Jb$ow@E51C0>In8B>l^7ekFR`%9@fZ>m4=lwX>Ws?rzCxMx^7{5|S(URNBaEModH z%lCBdLGc#4{kYH1(V23$X-nE`LVz50lFt)kaM=o4`^QC4BvVw~cbE&We%o?EKl0&d z-m-FbMjR`)=y7B{J2)`aVx`-qpcuyBFht7Xb9_rsp(=M7aE#D!3bW-xVrv-Mn4sjv0tgUiAA3D)og&tF>()@!Z0!*-IlGnG86L*O>9=y4 z`A1BC4F4x1b3FavqP1=}K&b)_>FLSfKv7C*ERA#$O-rd3i2Y?D+)6_^@nCBsMMC@! z_IM#p!$~=jmHYO^W8M@vkA`@e6)@Nch;O>AHET>Lyt}+E_e6DwY1Au_45s-y_f@~n z2gGE8O;zNX!1ByGngC*6eWyF!ylVi~adCNI;?1TTM5b)0kfM&YOHA2Jx+(DB6g?_; z3|=7{!NG-LNiD;vi~@6F6bh|O({-MB;lp{}t+*@L7AcBp_+!_rk5#R^oIQM{7Aki7 z(${ZX5$^(fzR?fPl_Pe;LOQA<0&S@08K!OSqrZw&#ScGx%=^${dT_Hq{+QhqzVcwd z9{L6LThEan{^Ez*{u&gOfWA;F!tO(*WV>es`mct|0GZy47;<0GN`=f+cSgJK1sjN_5 zM`PvZBM)<*md!b5$0X`%{|>{c&>jzSE|0k#OSCQ2wK-DDqI*C7We>i02D!!Z!WXur zN~EsG`9D&7n%L;fgCH8tP_mm`2M~p^CrtwAx);$8WpCmRg%L4HrZbwo= zpt&6R-p2c4;xhiV^R3~c*F^&_a1`6=5rnAwo@ww^8T>l*3{ zvyH(J=C=lF;i}b^LJ#<{1(IR*!;E|{6bDilsbZM}>?(Y2GVN(x0H60o(?lVXklrSx zyV`4n=M|I$Gw&hRiPb7iiRNq2--|le5J)saPfK zaDi-dW<@L2SdYq&g09-|xB+6Ft0jt?xQU!Hgt5v@P2zgRX1iYvlC697(oslqRJw(( z9fre)@Mxd_;Dhmg-YK5EB0IWM{MG*54deeFMdR>){|4n)4xL6hdC&0cUgGeNQBLdX zV`*_3<)l*Q{N*^&hrQ%*FDFp`_J-m#;7Rp&YFmd6{T1-1+&_(SQa=5f%3*=!jl#C ctM6-R{+8HuH3*50&?ygJ?uSj_+Ofm`0f_d>wg3PC literal 0 HcmV?d00001 diff --git a/api/server/services/Files/Documents/crud.js b/api/server/services/Files/Documents/crud.js new file mode 100644 index 000000000000..cb6dc7722237 --- /dev/null +++ b/api/server/services/Files/Documents/crud.js @@ -0,0 +1,99 @@ +const fs = require('fs'); +const { FileSources } = require('librechat-data-provider'); +const mammoth = require('mammoth'); +const XLSX = require('xlsx'); + +/** + * Retrieves a readable stream for a file from local storage. + * + * Throws an Error if it fails to parse. + * + * @param {Express.Multer.File} file - The file. + * @returns {MistralOCRUploadResult} A readable stream of the file. + */ +async function parseDocument({ file }) { + let text; + switch (file.mimetype) { + case 'application/pdf': + text = await pdfToText(file); + break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + text = await wordDocToText(file); + break; + case 'application/vnd.ms-excel': + case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': + text = excelSheetToText(file); + break; + default: + throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); + } + + if (!text?.trim()) { + throw Error('No text found in document'); + } + + return { + filename: file.filename, + bytes: text.length * 4, + filepath: FileSources.document_parser, + text, + images: [], + }; +} + +/** + * Parses PDF, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {Promise} the text contents of the PDF. + */ +async function pdfToText(file) { + // Imported inline so that Jest can test other routes without failing due to loading ESM + const { getDocument } = require('pdfjs-dist/legacy/build/pdf.mjs'); + + const data = new Uint8Array(fs.readFileSync(file.path)); + const pdf = await getDocument({ data }).promise; + + // Extract text from all pages + let fullText = ''; + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + const pageText = textContent.items.map((item) => item.str).join(' '); + fullText += pageText + '\n'; + } + + return fullText; +} + +/** + * Parses Word document, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {Promise} the text contents of the Word document. + */ +async function wordDocToText(file) { + const rawText = await mammoth.extractRawText({ path: file.path }); + return rawText.value; +} + +/** + * Parses Excel sheet, returns text inside. + * + * @param {Express.Multer.File} file - The file. + * @returns {string} the text contents of the XLS/XLSX. + */ +function excelSheetToText(file) { + const workbook = XLSX.readFile(file.path); + + let text = ''; + workbook.SheetNames.forEach((sheetName) => { + const worksheet = workbook.Sheets[sheetName]; + const worksheetAsCsvString = XLSX.utils.sheet_to_csv(worksheet); + text += `${sheetName}:\n${worksheetAsCsvString}\n`; + }); + + return text; +} + +module.exports = { parseDocument }; diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index 2ad526194b78..0f669824a035 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -51,6 +51,7 @@ const { const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI'); const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code'); const { uploadVectors, deleteVectors } = require('./VectorDB'); +const { parseDocument } = require('~/server/services/Files/Documents/crud'); /** * Firebase Storage Strategy Functions @@ -246,6 +247,26 @@ const vertexMistralOCRStrategy = () => ({ handleFileUpload: uploadGoogleVertexMistralOCR, }); +const documentParserStrategy = () => ({ + /** @type {typeof saveFileFromURL | null} */ + saveURL: null, + /** @type {typeof getLocalFileURL | null} */ + getFileURL: null, + /** @type {typeof saveLocalBuffer | null} */ + saveBuffer: null, + /** @type {typeof processLocalAvatar | null} */ + processAvatar: null, + /** @type {typeof uploadLocalImage | null} */ + handleImageUpload: null, + /** @type {typeof prepareImagesLocal | null} */ + prepareImagePayload: null, + /** @type {typeof deleteLocalFile | null} */ + deleteFile: null, + /** @type {typeof getLocalFileStream | null} */ + getDownloadStream: null, + handleFileUpload: parseDocument, +}); + // Strategy Selector const getStrategyFunctions = (fileSource) => { if (fileSource === FileSources.firebase) { @@ -270,6 +291,8 @@ const getStrategyFunctions = (fileSource) => { return azureMistralOCRStrategy(); } else if (fileSource === FileSources.vertexai_mistral_ocr) { return vertexMistralOCRStrategy(); + } else if (fileSource === FileSources.document_parser) { + return documentParserStrategy(); } else if (fileSource === FileSources.text) { return localStrategy(); // Text files use local strategy } else { diff --git a/package-lock.json b/package-lock.json index 4bca60d4352e..cd6cc27138b2 100644 --- a/package-lock.json +++ b/package-lock.json @@ -95,6 +95,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -117,6 +118,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -125,6 +127,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { @@ -11380,6 +11383,256 @@ "sparse-bitfield": "^3.0.3" } }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz", + "integrity": "sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==", + "license": "MIT", + "optional": true, + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.88", + "@napi-rs/canvas-darwin-arm64": "0.1.88", + "@napi-rs/canvas-darwin-x64": "0.1.88", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.88", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.88", + "@napi-rs/canvas-linux-arm64-musl": "0.1.88", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-musl": "0.1.88", + "@napi-rs/canvas-win32-arm64-msvc": "0.1.88", + "@napi-rs/canvas-win32-x64-msvc": "0.1.88" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.88.tgz", + "integrity": "sha512-KEaClPnZuVxJ8smUWjV1wWFkByBO/D+vy4lN+Dm5DFH514oqwukxKGeck9xcKJhaWJGjfruGmYGiwRe//+/zQQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.88.tgz", + "integrity": "sha512-Xgywz0dDxOKSgx3eZnK85WgGMmGrQEW7ZLA/E7raZdlEE+xXCozobgqz2ZvYigpB6DJFYkqnwHjqCOTSDGlFdg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.88.tgz", + "integrity": "sha512-Yz4wSCIQOUgNucgk+8NFtQxQxZV5NO8VKRl9ePKE6XoNyNVC8JDqtvhh3b3TPqKK8W5p2EQpAr1rjjm0mfBxdg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.88.tgz", + "integrity": "sha512-9gQM2SlTo76hYhxHi2XxWTAqpTOb+JtxMPEIr+H5nAhHhyEtNmTSDRtz93SP7mGd2G3Ojf2oF5tP9OdgtgXyKg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.88.tgz", + "integrity": "sha512-7qgaOBMXuVRk9Fzztzr3BchQKXDxGbY+nwsovD3I/Sx81e+sX0ReEDYHTItNb0Je4NHbAl7D0MKyd4SvUc04sg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.88.tgz", + "integrity": "sha512-kYyNrUsHLkoGHBc77u4Unh067GrfiCUMbGHC2+OTxbeWfZkPt2o32UOQkhnSswKd9Fko/wSqqGkY956bIUzruA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.88.tgz", + "integrity": "sha512-HVuH7QgzB0yavYdNZDRyAsn/ejoXB0hn8twwFnOqUbCCdkV+REna7RXjSR7+PdfW0qMQ2YYWsLvVBT5iL/mGpw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.88.tgz", + "integrity": "sha512-hvcvKIcPEQrvvJtJnwD35B3qk6umFJ8dFIr8bSymfrSMem0EQsfn1ztys8ETIFndTwdNWJKWluvxztA41ivsEw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.88.tgz", + "integrity": "sha512-eSMpGYY2xnZSQ6UxYJ6plDboxq4KeJ4zT5HaVkUnbObNN6DlbJe0Mclh3wifAmquXfrlgTZt6zhHsUgz++AK6g==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-arm64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.88.tgz", + "integrity": "sha512-qcIFfEgHrchyYqRrxsCeTQgpJZ/GqHiqPcU/Fvw/ARVlQeDX1VyFH+X+0gCR2tca6UJrq96vnW+5o7buCq+erA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.88.tgz", + "integrity": "sha512-ROVqbfS4QyZxYkqmaIBBpbz/BQvAR+05FXM5PAtTYVc0uyY8Y4BHJSMdGAaMf6TdIVRsQsiq+FG/dH9XhvWCFQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -21601,6 +21854,12 @@ "node": ">=8" } }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==", + "license": "MIT" + }, "node_modules/bn.js": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-5.2.2.tgz", @@ -22854,7 +23113,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", - "dev": true, "license": "MIT" }, "node_modules/cors": { @@ -24239,6 +24497,12 @@ "dev": true, "license": "MIT" }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==", + "license": "BSD-2-Clause" + }, "node_modules/dlv": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", @@ -24375,6 +24639,15 @@ "resolved": "https://registry.npmjs.org/downloadjs/-/downloadjs-1.4.7.tgz", "integrity": "sha512-LN1gO7+u9xjU5oEScGFKvXhYf7Y/empUIIEAGBs1LzUq/rg5duiDrkuH5A2lQGd5jfMOb9X9usDa2oVXwJ0U/Q==" }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "license": "BSD", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -27588,6 +27861,12 @@ "integrity": "sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==", "dev": true }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/import-cwd": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/import-cwd/-/import-cwd-3.0.0.tgz", @@ -30052,6 +30331,45 @@ "node": ">=4.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, "node_modules/jwa": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", @@ -30354,6 +30672,15 @@ "resolved": "packages/data-provider", "link": true }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lilconfig": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", @@ -30997,6 +31324,17 @@ "loose-envify": "cli.js" } }, + "node_modules/lop": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.2.tgz", + "integrity": "sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==", + "license": "BSD-2-Clause", + "dependencies": { + "duck": "^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lowlight": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-2.9.0.tgz", @@ -31106,6 +31444,48 @@ "tmpl": "1.0.5" } }, + "node_modules/mammoth": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.11.0.tgz", + "integrity": "sha512-BcEqqY/BOwIcI1iR5tqyVlqc3KIaMRa4egSoK83YAVrBf6+yqdAAbtUcFDCWX8Zef8/fgNZ6rl4VUv+vVX8ddQ==", + "license": "BSD-2-Clause", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.2", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/mammoth/node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "license": "MIT", + "engines": { + "node": ">=4.0" + } + }, "node_modules/markdown-table": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", @@ -33522,6 +33902,12 @@ "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", "dev": true }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==", + "license": "BSD-2-Clause" + }, "node_modules/optionator": { "version": "0.9.3", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz", @@ -33674,7 +34060,6 @@ "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", - "dev": true, "license": "(MIT AND Zlib)" }, "node_modules/parent-module": { @@ -33953,7 +34338,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "engines": { "node": ">=0.10.0" } @@ -34032,6 +34416,18 @@ "node": ">= 0.10" } }, + "node_modules/pdfjs-dist": { + "version": "5.4.530", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.530.tgz", + "integrity": "sha512-r1hWsSIGGmyYUAHR26zSXkxYWLXLMd6AwqcaFYG9YUZ0GBf5GvcjJSeo512tabM4GYFhxhl5pMCmPr7Q72Rq2Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.16.0 || >=22.3.0" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.84" + } + }, "node_modules/peek-readable": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-5.0.0.tgz", @@ -35713,7 +36109,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", - "dev": true, "license": "MIT" }, "node_modules/promise.series": { @@ -38176,7 +38571,6 @@ "version": "1.0.5", "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", - "dev": true, "license": "MIT" }, "node_modules/setprototypeof": { @@ -38441,7 +38835,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", - "dev": true, "license": "BSD-3-Clause" }, "node_modules/sse.js": { @@ -40249,6 +40642,12 @@ "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==", "dev": true }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "license": "MIT" + }, "node_modules/undici": { "version": "7.20.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.20.0.tgz", @@ -41904,6 +42303,18 @@ } } }, + "node_modules/xlsx": { + "version": "0.20.3", + "resolved": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "integrity": "sha512-oLDq3jw7AcLqKWH2AhCpVTZl8mf6X2YReP+Neh0SJUzV/BdZYjth94tG5toiMB1PPrYtxOCfaoUCkvtuH+3AJA==", + "license": "Apache-2.0", + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/xml": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz", diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 82d477e54e96..64fc99b0ebb9 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -820,6 +820,7 @@ export enum OCRStrategy { CUSTOM_OCR = 'custom_ocr', AZURE_MISTRAL_OCR = 'azure_mistral_ocr', VERTEXAI_MISTRAL_OCR = 'vertexai_mistral_ocr', + DOCUMENT_PARSER = 'document_parser', } export enum SearchCategories { diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index ec42520bc054..1eb8c200d6d9 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -13,6 +13,7 @@ export enum FileSources { azure_mistral_ocr = 'azure_mistral_ocr', vertexai_mistral_ocr = 'vertexai_mistral_ocr', text = 'text', + document_parser = 'document_parser', } export const checkOpenAIStorage = (source: string) => From 6224291363fed432bdca53f0b414c659705a3e6f Mon Sep 17 00:00:00 2001 From: Dan Lew Date: Thu, 12 Feb 2026 09:38:44 -0600 Subject: [PATCH 2/9] fix: applied Copilot code review suggestions - Properly calculate length of text based on UTF8. - Avoid issues with loading / blocking PDF parsing. --- .../services/Files/Documents/__tests__/documents.spec.js | 4 ++-- api/server/services/Files/Documents/crud.js | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/api/server/services/Files/Documents/__tests__/documents.spec.js b/api/server/services/Files/Documents/__tests__/documents.spec.js index 55d926c53816..b34d61595cb6 100644 --- a/api/server/services/Files/Documents/__tests__/documents.spec.js +++ b/api/server/services/Files/Documents/__tests__/documents.spec.js @@ -12,7 +12,7 @@ describe('Document Parser', () => { const document = await parseDocument({ file }); expect(document).toEqual({ - bytes: 116, + bytes: 29, filename: 'sample.docx', filepath: 'document_parser', images: [], @@ -30,7 +30,7 @@ describe('Document Parser', () => { const document = await parseDocument({ file }); expect(document).toEqual({ - bytes: 264, + bytes: 66, filename: 'sample.xlsx', filepath: 'document_parser', images: [], diff --git a/api/server/services/Files/Documents/crud.js b/api/server/services/Files/Documents/crud.js index cb6dc7722237..7291ad70e0f1 100644 --- a/api/server/services/Files/Documents/crud.js +++ b/api/server/services/Files/Documents/crud.js @@ -34,7 +34,7 @@ async function parseDocument({ file }) { return { filename: file.filename, - bytes: text.length * 4, + bytes: Buffer.byteLength(text, 'utf8'), filepath: FileSources.document_parser, text, images: [], @@ -49,9 +49,9 @@ async function parseDocument({ file }) { */ async function pdfToText(file) { // Imported inline so that Jest can test other routes without failing due to loading ESM - const { getDocument } = require('pdfjs-dist/legacy/build/pdf.mjs'); + const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs'); - const data = new Uint8Array(fs.readFileSync(file.path)); + const data = new Uint8Array(await fs.promises.readFile(file.path)); const pdf = await getDocument({ data }).promise; // Extract text from all pages From 0a504a84af8bb46f42cbcd86864685ab4e33d05a Mon Sep 17 00:00:00 2001 From: Dan Lew Date: Thu, 12 Feb 2026 09:42:40 -0600 Subject: [PATCH 3/9] fix: improved docs on parseDocument() --- api/server/services/Files/Documents/crud.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/api/server/services/Files/Documents/crud.js b/api/server/services/Files/Documents/crud.js index 7291ad70e0f1..14239c8394d7 100644 --- a/api/server/services/Files/Documents/crud.js +++ b/api/server/services/Files/Documents/crud.js @@ -4,12 +4,12 @@ const mammoth = require('mammoth'); const XLSX = require('xlsx'); /** - * Retrieves a readable stream for a file from local storage. + * Parses an uploaded document and extracts its text content and metadata. * - * Throws an Error if it fails to parse. + * Throws an Error if it fails to parse or no text is found. * - * @param {Express.Multer.File} file - The file. - * @returns {MistralOCRUploadResult} A readable stream of the file. + * @param {Express.Multer.File} file - The uploaded file to parse. + * @returns {Promise} A readable stream of the file. */ async function parseDocument({ file }) { let text; From cab5826377abc20e9155c126b331dd8271e9177f Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 09:09:59 -0500 Subject: [PATCH 4/9] chore: move to packages/api for TS support --- api/server/services/Files/strategies.js | 2 +- package-lock.json | 24 +++++-- packages/api/package.json | 8 ++- .../api/src/files/documents/crud.spec.ts | 12 ++-- .../api/src/files/documents/crud.ts | 62 +++++++----------- .../api/src/files/documents}/empty.docx | Bin .../api/src/files/documents}/sample.docx | Bin .../api/src/files/documents}/sample.xlsx | Bin packages/api/src/files/index.ts | 1 + 9 files changed, 59 insertions(+), 50 deletions(-) rename api/server/services/Files/Documents/__tests__/documents.spec.js => packages/api/src/files/documents/crud.spec.ts (90%) rename api/server/services/Files/Documents/crud.js => packages/api/src/files/documents/crud.ts (58%) rename {api/server/services/Files/Documents/__tests__ => packages/api/src/files/documents}/empty.docx (100%) rename {api/server/services/Files/Documents/__tests__ => packages/api/src/files/documents}/sample.docx (100%) rename {api/server/services/Files/Documents/__tests__ => packages/api/src/files/documents}/sample.xlsx (100%) diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index 0f669824a035..25341b571563 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -1,5 +1,6 @@ const { FileSources } = require('librechat-data-provider'); const { + parseDocument, uploadMistralOCR, uploadAzureMistralOCR, uploadGoogleVertexMistralOCR, @@ -51,7 +52,6 @@ const { const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI'); const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code'); const { uploadVectors, deleteVectors } = require('./VectorDB'); -const { parseDocument } = require('~/server/services/Files/Documents/crud'); /** * Firebase Storage Strategy Functions diff --git a/package-lock.json b/package-lock.json index cd6cc27138b2..04f8251dd6ae 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33419,6 +33419,13 @@ "integrity": "sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==", "dev": true }, + "node_modules/node-readable-to-web-readable-stream": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/node-readable-to-web-readable-stream/-/node-readable-to-web-readable-stream-0.4.2.tgz", + "integrity": "sha512-/cMZNI34v//jUTrI+UIo4ieHAB5EZRY/+7OmXZgBxaWBMcW2tGdceIw06RFxWxrKZ5Jp3sI2i5TsRo+CBhtVLQ==", + "license": "MIT", + "optional": true + }, "node_modules/node-releases": { "version": "2.0.27", "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz", @@ -34417,15 +34424,16 @@ } }, "node_modules/pdfjs-dist": { - "version": "5.4.530", - "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.530.tgz", - "integrity": "sha512-r1hWsSIGGmyYUAHR26zSXkxYWLXLMd6AwqcaFYG9YUZ0GBf5GvcjJSeo512tabM4GYFhxhl5pMCmPr7Q72Rq2Q==", + "version": "5.4.624", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.624.tgz", + "integrity": "sha512-sm6TxKTtWv1Oh6n3C6J6a8odejb5uO4A4zo/2dgkHuC0iu8ZMAXOezEODkVaoVp8nX1Xzr+0WxFJJmUr45hQzg==", "license": "Apache-2.0", "engines": { "node": ">=20.16.0 || >=22.3.0" }, "optionalDependencies": { - "@napi-rs/canvas": "^0.1.84" + "@napi-rs/canvas": "^0.1.88", + "node-readable-to-web-readable-stream": "^0.4.2" } }, "node_modules/peek-readable": { @@ -42582,12 +42590,15 @@ "jest": "^30.2.0", "jest-junit": "^16.0.0", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mongodb": "^6.14.2", + "pdfjs-dist": "^5.4.624", "rimraf": "^6.1.2", "rollup": "^4.22.4", "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", - "typescript": "^5.0.4" + "typescript": "^5.0.4", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" }, "peerDependencies": { "@anthropic-ai/vertex-sdk": "^0.14.3", @@ -42618,13 +42629,16 @@ "keyv": "^5.3.2", "keyv-file": "^5.1.2", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "memorystore": "^1.6.7", "mongoose": "^8.12.1", "node-fetch": "2.7.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "tiktoken": "^1.0.15", "undici": "^7.18.2", + "xlsx": "*", "zod": "^3.22.4" } }, diff --git a/packages/api/package.json b/packages/api/package.json index 67cb5df81673..6aae71605ca7 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -67,12 +67,15 @@ "jest": "^30.2.0", "jest-junit": "^16.0.0", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mongodb": "^6.14.2", + "pdfjs-dist": "^5.4.624", "rimraf": "^6.1.2", "rollup": "^4.22.4", "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", - "typescript": "^5.0.4" + "typescript": "^5.0.4", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" }, "publishConfig": { "registry": "https://registry.npmjs.org/" @@ -106,13 +109,16 @@ "keyv": "^5.3.2", "keyv-file": "^5.1.2", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "memorystore": "^1.6.7", "mongoose": "^8.12.1", "node-fetch": "2.7.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "tiktoken": "^1.0.15", "undici": "^7.18.2", + "xlsx": "*", "zod": "^3.22.4" } } diff --git a/api/server/services/Files/Documents/__tests__/documents.spec.js b/packages/api/src/files/documents/crud.spec.ts similarity index 90% rename from api/server/services/Files/Documents/__tests__/documents.spec.js rename to packages/api/src/files/documents/crud.spec.ts index b34d61595cb6..5f1508d39944 100644 --- a/api/server/services/Files/Documents/__tests__/documents.spec.js +++ b/packages/api/src/files/documents/crud.spec.ts @@ -1,5 +1,5 @@ -const path = require('path'); -const { parseDocument } = require('~/server/services/Files/Documents/crud'); +import path from 'path'; +import { parseDocument } from './crud'; describe('Document Parser', () => { test('parseDocument() parses text from docx', async () => { @@ -7,7 +7,7 @@ describe('Document Parser', () => { filename: 'sample.docx', path: path.join(__dirname, 'sample.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - }; + } as Express.Multer.File; const document = await parseDocument({ file }); @@ -25,7 +25,7 @@ describe('Document Parser', () => { filename: 'sample.xlsx', path: path.join(__dirname, 'sample.xlsx'), mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - }; + } as Express.Multer.File; const document = await parseDocument({ file }); @@ -43,7 +43,7 @@ describe('Document Parser', () => { filename: 'nonexistent.file', path: path.join(__dirname, 'nonexistent.file'), mimetype: 'application/invalid', - }; + } as Express.Multer.File; await expect(parseDocument({ file })).rejects.toThrow( 'Unsupported file type in document parser: application/invalid', @@ -55,7 +55,7 @@ describe('Document Parser', () => { filename: 'empty.docx', path: path.join(__dirname, 'empty.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - }; + } as Express.Multer.File; await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); diff --git a/api/server/services/Files/Documents/crud.js b/packages/api/src/files/documents/crud.ts similarity index 58% rename from api/server/services/Files/Documents/crud.js rename to packages/api/src/files/documents/crud.ts index 14239c8394d7..469a44ff1def 100644 --- a/api/server/services/Files/Documents/crud.js +++ b/packages/api/src/files/documents/crud.ts @@ -1,18 +1,21 @@ -const fs = require('fs'); -const { FileSources } = require('librechat-data-provider'); -const mammoth = require('mammoth'); -const XLSX = require('xlsx'); +import * as fs from 'fs'; +import mammoth from 'mammoth'; +import * as XLSX from 'xlsx'; +import { FileSources } from 'librechat-data-provider'; +import type { TextItem } from 'pdfjs-dist/types/src/display/api'; +import type { MistralOCRUploadResult } from '~/types'; /** * Parses an uploaded document and extracts its text content and metadata. * * Throws an Error if it fails to parse or no text is found. - * - * @param {Express.Multer.File} file - The uploaded file to parse. - * @returns {Promise} A readable stream of the file. */ -async function parseDocument({ file }) { - let text; +export async function parseDocument({ + file, +}: { + file: Express.Multer.File; +}): Promise { + let text: string; switch (file.mimetype) { case 'application/pdf': text = await pdfToText(file); @@ -29,7 +32,7 @@ async function parseDocument({ file }) { } if (!text?.trim()) { - throw Error('No text found in document'); + throw new Error('No text found in document'); } return { @@ -41,59 +44,44 @@ async function parseDocument({ file }) { }; } -/** - * Parses PDF, returns text inside. - * - * @param {Express.Multer.File} file - The file. - * @returns {Promise} the text contents of the PDF. - */ -async function pdfToText(file) { +/** Parses PDF, returns text inside. */ +async function pdfToText(file: Express.Multer.File): Promise { // Imported inline so that Jest can test other routes without failing due to loading ESM const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs'); const data = new Uint8Array(await fs.promises.readFile(file.path)); const pdf = await getDocument({ data }).promise; - // Extract text from all pages let fullText = ''; for (let i = 1; i <= pdf.numPages; i++) { const page = await pdf.getPage(i); const textContent = await page.getTextContent(); - const pageText = textContent.items.map((item) => item.str).join(' '); + const pageText = textContent.items + .filter((item): item is TextItem => !('type' in item)) + .map((item) => item.str) + .join(' '); fullText += pageText + '\n'; } return fullText; } -/** - * Parses Word document, returns text inside. - * - * @param {Express.Multer.File} file - The file. - * @returns {Promise} the text contents of the Word document. - */ -async function wordDocToText(file) { +/** Parses Word document, returns text inside. */ +async function wordDocToText(file: Express.Multer.File): Promise { const rawText = await mammoth.extractRawText({ path: file.path }); return rawText.value; } -/** - * Parses Excel sheet, returns text inside. - * - * @param {Express.Multer.File} file - The file. - * @returns {string} the text contents of the XLS/XLSX. - */ -function excelSheetToText(file) { +/** Parses Excel sheet, returns text inside. */ +function excelSheetToText(file: Express.Multer.File): string { const workbook = XLSX.readFile(file.path); let text = ''; - workbook.SheetNames.forEach((sheetName) => { + for (const sheetName of workbook.SheetNames) { const worksheet = workbook.Sheets[sheetName]; const worksheetAsCsvString = XLSX.utils.sheet_to_csv(worksheet); text += `${sheetName}:\n${worksheetAsCsvString}\n`; - }); + } return text; } - -module.exports = { parseDocument }; diff --git a/api/server/services/Files/Documents/__tests__/empty.docx b/packages/api/src/files/documents/empty.docx similarity index 100% rename from api/server/services/Files/Documents/__tests__/empty.docx rename to packages/api/src/files/documents/empty.docx diff --git a/api/server/services/Files/Documents/__tests__/sample.docx b/packages/api/src/files/documents/sample.docx similarity index 100% rename from api/server/services/Files/Documents/__tests__/sample.docx rename to packages/api/src/files/documents/sample.docx diff --git a/api/server/services/Files/Documents/__tests__/sample.xlsx b/packages/api/src/files/documents/sample.xlsx similarity index 100% rename from api/server/services/Files/Documents/__tests__/sample.xlsx rename to packages/api/src/files/documents/sample.xlsx diff --git a/packages/api/src/files/index.ts b/packages/api/src/files/index.ts index 3aedc5ba9d5c..707f2ef7fb8a 100644 --- a/packages/api/src/files/index.ts +++ b/packages/api/src/files/index.ts @@ -1,5 +1,6 @@ export * from './audio'; export * from './context'; +export * from './documents/crud'; export * from './encode'; export * from './filter'; export * from './mistral/crud'; From f91a40277e8b06bb5912e496f0620038d754f65a Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 11:44:52 -0500 Subject: [PATCH 5/9] refactor: make document processing the default ocr strategy - Introduced support for additional document types in the OCR strategy, including PDF, DOCX, and XLS/XLSX. - Updated the file upload handling to dynamically select the appropriate parsing strategy based on the file type. - Refactored the document parsing functions to use asynchronous imports for improved performance and maintainability. --- api/server/services/Files/process.js | 23 ++++++++++++++++++----- packages/api/src/files/documents/crud.ts | 14 +++++++------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index 30b47f2e52b7..379ff3451058 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -553,17 +553,30 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { const fileConfig = mergeFileConfig(appConfig.fileConfig); - const shouldUseOCR = + const documentParserMimeTypes = [ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.ms-excel', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + ]; + + const shouldUseConfiguredOCR = appConfig?.ocr != null && fileConfig.checkType(file.mimetype, fileConfig.ocr?.supportedMimeTypes || []); - if (shouldUseOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { + const shouldUseDocumentParser = + !shouldUseConfiguredOCR && documentParserMimeTypes.includes(file.mimetype); + + const shouldUseOCR = shouldUseConfiguredOCR || shouldUseDocumentParser; + + if (shouldUseConfiguredOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { throw new Error('OCR capability is not enabled for Agents'); } else if (shouldUseOCR) { try { - const { handleFileUpload: uploadOCR } = getStrategyFunctions( - appConfig?.ocr?.strategy ?? FileSources.mistral_ocr, - ); + const ocrStrategy = shouldUseConfiguredOCR + ? (appConfig?.ocr?.strategy ?? FileSources.document_parser) + : FileSources.document_parser; + const { handleFileUpload: uploadOCR } = getStrategyFunctions(ocrStrategy); const { text, bytes, diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index 469a44ff1def..de5d4e2f17bd 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -1,6 +1,4 @@ import * as fs from 'fs'; -import mammoth from 'mammoth'; -import * as XLSX from 'xlsx'; import { FileSources } from 'librechat-data-provider'; import type { TextItem } from 'pdfjs-dist/types/src/display/api'; import type { MistralOCRUploadResult } from '~/types'; @@ -25,7 +23,7 @@ export async function parseDocument({ break; case 'application/vnd.ms-excel': case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - text = excelSheetToText(file); + text = await excelSheetToText(file); break; default: throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); @@ -68,18 +66,20 @@ async function pdfToText(file: Express.Multer.File): Promise { /** Parses Word document, returns text inside. */ async function wordDocToText(file: Express.Multer.File): Promise { - const rawText = await mammoth.extractRawText({ path: file.path }); + const { extractRawText } = await import('mammoth'); + const rawText = await extractRawText({ path: file.path }); return rawText.value; } /** Parses Excel sheet, returns text inside. */ -function excelSheetToText(file: Express.Multer.File): string { - const workbook = XLSX.readFile(file.path); +async function excelSheetToText(file: Express.Multer.File): Promise { + const { readFile, utils } = await import('xlsx'); + const workbook = readFile(file.path); let text = ''; for (const sheetName of workbook.SheetNames) { const worksheet = workbook.Sheets[sheetName]; - const worksheetAsCsvString = XLSX.utils.sheet_to_csv(worksheet); + const worksheetAsCsvString = utils.sheet_to_csv(worksheet); text += `${sheetName}:\n${worksheetAsCsvString}\n`; } From d407655ae6f1576ba1d86f6b610530bc99b902a6 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 11:45:13 -0500 Subject: [PATCH 6/9] test: add unit tests for processAgentFileUpload functionality - Introduced a new test suite for the processAgentFileUpload function in process.spec.js. - Implemented various test cases to validate OCR strategy selection based on file types, including PDF, DOCX, XLSX, and XLS. - Mocked dependencies to ensure isolated testing of file upload handling and strategy selection logic. - Enhanced coverage for scenarios involving OCR capability checks and default strategy fallbacks. --- api/server/services/Files/process.spec.js | 231 ++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 api/server/services/Files/process.spec.js diff --git a/api/server/services/Files/process.spec.js b/api/server/services/Files/process.spec.js new file mode 100644 index 000000000000..e1b2641aba8f --- /dev/null +++ b/api/server/services/Files/process.spec.js @@ -0,0 +1,231 @@ +jest.mock('uuid', () => ({ v4: jest.fn(() => 'mock-uuid') })); + +jest.mock('@librechat/data-schemas', () => ({ + logger: { warn: jest.fn(), debug: jest.fn(), error: jest.fn() }, +})); + +jest.mock('@librechat/agents', () => ({ + EnvVar: { CODE_API_KEY: 'CODE_API_KEY' }, +})); + +jest.mock('@librechat/api', () => ({ + sanitizeFilename: jest.fn((n) => n), + parseText: jest.fn().mockResolvedValue({ text: '', bytes: 0 }), + processAudioFile: jest.fn(), +})); + +jest.mock('librechat-data-provider', () => ({ + ...jest.requireActual('librechat-data-provider'), + mergeFileConfig: jest.fn(), +})); + +jest.mock('~/server/services/Files/images', () => ({ + convertImage: jest.fn(), + resizeAndConvert: jest.fn(), + resizeImageBuffer: jest.fn(), +})); + +jest.mock('~/server/controllers/assistants/v2', () => ({ + addResourceFileId: jest.fn(), + deleteResourceFileId: jest.fn(), +})); + +jest.mock('~/models/Agent', () => ({ + addAgentResourceFile: jest.fn().mockResolvedValue({}), + removeAgentResourceFiles: jest.fn(), +})); + +jest.mock('~/server/controllers/assistants/helpers', () => ({ + getOpenAIClient: jest.fn(), +})); + +jest.mock('~/server/services/Tools/credentials', () => ({ + loadAuthValues: jest.fn(), +})); + +jest.mock('~/models', () => ({ + createFile: jest.fn().mockResolvedValue({ file_id: 'created-file-id' }), + updateFileUsage: jest.fn(), + deleteFiles: jest.fn(), +})); + +jest.mock('~/server/utils/getFileStrategy', () => ({ + getFileStrategy: jest.fn().mockReturnValue('local'), +})); + +jest.mock('~/server/services/Config', () => ({ + checkCapability: jest.fn().mockResolvedValue(true), +})); + +jest.mock('~/server/utils/queue', () => ({ + LB_QueueAsyncCall: jest.fn(), +})); + +jest.mock('~/server/services/Files/strategies', () => ({ + getStrategyFunctions: jest.fn(), +})); + +jest.mock('~/server/utils', () => ({ + determineFileType: jest.fn(), +})); + +jest.mock('~/server/services/Files/Audio/STTService', () => ({ + STTService: { getInstance: jest.fn() }, +})); + +const { EToolResources, FileSources, AgentCapabilities } = require('librechat-data-provider'); +const { mergeFileConfig } = require('librechat-data-provider'); +const { checkCapability } = require('~/server/services/Config'); +const { getStrategyFunctions } = require('~/server/services/Files/strategies'); +const { processAgentFileUpload } = require('./process'); + +const PDF_MIME = 'application/pdf'; +const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; +const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'; +const XLS_MIME = 'application/vnd.ms-excel'; + +const makeReq = ({ mimetype = PDF_MIME, ocrConfig = null } = {}) => ({ + user: { id: 'user-123' }, + file: { + path: '/tmp/upload.bin', + originalname: 'upload.bin', + filename: 'upload-uuid.bin', + mimetype, + }, + body: { model: 'gpt-4o' }, + config: { + fileConfig: {}, + fileStrategy: 'local', + ocr: ocrConfig, + }, +}); + +const makeMetadata = () => ({ + agent_id: 'agent-abc', + tool_resource: EToolResources.context, + file_id: 'file-uuid-123', +}); + +const mockRes = { + status: jest.fn().mockReturnThis(), + json: jest.fn().mockReturnValue({}), +}; + +const makeFileConfig = ({ ocrSupportedMimeTypes = [] } = {}) => ({ + checkType: (mime, types) => (types ?? []).includes(mime), + ocr: { supportedMimeTypes: ocrSupportedMimeTypes }, + stt: { supportedMimeTypes: [] }, + text: { supportedMimeTypes: [] }, +}); + +describe('processAgentFileUpload', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockRes.status.mockReturnThis(); + mockRes.json.mockReturnValue({}); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest + .fn() + .mockResolvedValue({ text: 'extracted text', bytes: 42, filepath: 'doc://result' }), + }); + mergeFileConfig.mockReturnValue(makeFileConfig()); + }); + + describe('OCR strategy selection', () => { + test.each([ + ['PDF', PDF_MIME], + ['DOCX', DOCX_MIME], + ['XLSX', XLSX_MIME], + ['XLS', XLS_MIME], + ])('uses document_parser automatically for %s when no OCR is configured', async (_, mime) => { + mergeFileConfig.mockReturnValue(makeFileConfig()); + const req = makeReq({ mimetype: mime, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('does not check OCR capability when using automatic document_parser fallback', async () => { + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('uses the configured OCR strategy when OCR is set up for the file type', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.mistral_ocr); + }); + + test('uses document_parser as default when OCR is configured but no strategy is specified', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { supportedMimeTypes: [PDF_MIME] }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('throws when configured OCR capability is not enabled for the agent', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + checkCapability.mockResolvedValue(false); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow('OCR capability is not enabled for Agents'); + }); + + test('uses document_parser (no capability check) when OCR capability returns false but no OCR config', async () => { + checkCapability.mockResolvedValue(false); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('uses document_parser when OCR is configured but the file type is not in OCR supported types', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: DOCX_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + expect(getStrategyFunctions).not.toHaveBeenCalledWith(FileSources.mistral_ocr); + }); + + test('does not invoke any OCR strategy for unsupported MIME types without OCR config', async () => { + const req = makeReq({ mimetype: 'text/plain', ocrConfig: null }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow('File type text/plain is not supported for text parsing.'); + + expect(getStrategyFunctions).not.toHaveBeenCalled(); + }); + }); +}); From dc33556fac5df158da31be70822dd2e6d5819dd2 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 13:13:45 -0500 Subject: [PATCH 7/9] chore: update pdfjs-dist version and enhance document parsing tests - Bumped pdfjs-dist dependency to version 5.4.624 in both api and packages/api. - Refactored document parsing tests to use 'originalname' instead of 'filename' for file objects. - Added a new test case for parsing XLS files to improve coverage of document types supported by the parser. - Introduced a sample XLS file for testing purposes. --- api/package.json | 2 +- packages/api/package.json | 3 +- packages/api/src/files/documents/crud.spec.ts | 26 +++++++++++++++--- packages/api/src/files/documents/crud.ts | 2 +- packages/api/src/files/documents/sample.xls | Bin 0 -> 3584 bytes 5 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 packages/api/src/files/documents/sample.xls diff --git a/api/package.json b/api/package.json index 794457921a66..9951b6f01a1c 100644 --- a/api/package.json +++ b/api/package.json @@ -103,7 +103,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", - "pdfjs-dist": "^5.4.530", + "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", diff --git a/packages/api/package.json b/packages/api/package.json index 6aae71605ca7..6df880e0bfee 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -114,11 +114,10 @@ "memorystore": "^1.6.7", "mongoose": "^8.12.1", "node-fetch": "2.7.0", - "pdfjs-dist": "^5.4.530", + "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "tiktoken": "^1.0.15", "undici": "^7.18.2", - "xlsx": "*", "zod": "^3.22.4" } } diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index 5f1508d39944..3b9e1636ef79 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -4,7 +4,7 @@ import { parseDocument } from './crud'; describe('Document Parser', () => { test('parseDocument() parses text from docx', async () => { const file = { - filename: 'sample.docx', + originalname: 'sample.docx', path: path.join(__dirname, 'sample.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', } as Express.Multer.File; @@ -22,7 +22,7 @@ describe('Document Parser', () => { test('parseDocument() parses text from xlsx', async () => { const file = { - filename: 'sample.xlsx', + originalname: 'sample.xlsx', path: path.join(__dirname, 'sample.xlsx'), mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', } as Express.Multer.File; @@ -38,9 +38,27 @@ describe('Document Parser', () => { }); }); + test('parseDocument() parses text from xls', async () => { + const file = { + originalname: 'sample.xls', + path: path.join(__dirname, 'sample.xls'), + mimetype: 'application/vnd.ms-excel', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 31, + filename: 'sample.xls', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\n', + }); + }); + test('parseDocument() throws error for unhandled document type', async () => { const file = { - filename: 'nonexistent.file', + originalname: 'nonexistent.file', path: path.join(__dirname, 'nonexistent.file'), mimetype: 'application/invalid', } as Express.Multer.File; @@ -52,7 +70,7 @@ describe('Document Parser', () => { test('parseDocument() throws error for empty document', async () => { const file = { - filename: 'empty.docx', + originalname: 'empty.docx', path: path.join(__dirname, 'empty.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', } as Express.Multer.File; diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index de5d4e2f17bd..f2d45644d4f3 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -34,7 +34,7 @@ export async function parseDocument({ } return { - filename: file.filename, + filename: file.originalname, bytes: Buffer.byteLength(text, 'utf8'), filepath: FileSources.document_parser, text, diff --git a/packages/api/src/files/documents/sample.xls b/packages/api/src/files/documents/sample.xls new file mode 100644 index 0000000000000000000000000000000000000000..d5976b0816029f46988fe13ed6d920278a7d599e GIT binary patch literal 3584 zcmeHK&1+LZ5dZDVOZt-9HXjyi52d_|qJr8`?WMJr9!fzi%|Q^sHfS61qmt(0t$y7U zym<5C(UW>n5W)Bdh|;5{)~g;~q2Q%9{$^gbjYTl2AR0Q$Z{F@?_szc9*_p}552>Xm zFS?hpg@zEpdL@onjoh&B*l`k&v0%MYsaP?)JM~2= z3%Sz~!k)j}`4++>;zWXIC3X>QL_5(zFa#h)q=^i%o9HB()lSX$U6VYoKOq-UW5b=~ zf_g}NZ9xJJ$smUUvTP-cqdycjz|(M4zxc51qu!q2KSha#>Z&*vPUjgr-GZBWm8 zvD2aI_t!K3cXO=nM!w&^^LXE}KHJ~UGwv}8#G=R--?zle1~CSXWmXB#-FI3meCA8f zFkUfLPl%}v(cgBrqv%GUD;`#_Le!k3cU{IbuA;~7#{jxud?Zef8VSuGG>sYMp8L7K9+Ou!y)(S_iZ>45Nt z(X%2+mr2-TGWPZm-XK!0#TV-MaZ`krHCS=FKn7-Ewq%5)a>Odqo!f^-OV86=afmmoOIY?}* zlgK;dr14V!+YHxCeQi2f@ciN!XT1z&MQ@cNx&aiteS{T;MHELyD=90E=tz#`U!)&G CdJo0` literal 0 HcmV?d00001 From 2d40340fe71b07fe060bf25421a1bdc49605a056 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 14:00:38 -0500 Subject: [PATCH 8/9] feat: enforce text size limit and improve OCR fallback handling in processAgentFileUpload - Added a check to ensure extracted text does not exceed the 15MB storage limit, throwing an error if it does. - Refactored the OCR handling logic to improve fallback behavior when the configured OCR fails, ensuring a more robust document processing flow. - Enhanced unit tests to cover scenarios for oversized text and fallback mechanisms, ensuring proper error handling and functionality. --- api/server/services/Files/process.js | 55 ++++++++++---- api/server/services/Files/process.spec.js | 92 +++++++++++++++++++++++ 2 files changed, 131 insertions(+), 16 deletions(-) diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index 379ff3451058..d69be6a00c7c 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -523,6 +523,12 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { * @return {Promise} */ const createTextFile = async ({ text, bytes, filepath, type = 'text/plain' }) => { + const textBytes = Buffer.byteLength(text, 'utf8'); + if (textBytes > 15 * megabyte) { + throw new Error( + `Extracted text from "${file.originalname}" exceeds the 15MB storage limit (${Math.round(textBytes / megabyte)}MB). Try a shorter document.`, + ); + } const fileInfo = removeNullishValues({ text, bytes, @@ -569,26 +575,43 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { const shouldUseOCR = shouldUseConfiguredOCR || shouldUseDocumentParser; - if (shouldUseConfiguredOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { - throw new Error('OCR capability is not enabled for Agents'); - } else if (shouldUseOCR) { + const resolveDocumentText = async () => { + if (shouldUseConfiguredOCR) { + try { + const ocrStrategy = appConfig?.ocr?.strategy ?? FileSources.document_parser; + const { handleFileUpload } = getStrategyFunctions(ocrStrategy); + return await handleFileUpload({ req, file, loadAuthValues }); + } catch (err) { + logger.error( + `[processAgentFileUpload] Configured OCR failed for "${file.originalname}", falling back to document_parser:`, + err, + ); + } + } try { - const ocrStrategy = shouldUseConfiguredOCR - ? (appConfig?.ocr?.strategy ?? FileSources.document_parser) - : FileSources.document_parser; - const { handleFileUpload: uploadOCR } = getStrategyFunctions(ocrStrategy); - const { - text, - bytes, - filepath: ocrFileURL, - } = await uploadOCR({ req, file, loadAuthValues }); - return await createTextFile({ text, bytes, filepath: ocrFileURL }); - } catch (ocrError) { + const { handleFileUpload } = getStrategyFunctions(FileSources.document_parser); + return await handleFileUpload({ req, file, loadAuthValues }); + } catch (err) { logger.error( - `[processAgentFileUpload] OCR processing failed for file "${file.originalname}", falling back to text extraction:`, - ocrError, + `[processAgentFileUpload] Document parser failed for "${file.originalname}":`, + err, ); } + }; + + if (shouldUseConfiguredOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { + throw new Error('OCR capability is not enabled for Agents'); + } + + if (shouldUseOCR) { + const ocrResult = await resolveDocumentText(); + if (ocrResult) { + const { text, bytes, filepath: ocrFileURL } = ocrResult; + return await createTextFile({ text, bytes, filepath: ocrFileURL }); + } + throw new Error( + `Unable to extract text from "${file.originalname}". The document may be image-based and requires an OCR service to process.`, + ); } const shouldUseSTT = fileConfig.checkType( diff --git a/api/server/services/Files/process.spec.js b/api/server/services/Files/process.spec.js index e1b2641aba8f..2938391ff258 100644 --- a/api/server/services/Files/process.spec.js +++ b/api/server/services/Files/process.spec.js @@ -123,6 +123,7 @@ describe('processAgentFileUpload', () => { jest.clearAllMocks(); mockRes.status.mockReturnThis(); mockRes.json.mockReturnValue({}); + checkCapability.mockResolvedValue(true); getStrategyFunctions.mockReturnValue({ handleFileUpload: jest .fn() @@ -227,5 +228,96 @@ describe('processAgentFileUpload', () => { expect(getStrategyFunctions).not.toHaveBeenCalled(); }); + + test('throws instead of falling back to parseText when document_parser fails for a document MIME type', async () => { + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockRejectedValue(new Error('No text found in document')), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + const { parseText } = require('@librechat/api'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/image-based and requires an OCR service/); + + expect(parseText).not.toHaveBeenCalled(); + }); + + test('falls back to document_parser when configured OCR fails for a document MIME type', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const failingUpload = jest.fn().mockRejectedValue(new Error('OCR API returned 500')); + const fallbackUpload = jest + .fn() + .mockResolvedValue({ text: 'parsed text', bytes: 11, filepath: 'doc://result' }); + getStrategyFunctions + .mockReturnValueOnce({ handleFileUpload: failingUpload }) + .mockReturnValueOnce({ handleFileUpload: fallbackUpload }); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).resolves.not.toThrow(); + + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.mistral_ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('throws when both configured OCR and document_parser fallback fail', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockRejectedValue(new Error('failure')), + }); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + const { parseText } = require('@librechat/api'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/image-based and requires an OCR service/); + + expect(parseText).not.toHaveBeenCalled(); + }); + }); + + describe('text size guard', () => { + test('throws before writing to MongoDB when extracted text exceeds 15MB', async () => { + const oversizedText = 'x'.repeat(15 * 1024 * 1024 + 1); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockResolvedValue({ + text: oversizedText, + bytes: Buffer.byteLength(oversizedText, 'utf8'), + filepath: 'doc://result', + }), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + const { createFile } = require('~/models'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/exceeds the 15MB storage limit/); + + expect(createFile).not.toHaveBeenCalled(); + }); + + test('succeeds when extracted text is within the 15MB limit', async () => { + const okText = 'x'.repeat(1024); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockResolvedValue({ + text: okText, + bytes: Buffer.byteLength(okText, 'utf8'), + filepath: 'doc://result', + }), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).resolves.not.toThrow(); + }); }); }); From 9c6d2c48e3ef6a08c0b6ecfc5a967c7887f74f25 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Sun, 22 Feb 2026 14:10:39 -0500 Subject: [PATCH 9/9] fix: correct OCR URL construction in performOCR function - Updated the OCR URL construction to ensure it correctly appends '/ocr' to the base URL if not already present, improving the reliability of the OCR request. --- packages/api/src/files/mistral/crud.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/api/src/files/mistral/crud.ts b/packages/api/src/files/mistral/crud.ts index fefe4a4675d2..c818fab8b884 100644 --- a/packages/api/src/files/mistral/crud.ts +++ b/packages/api/src/files/mistral/crud.ts @@ -165,9 +165,11 @@ export async function performOCR({ config.httpsAgent = new HttpsProxyAgent(process.env.PROXY); } + const ocrURL = baseURL.endsWith('/ocr') ? baseURL : `${baseURL}/ocr`; + return axios .post( - `${baseURL}/ocr`, + ocrURL, { model, image_limit: 0,