@@ -65,39 +65,42 @@ def create(
6565 extra_body : Body | None = None ,
6666 timeout : float | httpx .Timeout | None | NotGiven = NOT_GIVEN ,
6767 ) -> ParseCreateResponse :
68- """Parse a file into a structured Markdown representation .
68+ """Parse a file into structured Markdown and/or JSON.
6969
70- The file size must be
71- less than 100MB and the number of pages must be less than 400.
70+ Files must be smaller than
71+ 100MB and have fewer than 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
72+ PDF, which may affect page count.
7273
74+ See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
75+ [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
7376 Contact us with
7477 any feedback or questions.
7578
7679 Args:
7780 raw_file: The file to be parsed. The file type must be PDF, DOC / DOCX, PPT / PPTX.
7881
79- enable_document_hierarchy: Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
80- table of contents to the output with the structure of the entire parsed
81- document . Not permitted in ' basic' parsing_mode, or if page_range is not
82- continuous and/or does not start from page zero.
82+ enable_document_hierarchy: Adds a table of contents to the output with the structure of the entire parsed
83+ document. This feature is in beta. Also parses heading levels (e.g. H1, H2,
84+ H3) at higher quality. Not permitted in `basic` parse_mode, or if page_range
85+ is not contiguous and/or does not start from page zero.
8386
8487 enable_split_tables: Controls whether tables are split into multiple tables by row with the headers
8588 propagated. Use for improving LLM comprehension of very large tables. Not
86- permitted in ' basic' parsing_mode.
89+ permitted in `basic` parse_mode.
8790
88- figure_caption_mode: Controls how thorough figure captions are. ' concise' is short and minimizes
89- chances of hallucinations. ' detailed' is more thorough and can include
90- commentary. Not permitted in ' basic' parsing_mode.
91+ figure_caption_mode: Controls how thorough figure captions are. `concise` is short and minimizes
92+ chances of hallucinations. `detailed` is more thorough and can include
93+ commentary; this mode is in beta. Not permitted in `basic` parse_mode.
9194
9295 max_split_table_cells: Threshold number of table cells beyond which large tables are split if
93- `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
96+ `enable_split_tables` is True. Not permitted in `basic` parse_mode.
9497
9598 page_range: Optional string representing page range to be parsed. Format: comma-separated
96- indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
97- ' 0-2,5,6'
99+ indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
100+ `0-2,5,6`).
98101
99- parse_mode: The settings to use for parsing. ' basic' is for simple, text-only documents.
100- ' standard' is for complex documents with images, complex hierarchy, and/or no
102+ parse_mode: The settings to use for parsing. `basic` is for simple, text-only documents.
103+ `standard` is for complex documents with images, complex hierarchy, and/or no
101104 natively encoded textual data (e.g. for scanned documents).
102105
103106 extra_headers: Send extra headers
@@ -156,11 +159,11 @@ def job_results(
156159 job_id: Unique ID of the parse job
157160
158161 output_types: The desired output format(s) of the parsed file. Must be `markdown-document`,
159- `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
160- whole document into a single concatenated markdown output. `markdown-per-page`
161- provides markdown output per page. `blocks-per-page` provides a structured JSON
162+ `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
163+ multiple formats in the response. `markdown-document` parses the whole document
164+ into a single concatenated markdown output. `markdown-per-page` provides
165+ markdown output per page. `blocks-per-page` provides a structured JSON
162166 representation of the content blocks on each page, sorted by reading order.
163- Specify multiple values to get multiple formats in the response.
164167
165168 extra_headers: Send extra headers
166169
@@ -298,39 +301,42 @@ async def create(
298301 extra_body : Body | None = None ,
299302 timeout : float | httpx .Timeout | None | NotGiven = NOT_GIVEN ,
300303 ) -> ParseCreateResponse :
301- """Parse a file into a structured Markdown representation .
304+ """Parse a file into structured Markdown and/or JSON.
302305
303- The file size must be
304- less than 100MB and the number of pages must be less than 400.
306+ Files must be smaller than
307+ 100MB and have fewer than 400 pages. We use LibreOffice to convert DOC(X) and PPT(X) files to
308+ PDF, which may affect page count.
305309
310+ See our [blog post](https://contextual.ai/blog/document-parser-for-rag) and
311+ [code examples](https://github.com/ContextualAI/examples/blob/main/03-standalone-api/04-parse/parse.ipynb).
306312 Contact us with
307313 any feedback or questions.
308314
309315 Args:
310316 raw_file: The file to be parsed. The file type must be PDF, DOC / DOCX, PPT / PPTX.
311317
312- enable_document_hierarchy: Controls parsing heading levels (e.g. H1, H2, H3) at higher quality. Adds a
313- table of contents to the output with the structure of the entire parsed
314- document . Not permitted in ' basic' parsing_mode, or if page_range is not
315- continuous and/or does not start from page zero.
318+ enable_document_hierarchy: Adds a table of contents to the output with the structure of the entire parsed
319+ document. This feature is in beta. Also parses heading levels (e.g. H1, H2,
320+ H3) at higher quality. Not permitted in `basic` parse_mode, or if page_range
321+ is not contiguous and/or does not start from page zero.
316322
317323 enable_split_tables: Controls whether tables are split into multiple tables by row with the headers
318324 propagated. Use for improving LLM comprehension of very large tables. Not
319- permitted in ' basic' parsing_mode.
325+ permitted in `basic` parse_mode.
320326
321- figure_caption_mode: Controls how thorough figure captions are. ' concise' is short and minimizes
322- chances of hallucinations. ' detailed' is more thorough and can include
323- commentary. Not permitted in ' basic' parsing_mode.
327+ figure_caption_mode: Controls how thorough figure captions are. `concise` is short and minimizes
328+ chances of hallucinations. `detailed` is more thorough and can include
329+ commentary; this mode is in beta. Not permitted in `basic` parse_mode.
324330
325331 max_split_table_cells: Threshold number of table cells beyond which large tables are split if
326- `enable_split_tables` is True. Not permitted in ' basic' parsing_mode.
332+ `enable_split_tables` is True. Not permitted in `basic` parse_mode.
327333
328334 page_range: Optional string representing page range to be parsed. Format: comma-separated
329- indexes (0-based) e.g. ' 0,1,2,5,6' or ranges ( inclusive of both ends) e.g.
330- ' 0-2,5,6'
335+ indexes (0-based, e.g. `0,1,2,5,6`), or ranges inclusive of both ends (e.g.
336+ `0-2,5,6`).
331337
332- parse_mode: The settings to use for parsing. ' basic' is for simple, text-only documents.
333- ' standard' is for complex documents with images, complex hierarchy, and/or no
338+ parse_mode: The settings to use for parsing. `basic` is for simple, text-only documents.
339+ `standard` is for complex documents with images, complex hierarchy, and/or no
334340 natively encoded textual data (e.g. for scanned documents).
335341
336342 extra_headers: Send extra headers
@@ -389,11 +395,11 @@ async def job_results(
389395 job_id: Unique ID of the parse job
390396
391397 output_types: The desired output format(s) of the parsed file. Must be `markdown-document`,
392- `markdown-per-page`, and/or `blocks-per-page`. `markdown-document` parses the
393- whole document into a single concatenated markdown output. `markdown-per-page`
394- provides markdown output per page. `blocks-per-page` provides a structured JSON
398+ `markdown-per-page`, and/or `blocks-per-page`. Specify multiple values to get
399+ multiple formats in the response. `markdown-document` parses the whole document
400+ into a single concatenated markdown output. `markdown-per-page` provides
401+ markdown output per page. `blocks-per-page` provides a structured JSON
395402 representation of the content blocks on each page, sorted by reading order.
396- Specify multiple values to get multiple formats in the response.
397403
398404 extra_headers: Send extra headers
399405
0 commit comments