update master (#363)

* ESL-155 Add table bbox annotations to tabby reader (#354) * Add table bbox annotations to tabby reader * Fix tests * Review fixes --------- Co-authored-by: Nasty <[email protected]> * TLDR-476 change swagger (#357) * Use fastapi swagger, add pydantic classes and documentation * Fix documentation and examples * TLDR-465 pdf miner new params (#356) * set char_margin to 3 * add pdf miner test script * fix test_pdf_miner script * fix TestApiPdfWithText * add chaching * rename test to benchmark * add benchmark script again * change name * change name * Try to fix documentation pipeline * fix benchmark --------- Co-authored-by: Nikita Shevtsov <[email protected]> Co-authored-by: Nasty <[email protected]> * ESL-165 table bboxes bug (#358) * ESL-165 Added test with hard tables * ESL-165 fixed bug box extraction in payment_order * ESL-165 after rebase * ESL-165 update README.md * ESL-165 after review --------- Co-authored-by: Nasty <[email protected]> * TLDR-367 refactor metadata extractor (#359) * change add_metadata to extract_metadata in metadata readers * fix usage of extract_metadata * fix docs * change output type to dict * fix code style * fix pr --------- Co-authored-by: Nikita Shevtsov <[email protected]> * ESL-167 extract only word boxes (#360) * ESL-167 extract only word boxes * ESL-167 extract only words bboxes for tabby reader --------- Co-authored-by: Nasty <[email protected]> * TLDR-502 increase converter timeout (#361) * new version 1.1.0 (#362) --------- Co-authored-by: Andrey Mikhailov <[email protected]> Co-authored-by: Nikita Shevtsov <[email protected]> Co-authored-by: Nikita Shevtsov <[email protected]> Co-authored-by: Oksana Belyaeva <[email protected]>
ispras · Oct 24, 2023 · b79dd4c · b79dd4c
1 parent ff26829
commit b79dd4c
Show file tree

Hide file tree

Showing 96 changed files with 719 additions and 765 deletions.
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml
@@ -19,6 +19,7 @@ jobs:
 
     - name: Install dependencies
       run: |
+        sudo apt update
         sudo apt-get install -y libreoffice djvulibre-bin poppler-utils tesseract-ocr libtesseract-dev tesseract-ocr-rus tesseract-ocr-eng
         python -m pip install --upgrade --no-cache-dir pip setuptools
         python -m pip install --exists-action=w --no-cache-dir -r requirements.txt

diff --git a/README.md b/README.md
@@ -9,6 +9,12 @@ It extracts a document’s logical structure and content, its tables, text forma
 The document’s content is represented as a tree storing headings and lists of any level. 
 Dedoc can be integrated in a document contents and structure analysis system as a separate module.
 
+## Workflow
+
+![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)
+
+Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)
+
 ## Features and advantages
 Dedoc is implemented in Python and works with semi-structured data formats (DOC/DOCX, ODT, XLS/XLSX, CSV, TXT, JSON) and non-structured data formats like images (PNG, JPG etc.), archives (ZIP, RAR etc.), PDF and HTML formats. 
 Document structure extraction is fully automatic regardless of input data type. 
@@ -53,6 +59,8 @@ still, the docker application should be installed and configured properly.
 
 If you don't need to change the application configuration, you may use the built docker image as well.
 
+## Work with dedoc as a service
+
 ### 1. Pull the image
 ```shell
 docker pull dedocproject/dedoc

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.0
+1.1.0
diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
@@ -14,6 +14,7 @@
 import dedoc
 from dedoc.api.api_args import QueryParameters
 from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
+from dedoc.api.schema.parsed_document import ParsedDocument
 from dedoc.common.exceptions.dedoc_error import DedocError
 from dedoc.common.exceptions.missing_file_error import MissingFileError
 from dedoc.config import get_config
@@ -60,7 +61,7 @@ def _get_static_file_path(request: Request) -> str:
     return os.path.abspath(os.path.join(directory, file))
 
 
-@app.post("/upload")
+@app.post("/upload", response_model=ParsedDocument)
 async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:  # noqa
     parameters = dataclasses.asdict(query_params)
     if not file or file.filename == "":
@@ -81,15 +82,15 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
         html_content = json2tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content)
     elif return_format == "ujson":
-        return UJSONResponse(content=document_tree.to_dict())
+        return UJSONResponse(content=document_tree.to_api_schema().model_dump())
     elif return_format == "collapsed_tree":
         html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content)
     elif return_format == "pretty_json":
-        return PlainTextResponse(content=json.dumps(document_tree.to_dict(), ensure_ascii=False, indent=2))
+        return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
     else:
         logger.info(f"Send result. File {file.filename} with parameters {parameters}")
-        return ORJSONResponse(content=document_tree.to_dict())
+        return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
 
 
 @app.get("/upload_example")
@@ -100,7 +101,7 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->
 
     if return_format == "html":
         return HTMLResponse(content=json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0))
-    return ORJSONResponse(content=document_tree.to_dict(), status_code=200)
+    return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)
 
 
 @app.exception_handler(DedocError)

diff --git a/dedoc/api/models/__init__.py b/dedoc/api/models/__init__.py
diff --git a/dedoc/api/models/custom_fields.py b/dedoc/api/models/custom_fields.py
diff --git a/dedoc/api/schema/__init__.py b/dedoc/api/schema/__init__.py
@@ -0,0 +1,13 @@
+from .annotation import Annotation
+from .cell_with_meta import CellWithMeta
+from .document_content import DocumentContent
+from .document_metadata import DocumentMetadata
+from .line_metadata import LineMetadata
+from .line_with_meta import LineWithMeta
+from .parsed_document import ParsedDocument
+from .table import Table
+from .table_metadata import TableMetadata
+from .tree_node import TreeNode
+
+__all__ = ["Annotation", "CellWithMeta", "DocumentContent", "DocumentMetadata", "LineMetadata", "LineWithMeta", "ParsedDocument", "Table", "TableMetadata",
+           "TreeNode"]
diff --git a/dedoc/api/schema/annotation.py b/dedoc/api/schema/annotation.py
@@ -0,0 +1,12 @@
+from pydantic import BaseModel, Field
+
+
+class Annotation(BaseModel):
+    """
+    The piece of information about the text line: its appearance or links to another document object.
+    For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
+    """
+    start: int = Field(description="Start of the annotated text", example=0)
+    end: int = Field(description="End of the annotated text (end isn't included)", example=5)
+    name: str = Field(description="Annotation name", example="italic")
+    value: str = Field(description="Annotation value. For example, it may be font size value for size type", example="True")
diff --git a/dedoc/api/schema/cell_with_meta.py b/dedoc/api/schema/cell_with_meta.py
@@ -0,0 +1,15 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.line_with_meta import LineWithMeta
+
+
+class CellWithMeta(BaseModel):
+    """
+    Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
+    """
+    lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
+    rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)
+    colspan: int = Field(description="Number of columns to span like in HTML format", example=2)
+    invisible: bool = Field(description="Indicator for displaying or hiding cell text", example=False)
diff --git a/dedoc/api/schema/document_content.py b/dedoc/api/schema/document_content.py
@@ -0,0 +1,14 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.table import Table
+from dedoc.api.schema.tree_node import TreeNode
+
+
+class DocumentContent(BaseModel):
+    """
+    Content of the document - structured text and tables.
+    """
+    structure: TreeNode = Field(description="Tree structure where content of the document is organized")
+    tables: List[Table] = Field(description="List of document tables")
diff --git a/dedoc/api/schema/document_metadata.py b/dedoc/api/schema/document_metadata.py
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class DocumentMetadata(BaseModel):
+    """
+    Document metadata like its name, size, author, etc.
+    """
+    model_config = ConfigDict(extra="allow")
+
+    uid: str = Field(description="Document unique identifier (useful for attached files)", example="doc_uid_auto_ba73d76a-326a-11ec-8092-417272234cb0")
+    file_name: str = Field(description="Original document name before rename and conversion", example="example.odt")
+    temporary_file_name: str = Field(description="File name during parsing (unique name after rename and conversion)", example="123.odt")
+    size: int = Field(description="File size in bytes", example=20060)
+    modified_time: int = Field(description="Modification time of the document in the UnixTime format", example=1590579805)
+    created_time: int = Field(description="Creation time of the document in the UnixTime format", example=1590579805)
+    access_time: int = Field(description="File access time in the UnixTime format", example=1590579805)
+    file_type: str = Field(description="Mime type of the file", example="application/vnd.oasis.opendocument.text")
+    other_fields: Optional[dict] = Field(description="Other optional fields")
diff --git a/dedoc/api/schema/line_metadata.py b/dedoc/api/schema/line_metadata.py
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class LineMetadata(BaseModel):
+    """
+    Holds information about document node/line metadata, such as page number or line type.
+    """
+    model_config = ConfigDict(extra="allow")
+
+    paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
+    page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
+    line_id: Optional[int] = Field(description="Line number", example=1)
+    other_fields: Optional[dict] = Field(description="Some other fields")
diff --git a/dedoc/api/schema/line_with_meta.py b/dedoc/api/schema/line_with_meta.py
@@ -0,0 +1,13 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.annotation import Annotation
+
+
+class LineWithMeta(BaseModel):
+    """
+    Textual line with text annotations.
+    """
+    text: str = Field(description="Text of the line", example="Some text")
+    annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
diff --git a/dedoc/api/schema/parsed_document.py b/dedoc/api/schema/parsed_document.py
@@ -0,0 +1,17 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.document_content import DocumentContent
+from dedoc.api.schema.document_metadata import DocumentMetadata
+
+
+class ParsedDocument(BaseModel):
+    """
+    Holds information about the document content, metadata and attachments.
+    """
+    content: DocumentContent = Field(description="Document text and tables")
+    metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")
+    version: str = Field(description="Version of the program that parsed this document", example="0.9.1")
+    warnings: List[str] = Field(description="List of warnings and possible errors, arising in the process of document parsing")
+    attachments: List["ParsedDocument"] = Field(description="Result of analysis of attached files - list of `ParsedDocument`")
diff --git a/dedoc/api/schema/table.py b/dedoc/api/schema/table.py
@@ -0,0 +1,16 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.cell_with_meta import CellWithMeta
+from dedoc.api.schema.table_metadata import TableMetadata
+
+
+class Table(BaseModel):
+    """
+    Holds information about tables in the document.
+    We assume that a table has rectangle form (has the same number of columns in each row).
+    Table representation is row-based i.e. external list contains list of rows.
+    """
+    cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
+    metadata: TableMetadata = Field(description="Table meta information")
diff --git a/dedoc/api/schema/table_metadata.py b/dedoc/api/schema/table_metadata.py
@@ -0,0 +1,12 @@
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class TableMetadata(BaseModel):
+    """
+    Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
+    """
+    page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
+    uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")
+    rotated_angle: float = Field(description="Value of the rotation angle (in degrees) by which the table was rotated during recognition", example=1.0)
diff --git a/dedoc/api/schema/tree_node.py b/dedoc/api/schema/tree_node.py
@@ -0,0 +1,20 @@
+from typing import List
+
+from pydantic import BaseModel, Field
+
+from dedoc.api.schema.annotation import Annotation
+from dedoc.api.schema.line_metadata import LineMetadata
+
+
+class TreeNode(BaseModel):
+    """
+    Helps to represent document as recursive tree structure.
+    It has a list of child `TreeNode` nodes (an empty list for a leaf node).
+    """
+    node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
+                                     "The identifier consists of numbers separated by dots where each number "
+                                     "means node's number among nodes with the same level in the document hierarchy.)", example="0.2.1")
+    text: str = Field(description="Text of the node", example="Some text")
+    annotations: List[Annotation] = Field(description="Some metadata related to the part of the text (as font size)")
+    metadata: LineMetadata = Field(description="Metadata for the entire node (as node type)")
+    subparagraphs: List["TreeNode"] = Field(description="List of children of this node, each child is `TreeNode`")
diff --git a/dedoc/attachments_handler/attachments_handler.py b/dedoc/attachments_handler/attachments_handler.py
@@ -95,11 +95,10 @@ def _handle_attachments(self, document: UnstructuredDocument, parameters: dict)
             attachment.tmp_file_path = new_path
 
     def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
-        unstructured_document = UnstructuredDocument(lines=[], tables=[], attachments=[])
         attachment_dir, attachment_name = os.path.split(attachment.get_filename_in_path())
-        unstructured_document = document_parser.document_metadata_extractor.add_metadata(document=unstructured_document, directory=attachment_dir,
-                                                                                         filename=attachment_name, converted_filename=attachment_name,
-                                                                                         original_filename=attachment.get_original_filename(),
-                                                                                         parameters=parameters)
-        metadata = DocumentMetadata(**unstructured_document.metadata)
+        metadata = document_parser.document_metadata_extractor.extract_metadata(directory=attachment_dir,
+                                                                                filename=attachment_name, converted_filename=attachment_name,
+                                                                                original_filename=attachment.get_original_filename(),
+                                                                                parameters=parameters)
+        metadata = DocumentMetadata(**metadata)
         return ParsedDocument(content=get_empty_content(), metadata=metadata)
diff --git a/dedoc/converters/concrete_converters/abstract_converter.py b/dedoc/converters/concrete_converters/abstract_converter.py
@@ -16,7 +16,7 @@ def __init__(self, *, config: dict) -> None:
         """
         :param config: configuration of the converter, e.g. logger for logging
         """
-        self.timeout = 10
+        self.timeout = 60
         self.period_checking = 0.05
         self.config = config
         self.logger = config.get("logger", logging.getLogger())

diff --git a/dedoc/converters/concrete_converters/pdf_converter.py b/dedoc/converters/concrete_converters/pdf_converter.py
@@ -12,7 +12,6 @@ class PDFConverter(AbstractConverter):
     """
     def __init__(self, *, config: dict) -> None:
         super().__init__(config=config)
-        self.timeout = 60
 
     def can_convert(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
         """

diff --git a/dedoc/data_structures/__init__.py b/dedoc/data_structures/__init__.py
@@ -1,4 +1,3 @@
-# noqa
 import dedoc.data_structures.concrete_annotations as annotations
 from .annotation import Annotation
 from .attached_file import AttachedFile

diff --git a/dedoc/data_structures/annotation.py b/dedoc/data_structures/annotation.py
@@ -1,7 +1,4 @@
-from collections import OrderedDict
-
-from flask_restx import Api, Model, fields
-
+from dedoc.api.schema.annotation import Annotation as ApiAnnotation
 from dedoc.data_structures.serializable import Serializable
 
 
@@ -40,26 +37,5 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         return f"{self.name.capitalize()}(...)"
 
-    def to_dict(self) -> dict:
-        res = OrderedDict()
-        res["start"] = self.start
-        res["end"] = self.end
-        res["name"] = self.name
-        res["value"] = self.value
-        return res
-
-    @staticmethod
-    def get_api_dict(api: Api) -> Model:
-        names = [
-            "style", "bold", "italic", "underlined", "size", "indentation", "alignment", "table",
-            "attachment", "spacing", "strike", "subscript", "superscript"
-        ]
-        return api.model("Annotation", {
-            "start": fields.Integer(description="annotation start index", required=True, example=0),
-            "end": fields.Integer(description="annotation end index", required=True, example=4),
-            "name": fields.String(description="annotation name", required=True, example="bold", enum=names),
-            "value": fields.String(description="annotation value. For example, it may be font size value for size type "
-                                               "or type of alignment for alignment type",
-                                   required=True,
-                                   example="left")
-        })
+    def to_api_schema(self) -> ApiAnnotation:
+        return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)