Skip to content

Commit

Permalink
update master (#391)
Browse files Browse the repository at this point in the history
* TLDR 531 pdf_txtlayer_reader table fix (#380)

* TLDR-538 tesseract trustai (#377)

* fixed training script (#383)

* TLDR-521 Fix splitext for file names with several dots (#385)

* TLDR-527 refactor methods and parameters for all main classes (#387)

* Add attach and table annotations to PPTX (#389)

* TLDR-544 docx bugs (#382)

* TLDR-516 GPU in docker (#384)

* new version 2.0 (#390)

---------

Co-authored-by: raxtemur <[email protected]>
Co-authored-by: Oksana Belyaeva <[email protected]>
Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Alexander Golodkov <[email protected]>
Co-authored-by: Nikita Shevtsov <[email protected]>
  • Loading branch information
6 people authored Dec 25, 2023
1 parent d83bf23 commit 1888659
Show file tree
Hide file tree
Showing 133 changed files with 1,963 additions and 1,050 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1.1
2.0
28 changes: 20 additions & 8 deletions dedoc/attachments_extractors/abstract_attachment_extractor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
import uuid
from abc import ABC, abstractmethod
Expand All @@ -11,29 +12,40 @@ class AbstractAttachmentsExtractor(ABC):
"""
This class is responsible for extracting files attached to the documents of different formats.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    Store the extractor configuration and pick the logger to use.

    :param config: configuration of the attachments extractor, e.g. logger for logging;
        an empty dict is used when it is not given
    """
    if config is None:
        config = {}
    self.config = config
    # Fall back to the root logger when the configuration does not provide one.
    self.logger = self.config.get("logger", logging.getLogger())

@abstractmethod
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Check if this attachments extractor can get attachments of the file.
    You should provide at least one of the following parameters: file_path, extension, mime.

    :param file_path: the path of the file to extract attachments from
    :param extension: file extension with a dot, for example .doc or .pdf
    :param mime: MIME type of file
    :param parameters: any additional parameters for the given document
    :return: the indicator of possibility to get attachments of this file
    """
    pass

@abstractmethod
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
    """
    Extract attachments from the given file.
    This method can only be called on appropriate files, ensure that
    :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` is True for the given file.

    :param file_path: path of the file to extract attachments from
    :param parameters: dict with different parameters for extracting, see :ref:`attachments_handling_parameters` for more details
    :return: list of file's attachments
    """
    pass
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import zipfile
from abc import ABC
from typing import List, Tuple
from typing import List, Optional, Tuple

import olefile
from charset_normalizer import from_bytes
Expand All @@ -14,6 +14,9 @@ class AbstractOfficeAttachmentsExtractor(AbstractAttachmentsExtractor, ABC):
"""
Extract attachments from files of Microsoft Office format like docx, pptx, xlsx.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def __parse_ole_contents(self, stream: bytes) -> Tuple[str, bytes]:
"""
Parse the binary content of olefile.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,36 @@
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.utils import get_mime_extension


class DocxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
"""
Extract attachments from docx files.
"""
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Tell whether attachments can be extracted from the document, i.e. whether it is docx-like.

    The extension and MIME type are first resolved with ``get_mime_extension`` from the
    given ``file_path``, ``extension`` and ``mime``; the file is accepted when either the
    lower-cased extension or the MIME type is in the recognized docx-like sets.
    """
    extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
    if extension.lower() in recognized_extensions.docx_like_format:
        return True
    return mime in recognized_mimes.docx_like_format

def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Get attachments from the given docx document.
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
result = []
try:
with zipfile.ZipFile(os.path.join(tmpdir, filename), "r") as zfile:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,28 +1,39 @@
import os
from typing import List, Optional

from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.utils import splitext_
from dedoc.utils.utils import get_mime_extension, splitext_


class ExcelAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
"""
Extracts attachments from xlsx files.
"""
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Tell whether attachments can be extracted from the document, i.e. whether it is excel-like.

    The extension and MIME type are first resolved with ``get_mime_extension`` from the
    given ``file_path``, ``extension`` and ``mime``; the file is accepted when either the
    lower-cased extension or the MIME type is in the recognized excel-like sets.
    """
    extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
    if extension.lower() in recognized_extensions.excel_like_format:
        return True
    return mime in recognized_mimes.excel_like_format

def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Get attachments from the given xlsx document.
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
attachments = []
name, ext = splitext_(filename)
if ext.lower() != ".xlsx":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,28 @@

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.utils.utils import get_mime_extension


class JsonAttachmentsExtractor(AbstractAttachmentsExtractor):
"""
Extract attachments from json files.
"""
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Tell whether attachments can be extracted from the document (it should have .json extension).

    Only the extension resolved by ``get_mime_extension`` is inspected; the MIME type is not used.
    """
    extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
    normalized_extension = extension.lower()
    return normalized_extension.endswith(".json")

def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Get attachments from the given json document.
Attached files are html files if the option `html_fields` is given in the `parameters`.
Expand All @@ -33,6 +42,8 @@ def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
attachments = []

with open(os.path.join(tmpdir, filename)) as f:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging
import json
import os
import uuid
from typing import List, Optional, Tuple
Expand All @@ -8,36 +8,39 @@
from PyPDF2.utils import PdfReadError

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.attachments_extractors.utils import create_note
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.utils import convert_datetime
from dedoc.utils.utils import convert_datetime, get_mime_extension, get_unique_name


class PDFAttachmentsExtractor(AbstractAttachmentsExtractor):
"""
Extract attachments from pdf files.
"""
def __init__(self, *, config: dict) -> None:
"""
:param config: configuration of the extractor, e.g. logger for logging
"""
self.config = config
self.logger = config.get("logger", logging.getLogger())
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Checks if this extractor can get attachments from the document (it should have .pdf extension)

    The extension and MIME type are first resolved with ``get_mime_extension`` from the
    given ``file_path``, ``extension`` and ``mime``; the file is accepted when either the
    lower-cased extension or the MIME type is in the recognized pdf-like sets.
    """
    extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
    # Must test the pdf-like sets: checking the docx-like ones made this extractor
    # reject real PDF files and claim docx files instead.
    return extension.lower() in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format

def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Get attachments from the given pdf document.
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)

with open(os.path.join(tmpdir, filename), "rb") as handler:
try:
reader = PyPDF2.PdfFileReader(handler)
Expand Down Expand Up @@ -74,7 +77,7 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
user = note.get("/T")
data = note.get("/Contents", "")

name, content = create_note(content=data, modified_time=modified_time, created_time=created_time, author=user)
name, content = self.__create_note(content=data, modified_time=modified_time, created_time=created_time, author=user)
attachments.append((name, bytes(content)))
return attachments

Expand Down Expand Up @@ -108,3 +111,16 @@ def __get_root_attachments(self, reader: PyPDF2.PdfFileReader) -> List[Tuple[str
attachments.append((name, data))

return attachments

def __create_note(self, content: str, modified_time: int, created_time: int, author: str, size: Optional[int] = None) -> Tuple[str, bytes]:
    """
    Serialize a note into the content of a JSON attachment.

    :param content: text of the note
    :param modified_time: modification time of the note
    :param created_time: creation time of the note
    :param author: author of the note
    :param size: size to store for the note; when it is not given (or is falsy) the length of ``content`` is stored
    :return: pair of the file name obtained from ``get_unique_name("note.json")`` and the UTF-8 encoded JSON of the note
    """
    filename = get_unique_name("note.json")
    note_dict = {
        "content": content,
        "modified_time": modified_time,
        "created_time": created_time,
        "size": size if size else len(content),
        "author": author
    }
    encode_data = json.dumps(note_dict).encode("utf-8")

    return filename, encode_data
Original file line number Diff line number Diff line change
@@ -1,28 +1,39 @@
import os
from typing import List, Optional

from dedoc.attachments_extractors.concrete_attachments_extractors.abstract_office_attachments_extractor import AbstractOfficeAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.utils.utils import splitext_
from dedoc.utils.utils import get_mime_extension, splitext_


class PptxAttachmentsExtractor(AbstractOfficeAttachmentsExtractor):
"""
Extract attachments from pptx files.
"""
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    :param config: optional configuration, forwarded unchanged to the parent class initializer
    """
    super().__init__(config=config)

def can_extract(self,
                file_path: Optional[str] = None,
                extension: Optional[str] = None,
                mime: Optional[str] = None,
                parameters: Optional[dict] = None) -> bool:
    """
    Tell whether attachments can be extracted from the document, i.e. whether it is pptx-like.

    The extension and MIME type are first resolved with ``get_mime_extension`` from the
    given ``file_path``, ``extension`` and ``mime``; the file is accepted when either the
    lower-cased extension or the MIME type is in the recognized pptx-like sets.
    """
    extension, mime = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
    if extension.lower() in recognized_extensions.pptx_like_format:
        return True
    return mime in recognized_mimes.pptx_like_format

def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
"""
Get attachments from the given pptx document.
Look to the :class:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor` documentation to get the information about \
the methods' parameters.
"""
parameters = {} if parameters is None else parameters
tmpdir, filename = os.path.split(file_path)
result = []
name, ext = splitext_(filename)

Expand Down
17 changes: 0 additions & 17 deletions dedoc/attachments_extractors/utils.py

This file was deleted.

16 changes: 8 additions & 8 deletions dedoc/attachments_handler/attachments_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import logging
import os
import time
from typing import List
from typing import List, Optional

from dedoc.attachments_extractors import AbstractAttachmentsExtractor
from dedoc.common.exceptions.dedoc_error import DedocError
Expand All @@ -22,11 +22,11 @@ class AttachmentsHandler:
the parsing recursion may be set via `recursion_deep_attachments` parameter.
"""

def __init__(self, *, config: dict) -> None:
def __init__(self, *, config: Optional[dict] = None) -> None:
    """
    Store the handler configuration and pick the logger to use.

    :param config: configuration of the handler, e.g. logger for logging;
        an empty dict is used when it is not given
    """
    settings = config if config is not None else {}
    self.config = settings
    # Fall back to the root logger when the configuration does not provide one.
    self.logger = settings.get("logger", logging.getLogger())

def handle_attachments(self, document_parser: "DedocManager", document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]: # noqa
Expand Down Expand Up @@ -77,10 +77,10 @@ def handle_attachments(self, document_parser: "DedocManager", document: Unstruct
return parsed_attachment_files

def __get_empty_document(self, document_parser: "DedocManager", attachment: AttachedFile, parameters: dict) -> ParsedDocument:  # noqa
    """
    Build a parsed document with empty content for the given attachment.

    Only the metadata is extracted, using the metadata extractor of ``document_parser``;
    the content comes from ``get_empty_content()``.

    :param document_parser: manager whose ``document_metadata_extractor`` is used
    :param attachment: attached file to describe
    :param parameters: parameters passed through to the metadata extractor
    :return: parsed document with empty content and the extracted metadata
    """
    # The extractor takes the full path directly, so the path is not split into directory and file name.
    metadata = document_parser.document_metadata_extractor.extract(
        file_path=attachment.get_filename_in_path(),
        original_filename=attachment.get_original_filename(),
        parameters=parameters
    )
    metadata = DocumentMetadata(**metadata)
    return ParsedDocument(content=get_empty_content(), metadata=metadata)
Loading

0 comments on commit 1888659

Please sign in to comment.