Skip to content

Commit

Permalink
Changes after merge request review: fixed requirements, Docker Compose, and added comments and docstrings to the assign_heading_levels function
Browse files Browse the repository at this point in the history
  • Loading branch information
AnnaMarika01 committed Oct 2, 2024
1 parent 7b8ba2c commit 8d8e513
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
4 changes: 1 addition & 3 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,4 @@ services:
pdfact:
image: "ghcr.io/data-house/pdfact:main"
networks:
- internal
ports:
- "4567:4567"
- internal
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ pymupdf==1.22.5
numpy~=1.24.3
requests==2.32.3
fastapi~=0.111.0
pydantic~=2.7.1
pydantic~=2.9.0
pydantic_settings~=2.2.1
uvicorn==0.22.0
parse-document-model==0.2.0
24 changes: 21 additions & 3 deletions text_extractor/parser/pdfact_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from collections import Counter
from typing import List, Dict
from typing import List, Dict, Any

import requests
from parse_document_model import Document, Page
Expand Down Expand Up @@ -95,7 +95,7 @@ def pdfact_to_document(json_data: dict) -> Document:

content = Text(
category=paragraph_detail['role'],
text=paragraph_detail['text'],
content=paragraph_detail['text'],
marks=marks,
attributes=attributes
)
Expand Down Expand Up @@ -313,26 +313,44 @@ def determine_heading_level(document: Document) -> Document:
return document


def assign_heading_levels(heading_styles):
def assign_heading_levels(heading_styles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Assigns heading levels to a list of heading styles based on font size and frequency.
:param heading_styles: A list of dictionaries where each dictionary contains
information about a heading's font name ('font_name')
and its font size ('font_size').
:return: A list of dictionaries, where each dictionary includes 'font_name',
'font_size', and the assigned 'level' (from 1 to 4).
Level 1 is for the largest and level 4 is for the smallest.
"""
# Count the number of occurrences for each font
font_count = Counter([font['font_name'] for font in heading_styles])

# Identify the most common font (likely the main heading font)
main_font = font_count.most_common(1)[0][0]
# Sort the main font headings by decreasing font size
main_fonts = sorted([f for f in heading_styles if f['font_name'] == main_font],
key=lambda x: -x['font_size'])
# Collect other fonts that are not the main font
other_fonts = [f for f in heading_styles if f['font_name'] != main_font]
levels_assigned = {}
# Assign levels (1-4) to the main font headings based on font size
for i, font in enumerate(main_fonts):
level = min(i + 1, 4)
levels_assigned[(font['font_name'], font['font_size'])] = level

# For other fonts, assign levels based on font size comparisons
for font in other_fonts:
size = font['font_size']
same_size_fonts = [f for f in levels_assigned if f[1] == size]

# If the same size exists, assign its level
if same_size_fonts:
level = levels_assigned[same_size_fonts[0]]
else:
# Otherwise, assign level based on size relative to existing main fonts
existing_sizes = sorted([f[1] for f in levels_assigned])

if size > existing_sizes[-1]:
Expand Down

0 comments on commit 8d8e513

Please sign in to comment.