Skip to content

Commit 24d6fdc

Browse files
feat: notion syncblock fix by teddysupercuts (#530)
This PR is @teddysupercuts's contribution from his fork. The original PR can be found [here](#508). Re-opening this PR from origin so that our tests run correctly. --------- Co-authored-by: Teddy Wahle <[email protected]>
1 parent b2a3e77 commit 24d6fdc

File tree

3 files changed

+61
-4
lines changed

3 files changed

+61
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.36
2+
3+
* **Added Notion connector sync block handling by teddysupercuts**
4+
15
## 1.0.35
26

37
* **Fix output path in blob storage destination connector**

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.35" # pragma: no cover
1+
__version__ = "1.0.36" # pragma: no cover

unstructured_ingest/processes/connectors/notion/types/blocks/synced_block.py

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@ def can_have_children() -> bool:
1818

1919
@classmethod
2020
def from_dict(cls, data: dict):
21+
"""Create OriginalSyncedBlock from dictionary data.
22+
23+
Original blocks contain children content.
24+
"""
25+
if "children" not in data:
26+
raise ValueError(f"OriginalSyncedBlock data missing 'children': {data}")
2127
return cls(children=data["children"])
2228

2329
def get_html(self) -> Optional[HtmlTag]:
@@ -31,27 +37,74 @@ class DuplicateSyncedBlock(BlockBase):
3137

3238
@staticmethod
3339
def can_have_children() -> bool:
40+
"""Check if duplicate synced blocks can have children.
41+
42+
Duplicate blocks themselves don't have children directly fetched here,
43+
but they represent content that does, so Notion API might report has_children=True
44+
on the parent block object. The actual children are fetched from the original block.
45+
"""
3446
return True
3547

3648
@classmethod
3749
def from_dict(cls, data: dict):
38-
return cls(**data)
50+
"""Create DuplicateSyncedBlock from dictionary data.
51+
52+
Duplicate blocks contain a 'synced_from' reference.
53+
"""
54+
synced_from_data = data.get("synced_from")
55+
if not synced_from_data or not isinstance(synced_from_data, dict):
56+
raise ValueError(f"Invalid data structure for DuplicateSyncedBlock: {data}")
57+
# Ensure required keys are present in the nested dictionary
58+
if "type" not in synced_from_data or "block_id" not in synced_from_data:
59+
raise ValueError(
60+
f"Missing 'type' or 'block_id' in synced_from data: {synced_from_data}"
61+
)
62+
return cls(type=synced_from_data["type"], block_id=synced_from_data["block_id"])
3963

4064
def get_html(self) -> Optional[HtmlTag]:
65+
"""Get HTML representation of the duplicate synced block.
66+
67+
HTML representation might need fetching the original block's content,
68+
which is outside the scope of this simple data class.
69+
"""
4170
return None
4271

4372

4473
class SyncBlock(BlockBase):
4574
@staticmethod
4675
def can_have_children() -> bool:
76+
"""Check if synced blocks can have children.
77+
78+
Synced blocks (both original and duplicate) can conceptually have children.
79+
"""
4780
return True
4881

4982
@classmethod
5083
def from_dict(cls, data: dict):
51-
if "synced_from" in data:
84+
"""Create appropriate SyncedBlock subclass from dictionary data.
85+
86+
Determine if it's a duplicate (has 'synced_from') or original (has 'children').
87+
"""
88+
if data.get("synced_from") is not None:
89+
# It's a duplicate block containing a reference
90+
return DuplicateSyncedBlock.from_dict(data)
91+
elif "children" in data:
92+
# It's an original block containing children
5293
return OriginalSyncedBlock.from_dict(data)
5394
else:
54-
return DuplicateSyncedBlock.from_dict(data)
95+
# Handle cases where neither 'synced_from' nor 'children' is present.
96+
# Notion API might return this for an empty original synced block.
97+
# Let's treat it as an empty OriginalSyncedBlock.
98+
# If this assumption is wrong, errors might occur later.
99+
# Consider logging a warning here if strictness is needed.
100+
return OriginalSyncedBlock(children=[])
101+
55102

56103
def get_html(self) -> Optional[HtmlTag]:
104+
"""Get HTML representation of the synced block.
105+
106+
The specific instance returned by from_dict (Original or Duplicate)
107+
will handle its own get_html logic.
108+
This method on the base SyncBlock might not be directly called.
109+
"""
57110
return None

0 commit comments

Comments
 (0)