Skip to content

Commit 54ebdc7

Browse files
committed
Issue 6: add import stat monitor
1 parent 956a7f3 commit 54ebdc7

13 files changed

+388
-112
lines changed

demos/Test Case G.html

Lines changed: 147 additions & 0 deletions
Large diffs are not rendered by default.

demos/Test Case H.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
3+
<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/><meta name="exporter-version" content="Evernote Mac 9.6.8 (470886)"/><meta name="created" content="2015-09-25 18:06:32 +0000"/><meta name="source" content="mobile.android"/><meta name="updated" content="2015-09-25 18:06:32 +0000"/><title>su</title></head><body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;"><div>su<div><br/></div>dd if=/sdcard/twrp.img of=/dev/block/platform/msm_sdcc.1/by-name/recovery<div><br/></div></div></body></html>

examples/parse_tag.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878
"name": "python",
7979
"nbconvert_exporter": "python",
8080
"pygments_lexer": "ipython3",
81-
"version": "3.11.0"
81+
"version": "3.11.2"
8282
},
8383
"orig_nbformat": 4
8484
},

html2notion/main.py

Lines changed: 68 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
import argparse
22
import os
33
import sys
4-
from pathlib import Path
54
import asyncio
5+
from pathlib import Path
66
from aiohttp import ClientSession
77
from notion_client import AsyncClient
88
from rich.console import Console
99
from rich.table import Table
1010
from rich.text import Text
11+
from rich import box
1112
from .utils import setup_logger, read_config, logger, config
1213
from .translate.notion_import import NotionImporter
1314
from .translate.batch_import import BatchImport
15+
from .translate.import_stats import StatLevel
1416
console = Console()
1517

1618

@@ -27,22 +29,62 @@ def parse_args():
2729
return parser.parse_args()
2830

2931

30-
def print_fail_details(failed_files):
31-
if len(failed_files) == 0:
32+
def print_single_stats(stat):
33+
if stat.get_level() == StatLevel.EXCEPTION.value:
34+
text = Text(f"Failed to import {stat.filename}", style="default")
35+
text.append(f"\nException: {stat.exception}", style="red")
36+
console.print(text)
3237
return
33-
table = Table(title=f"\nFailed Detail\nLog path: {config.get('log_path')}")
34-
table.add_column("File Name", justify="left", style="cyan", no_wrap=True)
35-
table.add_column("Fail Reason", justify="left", style="red", no_wrap=True)
36-
37-
for row in failed_files:
38-
table.add_row(str(row[0].name), str(row[1]))
38+
39+
title = f"{stat.filename}" if stat.filename else "Import Result (Loss filename)"
40+
style = "default"
41+
if stat.get_level() == StatLevel.LOSS.value:
42+
title += " (Loss some content)"
43+
style = "yellow"
44+
elif stat.get_level() == StatLevel.SUCC.value:
45+
title += "(Import successfully)"
46+
style = "green"
47+
48+
table = Table(title=title, title_style=style, expand=True, box=box.HEAVY_HEAD, show_lines=True)
49+
table.add_column("Item", justify="right", style="default")
50+
table.add_column("Html", style="default")
51+
table.add_column("Notion", justify="left", style="default")
52+
table.add_row("Text Len", str(stat.text_count), str(stat.notion_text_count))
53+
table.add_row("Image Count", str(stat.image_count), str(stat.notion_image_count))
54+
if stat.skip_tag:
55+
table.add_row("Skip Tag Count", "", 'Detail: [yellow]' + ";".join([repr(s)
56+
for s in stat.skip_tag])[:2000] + "[/yellow]")
57+
3958
console.print(table)
4059

41-
text = Text("\nIf you need help, please submit an ")
42-
link = Text("issue", style="cyan underline link https://github.com/selfboot/html2notion/issues")
43-
text.append(link)
44-
text.append(" on gitHub.\n")
45-
console.print(text)
60+
61+
def print_batch_stats(batch_import):
    """Print a console summary for a finished batch import.

    Reads ``batch_import.all_files`` (the files selected for import) and
    ``batch_import.batch_stats`` (one stat object per processed file), then:

    * prints a green banner when every file finished at the SUCC level;
    * prints a table of the files whose level is EXCEPTION;
    * prints a table of the files whose level is LOSS.

    Each stat object is expected to provide ``get_level()``, ``filename`` and
    a ``str()`` representation, which is used as the detail column.
    """
    all_files = batch_import.all_files
    batch_stats = batch_import.batch_stats

    # Keep only the stats that really succeeded. The previous filter was
    # negated, so the banner appeared exactly when nothing had succeeded.
    success_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.SUCC.value]
    if len(success_stats) == len(all_files):
        console.print("All files migrated successfully and there is no data loss.", style="green")

    failed_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.EXCEPTION.value]
    if failed_stats:
        table = Table(
            title=f"\nImport Fail Exception Detail\nLog path: {config.get('log_path')}",
            expand=True,
            box=box.HEAVY_HEAD,
            show_lines=True,
        )
        table.add_column("File Name", justify="left", style="default")
        table.add_column("Fail Reason", justify="left", style="default")

        for stat in failed_stats:
            table.add_row(str(stat.filename), str(stat))
        console.print(table)

    loss_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.LOSS.value]
    if loss_stats:
        table = Table(
            title="\nImport Data Loss Detail (You can use --file to import single file for more info)\n",
            expand=True,
            box=box.HEAVY_HEAD,
            show_lines=True,
        )
        table.add_column("File Name", justify="left", style="default")
        table.add_column("Loss Detail", justify="left", style="default")

        for stat in loss_stats:
            table.add_row(str(stat.filename), str(stat))
        console.print(table)
87+
4688

4789

4890
def prepare_env(args: argparse.Namespace):
@@ -70,13 +112,9 @@ async def import_single_file(file):
70112
async with ClientSession() as session:
71113
async with AsyncClient(auth=notion_api_key) as notion_client:
72114
notion_importer = NotionImporter(session, notion_client)
73-
try:
74-
result = await notion_importer.process_file(file)
75-
logger.info(f"Finish file {file}")
76-
return ("succ", result)
77-
except Exception as e:
78-
logger.error(f"Error processing {file}: {str(e)}")
79-
return ("fail", str(e))
115+
await notion_importer.process_file(file)
116+
return notion_importer.import_stats
117+
80118

81119
def main():
82120
args = parse_args()
@@ -86,32 +124,22 @@ def main():
86124
dir_path = Path(args.dir) if args.dir else None
87125
max_concurrency = args.batch
88126
if file_path and file_path.is_file():
89-
result = asyncio.run(import_single_file(file_path))
90-
text = Text("Single file ", style="default")
91-
text.append(f"{file_path} ", style="cyan")
92-
if result[0] == "fail":
93-
text.append("save to notion failed.", style="default")
94-
text.append(f"\n{result[1]}", style="red")
95-
else:
96-
text.append("save to notion success.", style="default")
97-
console.print(text)
127+
stats = asyncio.run(import_single_file(file_path))
128+
print_single_stats(stats)
98129
elif dir_path and dir_path.is_dir():
99130
logger.info(f"Begin save all html files in the dir: {dir_path}.")
100131
batch_import = BatchImport(dir_path, max_concurrency)
101132
result = asyncio.run(batch_import.process_directory())
102133
logger.info(f"Finish save all html files in the dir: {dir_path}.\n{result}")
103-
104-
if len(batch_import.success_files) == len(batch_import.all_files):
105-
console.print(f"All files processed success.", style="green")
106-
107-
print_fail_details(batch_import.failed_files)
134+
print_batch_stats(batch_import)
108135
else:
109136
text = Text("The parameters provided are incorrect, please check.", style="red")
110-
text.append("\nIf you need help, please submit an ", style="default")
111-
link = Text("issue", style="cyan underline link https://github.com/selfboot/html2notion/issues")
112-
text.append(link)
113-
text.append(" on gitHub.", style="default")
114-
console.print(text)
137+
138+
text = Text("\nIf you need help, please submit an ", style="default")
139+
link = Text("issue", style="cyan underline link https://github.com/selfboot/html2notion/issues")
140+
text.append(link)
141+
text.append(" on gitHub.", style="default")
142+
console.print(text)
115143
return
116144

117145

html2notion/translate/batch_import.py

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,33 +24,19 @@ def __init__(self, directory: Path, concurrent_limit: int = 10):
2424
else:
2525
self.notion_api_key = config['notion']['api_key']
2626
self.notion_client = AsyncClient(auth=self.notion_api_key)
27-
self.all_files = []
28-
self.failed_files = []
29-
self.success_files = []
27+
self.batch_stats = []
3028
self.files_lock = Lock()
3129

3230
@staticmethod
33-
async def process_file(session, notion_client, file_path, files_lock, failed_files, success_files):
31+
async def process_file(session, notion_client, file_path, files_lock, batch_stats):
3432
logger.info(f"Begin file, file {file_path}")
3533
notion_import = NotionImporter(session, notion_client)
36-
if file_path.is_file():
37-
try:
38-
response = await notion_import.process_file(file_path)
39-
logger.info(f"Finish file {file_path}")
40-
async with files_lock:
41-
success_files.append(file_path)
42-
return response
43-
except Exception as e:
44-
logger.error(f"Error processing {file_path}: {str(e)}")
45-
async with files_lock:
46-
failed_files.append((file_path, str(e)))
47-
return None
48-
else:
49-
logger.error(f"Error processing {file_path}: File not found")
50-
async with files_lock:
51-
failed_files.append((file_path, "File not found"))
52-
return None
53-
34+
response = await notion_import.process_file(file_path)
35+
logger.info(f"Finish file {file_path}, status {str(notion_import.import_stats)}")
36+
async with files_lock:
37+
batch_stats.append(notion_import.import_stats)
38+
return response
39+
5440
async def process_directory(self):
5541
semaphore = asyncio.Semaphore(self.concurrent_limit)
5642
self.all_files = [file_path for file_path in self.directory.glob('*.html') if file_path.name != 'index.html']
@@ -71,8 +57,8 @@ async def process_directory(self):
7157
failed_task_id = progress.add_task("[red]Failed", total=files_len, style="red")
7258
async def process_file_with_semaphore(session, notion_client, file_path):
7359
async with semaphore:
74-
result = await self.process_file(session, notion_client, file_path, self.files_lock, self.failed_files, self.success_files)
75-
if result:
60+
result = await self.process_file(session, notion_client, file_path, self.files_lock, self.batch_stats)
61+
if result == "succ":
7662
progress.update(success_task_id, advance=1)
7763
else:
7864
progress.update(failed_task_id, advance=1)

html2notion/translate/html2json.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -61,35 +61,35 @@ def _infer_input_type(html_content):
6161
return Default_Type
6262

6363

64-
def _get_converter(html_content):
64+
def _get_converter(html_content, import_stat):
6565
html_type = _infer_input_type(html_content)
6666
logger.info(f"Input type: {html_type}")
67-
converter = Html2JsonBase.create(html_type, html_content)
67+
converter = Html2JsonBase.create(html_type, html_content, import_stat)
6868
return converter
6969

7070

7171
@singledispatch
72-
def html2json_process(html_content):
73-
raise TypeError("Unsupported param type")
72+
def html2json_process(html_content, import_stat):
73+
raise TypeError(f"Unsupported {type(html_content)}, {import_stat}")
7474

7575

7676
@html2json_process.register
77-
def _(html_content: str):
78-
converter = _get_converter(html_content)
77+
def _(html_content: str, import_stat):
78+
converter = _get_converter(html_content, import_stat)
7979
result = converter.process()
8080
return converter.get_notion_data(), result
8181

8282

8383
@html2json_process.register
84-
def _(html_file: Path):
84+
def _(html_file: Path, import_stat):
8585
if not html_file.is_file():
8686
print(f"Load file: {html_file.resolve()} failed")
87-
sys.exit(1)
87+
raise FileNotFoundError
8888

8989
with open(html_file, "r") as file:
9090
html_content = file.read()
9191

92-
converter = _get_converter(html_content)
92+
converter = _get_converter(html_content, import_stat)
9393
result = converter.process()
9494
return converter.get_notion_data(), result
9595

html2notion/translate/html2json_base.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,12 @@ class Html2JsonBase:
4343
]
4444

4545
# Page content should be: https://developers.notion.com/reference/post-page
46-
def __init__(self, html_content):
46+
def __init__(self, html_content, import_stat):
4747
self.html_content = html_content
4848
self.children = []
4949
self.properties = {}
5050
self.parent = {}
51+
self.import_stat = import_stat
5152
if 'GITHUB_ACTIONS' in os.environ:
5253
notion_database_id = os.environ['notion_db_id_1']
5354
else:
@@ -123,8 +124,7 @@ def parse_one_style(tag_soup: Tag, text_params: dict):
123124
# <b><u>unlineline and bold</u></b>
124125
# <div><font color="#ff2600">Red color4</font></div>
125126
# <div> Code in super note</div>
126-
@staticmethod
127-
def generate_inline_obj(tag: PageElement):
127+
def generate_inline_obj(self, tag: PageElement):
128128
res_obj = []
129129
text_with_parents = Html2JsonBase.extract_text_and_parents(tag)
130130
for (text, parent_tags) in text_with_parents:
@@ -134,7 +134,6 @@ def generate_inline_obj(tag: PageElement):
134134
text_params = {"plain_text": chunk}
135135
for parent in parent_tags:
136136
Html2JsonBase.parse_one_style(parent, text_params)
137-
138137
# process inline line break
139138
if chunk == "<br>":
140139
try:
@@ -145,29 +144,32 @@ def generate_inline_obj(tag: PageElement):
145144
continue
146145

147146
if text_params.get("url", ""):
148-
text_obj = Html2JsonBase.generate_link(**text_params)
147+
text_obj = self.generate_link(**text_params)
149148
else:
150-
text_obj = Html2JsonBase.generate_text(**text_params)
149+
text_obj = self.generate_text(**text_params)
151150
if text_obj:
152151
res_obj.append(text_obj)
153152
return res_obj
154153

155-
@staticmethod
156-
def generate_link(**kwargs):
157-
if not kwargs.get("plain_text", ""):
154+
155+
156+
def generate_link(self, **kwargs):
157+
plain_text = kwargs.get("plain_text", "")
158+
if not plain_text:
158159
return
160+
161+
self.import_stat.add_notion_text(plain_text)
159162
return {
160163
"href": kwargs.get("url", ""),
161-
"plain_text": kwargs.get("plain_text", ""),
164+
"plain_text": plain_text,
162165
"text": {
163166
"link": {"url": kwargs.get("url", "")},
164-
"content": kwargs.get("plain_text", "")
167+
"content": plain_text
165168
},
166169
"type": "text"
167170
}
168171

169-
@staticmethod
170-
def generate_text(**kwargs):
172+
def generate_text(self, **kwargs):
171173
plain_text = kwargs.get("plain_text", "")
172174
if not plain_text:
173175
return
@@ -176,7 +178,9 @@ def generate_text(**kwargs):
176178
for key, value in kwargs.items()
177179
if key in Html2JsonBase._text_annotations and isinstance(value, Html2JsonBase._text_annotations[key])
178180
}
179-
181+
stats_count = kwargs.get("stats_count", True)
182+
if stats_count:
183+
self.import_stat.add_notion_text(plain_text)
180184
text_obj = {
181185
"plain_text": plain_text,
182186
"text": {"content": plain_text},
@@ -187,8 +191,7 @@ def generate_text(**kwargs):
187191

188192
return text_obj
189193

190-
@staticmethod
191-
def generate_properties(**kwargs):
194+
def generate_properties(self, **kwargs):
192195
title = kwargs.get("title", "")
193196
url = kwargs.get("url", "")
194197
tags = kwargs.get("tags", [])
@@ -405,7 +408,7 @@ def _convert_one_list_item(self, soup, list_type):
405408
"type": list_type,
406409
}
407410
rich_text = json_obj[list_type]["rich_text"]
408-
text_obj = Html2JsonBase.generate_inline_obj(soup)
411+
text_obj = self.generate_inline_obj(soup)
409412
if text_obj:
410413
rich_text.extend(text_obj)
411414

@@ -439,7 +442,7 @@ def convert_table(self, soup):
439442
}
440443
}
441444
for td in td_tags:
442-
col = Html2JsonBase.generate_inline_obj(td)
445+
col = self.generate_inline_obj(td)
443446
one_row["table_row"]["cells"].append(col)
444447
table_rows.append(one_row)
445448

@@ -458,8 +461,8 @@ def register(cls, input_type, subclass):
458461
cls._registry[input_type] = subclass
459462

460463
@classmethod
461-
def create(cls, input_type, html_content):
464+
def create(cls, input_type, html_content, import_stat):
462465
subclass = cls._registry.get(input_type)
463466
if subclass is None:
464467
raise ValueError(f"noknown: {input_type}")
465-
return subclass(html_content)
468+
return subclass(html_content, import_stat)

0 commit comments

Comments
 (0)