selfboot
diff --git a/‎demos/yinxiang_markdown.html
Lines changed: 54 additions & 0 deletions b/‎demos/yinxiang_markdown.html
Lines changed: 54 additions & 0 deletions
diff --git a/‎demos/yinxiang_markdown.resources/5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png
662 KB b/‎demos/yinxiang_markdown.resources/5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png
662 KB
diff --git a/‎demos/yinxiang_md.html
Lines changed: 0 additions & 35 deletions b/‎demos/yinxiang_md.html
Lines changed: 0 additions & 35 deletions
diff --git a/‎examples/process_md.ipynb
Lines changed: 184 additions & 0 deletions b/‎examples/process_md.ipynb
Lines changed: 184 additions & 0 deletions
diff --git a/‎html2notion/translate/html2json.py
Lines changed: 30 additions & 21 deletions b/‎html2notion/translate/html2json.py
Lines changed: 30 additions & 21 deletions
diff --git a/‎html2notion/translate/html2json_base.py
Lines changed: 34 additions & 5 deletions b/‎html2notion/translate/html2json_base.py
Lines changed: 34 additions & 5 deletions
diff --git a/‎html2notion/translate/html2json_clipper.py
Lines changed: 2 additions & 2 deletions b/‎html2notion/translate/html2json_clipper.py
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1,184 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "md_text = \"\"\"\n",
+    "# Header\n",
+    "\n",
+    "**bold**, _ite_, ~~other~~, more...\n",
+    "`inline code` here.\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "os.print('hello')\n",
+    "```\n",
+    "\n",
+    "> Please work through this document in its entirety to better understand how OpenAI’s rate limit system works. We include code examples and possible solutions to handle common issues. It is recommended to **follow** this guidance before filling out the [Rate Limit Increase Request form](https://docs.google.com/forms/d/e/1FAIpQLSc6gSL3zfHFlL6gNIyUcjkEv29jModHGxg5_XGyr-PrE2LaHw/viewform) with details regarding how to fill it out in the last section.\n",
+    "\n",
+    "divider\n",
+    "* * *\n",
+    "\n",
+    "### image\n",
+    "local images:\n",
+    "\n",
+    "![846f62a6516227df1b4370aea3f63143.png](evernotecid://A2B91148-7880-4D85-A7CC-3A794B21D0F8/appyinxiangcom/186128/ENResource/p3511)\n",
+    "\n",
+    "web image:\n",
+    "![pic](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/notion_templage.png)\n",
+    "\n",
+    "[link](https://docs.microsoft.com/zh-tw/previous-versions/visualstudio/design-tools/expression-studio-2/cc294571(v=expression.10))\n",
+    "\n",
+    "### Table\n",
+    "\n",
+    "|header| column1 | column 2\n",
+    "|-|-|-\n",
+    "|row 1| row 1_1 | row 1_2\n",
+    "|row 2| row 2_2 **bold**, _ite_, ~~other~~, more... | row 2_3\n",
+    "\n",
+    "### list\n",
+    "\n",
+    "[Why do we have rate limits?](https://platform.openai.com/docs/guides/rate-limits/overview)\n",
+    "Rate limits are a common practice for APIs, and they're put in place for a few different reasons:\n",
+    "\n",
+    "- They help protect against abuse or misuse of the API. For example, a malicious actor could flood the API with requests in an attempt to overload it or cause disruptions in service. By setting rate limits, `OpenAI` can prevent this kind of activity.\n",
+    "- Rate limits help ensure that everyone has fair access to the API. If one person or organization makes an excessive number of requests, it could bog down the API for everyone else. By throttling the number of requests that a single user can make, OpenAI ensures that the most number of people have an opportunity to use the API without experiencing slowdowns.\n",
+    "- Rate limits can help OpenAI manage the aggregate load on its infrastructure. If requests to the API increase dramatically, it could tax the servers and cause performance issues. By setting rate limits, OpenAI can help maintain a smooth and consistent experience for all users.\n",
+    "\n",
+    "number list\n",
+    "\n",
+    "1. number list1\n",
+    "2. number list2\n",
+    "\n",
+    "## checkbox\n",
+    "\n",
+    "Three frogs\n",
+    "* [x] The first frog\n",
+    "* [ ] The second frog\n",
+    "* [ ] The third frog\n",
+    "\n",
+    "# math and graph\n",
+    "\n",
+    "Here is math\n",
+    "```math\n",
+    "e^{i\\pi} + 1 = 0\n",
+    "```\n",
+    "\n",
+    "mermaid graph:\n",
+    "\n",
+    "```mermaid\n",
+    "graph TD\n",
+    "A[Module A] -->|A1| B( Module B)\n",
+    "B --> C{Confidition C}\n",
+    "C -->|condition C1| D[Module D]\n",
+    "C -->|condition C2| E[Module E]\n",
+    "C -->|condition C3| F[Module F]\n",
+    "```\n",
+    "\n",
+    "sequenceDiagram\n",
+    "\n",
+    "```mermaid\n",
+    "sequenceDiagram\n",
+    "A->>B: Have you received a message?\n",
+    "B-->>A: Message received\n",
+    "```\n",
+    "\n",
+    "gantt\n",
+    "\n",
+    "```mermaid\n",
+    "gantt\n",
+    "title Gantt chart\n",
+    "dateFormat  YYYY-MM-DD\n",
+    "section Proj A\n",
+    "Task 1           :a1, 2018-06-06, 30d\n",
+    "Task 2     :after a1  , 20d\n",
+    "section Proj B\n",
+    "Task 3      :2018-06-12  , 12d\n",
+    "Task 4      : 24d\n",
+    "```\n",
+    "\n",
+    "### chart\n",
+    "\n",
+    "```chart\n",
+    ", budget, income, expenses, debt\n",
+    "June,5000,8000,4000,6000\n",
+    "July,3000,1000,4000,3000\n",
+    "Aug,5000,7000,6000,3000\n",
+    "Sep,7000,2000,3000,1000\n",
+    "Oct,6000,5000,4000,2000\n",
+    "Nov,4000,3000,5000,\n",
+    "\n",
+    "type: pie\n",
+    "title: 每月收益\n",
+    "x.title: Amount\n",
+    "y.title: Month\n",
+    "y.suffix: $\n",
+    "```\n",
+    "\n",
+    "```chart\n",
+    ",Budget,Income,Expenses,Debt\n",
+    "June,5000,8000,4000,6000\n",
+    "July,3000,1000,4000,3000\n",
+    "Aug,5000,7000,6000,3000\n",
+    "Sep,7000,2000,3000,1000\n",
+    "Oct,6000,5000,4000,2000\n",
+    "Nov,4000,3000,5000,\n",
+    "\n",
+    "type: line\n",
+    "title: Monthly Revenue\n",
+    "x.title: Amount\n",
+    "y.title: Month\n",
+    "y.suffix: $\n",
+    "```\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_code_blocks(md_text):\n",
+    "    code_pattern = re.compile(r'```(\\w+)?\\n(.*?)```', re.DOTALL)\n",
+    "    matches = code_pattern.findall(md_text)\n",
+    "    code_blocks = [{'language': match[0], 'code': match[1]} for match in matches]\n",
+    "    return code_blocks\n",
+    "\n",
+    "\n",
+    "code_blocks = extract_code_blocks(md_text)\n",
+    "\n",
+    "for block in code_blocks:\n",
+    "    print(f\"Language: {block['language']}\")\n",
+    "    print(f\"Code: {block['code']}\\n\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "notion",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -9,6 +9,7 @@
 from ..translate.html2json_default import Default_Type
 from ..translate.html2json_yinxiang import YinXiang_Type
 from ..translate.html2json_clipper import YinXiangClipper_Type
+from ..translate.html2json_markdown import YinXiangMarkdown_Type
 
 
 """
@@ -17,14 +18,8 @@
 <meta name="source" content="mobile.android"/>
 """
 def _is_yinxiang_export_html(html_soup):
-    exporter_version_meta = html_soup.select_one('html > head > meta[name="exporter-version"]')
     meta_source = html_soup.select_one('html > head > meta[name="source"]')
-    exporter_version_content = exporter_version_meta.get( 'content', "") if isinstance(exporter_version_meta, Tag) else ""
-
     meta_source_content = meta_source.get('content', "") if isinstance(meta_source, Tag) else ""
-    if isinstance(exporter_version_content, str) and not exporter_version_content.startswith("Evernote"):
-        return False
-
     yinxiang_source_content = ["yinxiang", "desktop", "mobile"]
     for prefix in yinxiang_source_content:
         if isinstance(meta_source_content, str) and meta_source_content.startswith(prefix):
@@ -37,28 +32,42 @@ def _is_yinxiang_export_html(html_soup):
 <meta name="source-application" content="微信" />
 """
 def _is_yinxiang_clipper_html(html_soup):
+    """Tell whether the ``source-application`` meta tag marks a clipped note.
+
+    Reads ``html > head > meta[name="source-application"]`` and returns True
+    when its ``content`` is a string that ends with "evernote" or is exactly
+    "微信"; returns False otherwise, including when the tag is missing.
+    """
-    exporter_version_meta = html_soup.select_one('html > head > meta[name="exporter-version"]')
-    exporter_version_content = exporter_version_meta.get(
-        'content', "") if isinstance(
-        exporter_version_meta, Tag) else ""
-
-    if isinstance(exporter_version_content, str) and not exporter_version_content.startswith("Evernote"):
-        return False
-    clipper_source_meta = html_soup.select_one('html > head > meta[name="source-application"]')
-    clipper_source_content = clipper_source_meta.get('content', "") if isinstance(clipper_source_meta, Tag) else ""
-    if isinstance(clipper_source_content, str) and clipper_source_content.endswith("evernote"):
+    meta_source_application = html_soup.select_one('html > head > meta[name="source-application"]')
+    # Fall back to "" when the meta tag is absent so the string checks below simply fail.
+    source_application = meta_source_application.get('content', "") if isinstance(meta_source_application, Tag) else ""
+    if isinstance(source_application, str) and source_application.endswith("evernote"):
         return True
-    if isinstance(clipper_source_content, str) and clipper_source_content in ["微信",]:
+    # "微信" (WeChat) is accepted only as an exact match, not as a suffix.
+    if isinstance(source_application, str) and source_application in ["微信",]:
+        return True
+    return False
+
+
+"""
+<meta name="content-class" content="yinxiang.markdown" />
+"""
+def _is_yinxiang_markdown_html(html_soup):
+    """Tell whether the ``content-class`` meta tag marks the page as markdown.
+
+    Returns True only when ``html > head > meta[name="content-class"]`` exists
+    and its ``content`` attribute is a string ending in "markdown".
+    """
+    class_tag = html_soup.select_one('html > head > meta[name="content-class"]')
+    if isinstance(class_tag, Tag):
+        class_value = class_tag.get('content', "")
+        if isinstance(class_value, str) and class_value.endswith("markdown"):
+            return True
     return False
 
 
 def _infer_input_type(html_content):
+    """Pick the converter type for an HTML document.
+
+    The document is parsed with BeautifulSoup's ``html.parser``. Only when the
+    ``exporter-version`` meta content starts with "Evernote" are the
+    yinxiang-specific checks run, in the order markdown, clipper, export.
+    A document that matches none of them gets ``Default_Type``.
+    """
     soup = BeautifulSoup(html_content, 'html.parser')
-    if _is_yinxiang_clipper_html(soup):
-        return YinXiangClipper_Type
-    elif _is_yinxiang_export_html(soup):
-        return YinXiang_Type
+    exporter_version_meta = soup.select_one('html > head > meta[name="exporter-version"]')
+    exporter_version_content = exporter_version_meta.get(
+        'content', "") if isinstance(
+        exporter_version_meta, Tag) else ""
+
+    # Evernote-exported document: check markdown, then clipper, then plain export.
+    if isinstance(exporter_version_content, str) and exporter_version_content.startswith("Evernote"):
+        if _is_yinxiang_markdown_html(soup):
+            return YinXiangMarkdown_Type
+        if _is_yinxiang_clipper_html(soup):
+            return YinXiangClipper_Type
+        elif _is_yinxiang_export_html(soup):
+            return YinXiang_Type
+
     return Default_Type
 
 
 
@@ -16,6 +16,7 @@ class Block(Enum):
     DIVIDER = "divider"
     TABLE = "table"
     TO_DO = "to_do"
+    EQUATION = "equation"
 
 class Html2JsonBase:
     _registry = {}
@@ -28,6 +29,17 @@ class Html2JsonBase:
         "color": str,
     }
 
+    _language = {"abap", "agda", "arduino",
+    "assembly", "bash", "basic", "bnf", "c", "c#", "c++", "clojure", "coffeescript", "coq", "css",
+    "dart", "dhall", "diff", "docker", "ebnf", "elixir", "elm", "erlang", "f#", "flow", "fortran",
+    "gherkin", "glsl", "go", "graphql", "groovy", "haskell", "html", "idris", "java", "javascript",
+    "json", "julia", "kotlin", "latex", "less", "lisp", "livescript", "llvm ir", "lua", "makefile",
+    "markdown", "markup", "matlab", "mathematica", "mermaid", "nix", "objective-c", "ocaml", "pascal",
+    "perl", "php", "plain text", "powershell", "prolog", "protobuf", "purescript", "python", "r",
+    "racket", "reason", "ruby", "rust", "sass", "scala", "scheme", "scss", "shell", "solidity", "sql",
+    "swift", "toml", "typescript", "vb.net", "verilog", "vhdl", "visual basic", "webassembly", "xml",
+    "yaml", "java/c/c++/c#"}
+    
     _color_tuple = namedtuple("Color", "name r g b")
     _notion_color = [
         _color_tuple("default", 0, 0, 0),
@@ -92,11 +104,7 @@ def extract_text_and_parents(tag: PageElement, parents=[]):
     @staticmethod
     def parse_one_style(tag_soup: Tag, text_params: dict):
         tag_name = tag_soup.name.lower()
-        style = tag_soup.get('style', "")
-        styles = {}
-        if str and isinstance(style, str):
-            styles = {rule.split(':')[0].strip(): rule.split(':')[1].strip() for rule in style.split(';') if rule}
-
+        styles = Html2JsonBase.get_tag_style(tag_soup)
         if Html2JsonBase.is_bold(tag_name, styles):
             text_params["bold"] = True
         if Html2JsonBase.is_italic(tag_name, styles):
@@ -456,6 +464,27 @@ def convert_table(self, soup):
         }
         return table_obj
 
+    @staticmethod
+    def get_tag_style(tag_soup):
+        """Parse a tag's inline ``style`` attribute into a dict.
+
+        Declarations are split on ";" and each one is split at its first ":"
+        into a stripped property name and a stripped, lower-cased value.
+        Fragments without a ":" or without a property name are ignored.
+
+        Limitation: a value that itself contains ";" (for example
+        ``background-image: url('data:image/png;base64...')``) is cut at that
+        ";" and cannot be recovered with this method.
+
+        Args:
+            tag_soup: object exposing ``get('style', default)``.
+
+        Returns:
+            dict mapping property name to value; empty when the ``style``
+            attribute is missing, empty, or not a string.
+        """
+        style = tag_soup.get('style', "")
+        if not style or not isinstance(style, str):
+            return {}
+        styles = {}
+        for rule in style.split(';'):
+            # partition keeps everything after the first ":" so a value such as
+            # "url(http://host/a.png)" is not truncated at its own colon.
+            name, sep, value = rule.partition(':')
+            name = name.strip()
+            if sep and name:
+                styles[name] = value.strip().lower()
+        return styles
+
+    @staticmethod
+    def get_valid_language(language):
+        """Return ``language`` if it is in ``Html2JsonBase._language``, else "plain text"."""
+        supported = Html2JsonBase._language
+        return language if language in supported else "plain text"
+    
     @classmethod
     def register(cls, input_type, subclass):
         cls._registry[input_type] = subclass
 
@@ -6,7 +6,7 @@
 YinXiangClipper_Type = "clipper.yinxiang"
 
 
-class Html2JsonYinXiang(Html2JsonBase):
+class Html2JsonClipper(Html2JsonBase):
     input_type = YinXiangClipper_Type
 
     def __init__(self, html_content, import_stat):
@@ -158,4 +158,4 @@ def _check_is_block(self, element):
         return False
 
 
-Html2JsonBase.register(YinXiangClipper_Type, Html2JsonYinXiang)
+Html2JsonBase.register(YinXiangClipper_Type, Html2JsonClipper)