def main(
    input_dir,
    output_dir="output",
    target_formats=(".docx", ".xlsx", ".pdf"),
):
    """Convert every supported document in a folder to a Markdown file.

    Each entry directly inside ``input_dir`` whose suffix appears in
    ``target_formats`` is passed to ``MarkItDown.convert`` and the resulting
    Markdown is written to ``<output_dir>/<stem>.md`` as UTF-8. A file that
    fails to convert is reported on stdout and skipped; the remaining files
    are still processed.

    Args:
        input_dir: Folder to scan. Only its direct children are considered
            (the ``"*"`` glob pattern is not recursive).
        output_dir: Folder that receives the ``.md`` files. It is created,
            including missing parents, if it does not exist yet.
        target_formats: Suffixes to convert, each including its leading dot.
            The comparison against ``Path.suffix`` is case-sensitive.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    md = MarkItDown()

    for file_path in input_path.glob("*"):
        if file_path.suffix not in target_formats:
            continue

        try:
            result = md.convert(file_path)
        except Exception as e:
            # Deliberately broad: one unreadable document should not abort
            # the whole batch, so report it and move on.
            print(f"✗ Error converting {file_path.name}: {e}")
            # Skip the write step. Falling through would use `result` while
            # it is unbound (first file) or still holds the previous file's
            # conversion, producing a wrong .md and a false success message.
            continue

        # NOTE: the output name is derived from the stem only, so two inputs
        # that share a stem (e.g. report.docx and report.pdf) write to the
        # same .md file.
        output_file = output_path / f"{file_path.stem}.md"
        output_file.write_text(result.markdown, encoding="utf-8")
        print(f"✓ Converted {file_path.name} → {output_file.name}")
file mode 100644 index 0000000000..67571011ee Binary files /dev/null and b/python-markitdown/data/real-python.png differ diff --git a/python-markitdown/data/sample_CSV.csv b/python-markitdown/data/sample_CSV.csv new file mode 100644 index 0000000000..70a9ffc7fd --- /dev/null +++ b/python-markitdown/data/sample_CSV.csv @@ -0,0 +1,9 @@ +First Name,Last Name,Department,Position,Start Date +Alice,Johnson,Marketing,Marketing Coordinator,1/15/2022 +Bob,Williams,Human Resources,HR Generalist,6/1/2021 +Carol,Davis,Engineering,Software Engineer,3/20/2023 +David,Brown,Sales,Sales Representative,9/10/2022 +Eve,Miller,Finance,Financial Analyst,11/5/2021 +Frank,Garcia,Customer Service,Customer Support Specialist,7/1/2023 +Grace,Rodriguez,Research & Development,Research Scientist,4/25/2022 +Henry,Martinez,Operations,Operations Manager,2/14/2021 diff --git a/python-markitdown/data/sample_DOCX.docx b/python-markitdown/data/sample_DOCX.docx new file mode 100644 index 0000000000..547d9d3981 Binary files /dev/null and b/python-markitdown/data/sample_DOCX.docx differ diff --git a/python-markitdown/data/sample_PDF.pdf b/python-markitdown/data/sample_PDF.pdf new file mode 100644 index 0000000000..16ee9c2592 Binary files /dev/null and b/python-markitdown/data/sample_PDF.pdf differ diff --git a/python-markitdown/data/sample_XLSX.xlsx b/python-markitdown/data/sample_XLSX.xlsx new file mode 100644 index 0000000000..d319e5414c Binary files /dev/null and b/python-markitdown/data/sample_XLSX.xlsx differ diff --git a/python-markitdown/data/zen-of-python.png b/python-markitdown/data/zen-of-python.png new file mode 100644 index 0000000000..c72a9d368d Binary files /dev/null and b/python-markitdown/data/zen-of-python.png differ diff --git a/python-markitdown/img_description.py b/python-markitdown/img_description.py new file mode 100644 index 0000000000..4d90dd9c59 --- /dev/null +++ b/python-markitdown/img_description.py @@ -0,0 +1,15 @@ +import os + +from markitdown import MarkItDown +from openai 
# Read the OpenAI key from the environment; os.getenv returns None when the
# variable is not set.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

# Options handed to MarkItDown: the OpenAI client, the model name, and a
# custom prompt asking for the image text as Markdown.
llm_options = {
    "llm_client": client,
    "llm_model": "gpt-4o",
    "llm_prompt": "Extract text from image with OCR and return Markdown.",
}
md = MarkItDown(**llm_options)

# Convert the sample image and print the Markdown carried by the result.
result = md.convert("./data/zen-of-python.png")
print(result.markdown)