# img2md_ollama.py
#
# OCR every PNG/JPG/JPEG image in a folder with a local Ollama vision model
# and write the recognized text into a single Markdown file.
import base64
import glob
import os
import sys
import argparse
import time
from dotenv import load_dotenv
from ollama import chat
from ollama import ChatResponse
from typing import Dict, List
from PIL import Image
from natsort import natsorted
import requests
def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode and read in full; the base64-encoded
    bytes are then decoded as UTF-8 so that a ``str`` is returned.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def ocr_by_ollama(image_path, *, model=None, url=None, prompt=None, timeout=None):
    """Run OCR on one image through an Ollama vision model.

    The image is base64-encoded and sent in a single non-streaming POST to
    Ollama's generate endpoint; the ``response`` field of the JSON reply is
    returned.

    Args:
        image_path: Path of the image file to recognize.
        model: Ollama model name. Defaults to
            ``"llama3.2-vision:11b-instruct-q4_K_M"``.
        url: Generate endpoint. Defaults to
            ``"http://localhost:11434/api/generate"``.
        prompt: Instruction sent with the image. Defaults to the built-in
            Chinese OCR prompt.
        timeout: Passed straight to ``requests.post``; ``None`` (the default)
            waits indefinitely, which matches the previous behaviour.

    Returns:
        The text produced by the model, or a string starting with
        ``"Error occurred: "`` when the HTTP request fails or the reply is
        not the expected JSON. Callers that must tell the two apart have to
        check for that prefix.

    Raises:
        OSError: If the image file cannot be read (the read happens before
            the guarded request section).
    """
    if model is None:
        model = "llama3.2-vision:11b-instruct-q4_K_M"
    if url is None:
        # Ollama API endpoint
        url = "http://localhost:11434/api/generate"
    if prompt is None:
        prompt = "请OCR图片中的文字,注意识别文字中混排的emoji或其他日语常用的符号,表格中的文字也需要识别。不需要总结图片内容,只需要返回OCR的结果"

    # Convert the image to base64
    image_base64 = image_to_base64(image_path)

    # Build the request payload
    data = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "images": [image_base64],
    }

    try:
        # Send the POST request to Ollama
        response = requests.post(url, json=data, timeout=timeout)
        response.raise_for_status()
        # Parse the reply
        result = response.json()
    except requests.exceptions.RequestException as e:
        return f"Error occurred: {str(e)}"
    except ValueError as e:
        # A body that is not valid JSON may surface as a plain ValueError
        # (rather than a RequestException) on older requests releases.
        return f"Error occurred: {str(e)}"

    try:
        return result['response']
    except (KeyError, TypeError):
        # The reply was valid JSON but not the expected object; report it in
        # the same style as the other failures instead of raising.
        return "Error occurred: Ollama reply has no 'response' field"
if __name__ == '__main__':
    # Script entry point: OCR every PNG/JPG/JPEG image found directly inside
    # the given folder (in natural sort order) and concatenate the results
    # into one Markdown file named "<img_folder>_<unix timestamp>.md".
    parser = argparse.ArgumentParser(description='Convert images(PNG or JPG) to ONE markdown file.')
    parser.add_argument('img_folder', help='Folder for images.')
    parser.add_argument('--trans_to', help='Translate the content to the specified language.')
    args = parser.parse_args()

    img_folder = args.img_folder
    trans_to = args.trans_to

    # isdir (not exists): a path that points at a regular file is not usable.
    if not os.path.isdir(img_folder):
        print(f'The specified folder does not exist: {img_folder}')
        # parser.prog is the real script name as invoked.
        print(f'Usage: python {parser.prog} <img_folder>')
        print(f'e.g.: "python {parser.prog} ." for current folder')
        sys.exit(1)

    # Escape the folder part so names containing glob metacharacters
    # ("[", "]", "*", "?") are matched literally.
    escaped_folder = glob.escape(img_folder)
    images = natsorted(
        glob.glob(f'{escaped_folder}/*.png')
        + glob.glob(f'{escaped_folder}/*.jpg')
        + glob.glob(f'{escaped_folder}/*.jpeg')
    )
    total = len(images)
    print(f'{total} images to process...')

    output_file = f'{img_folder}_{int(time.time())}.md'

    try:
        # Explicit UTF-8: the OCR text may contain CJK characters and emoji
        # that the platform's default encoding cannot represent.
        with open(output_file, 'w', encoding='utf-8') as f:
            for position, img in enumerate(images, start=1):
                print(f'Processing {position} / {total}')
                text_content = ocr_by_ollama(img)
                if text_content:
                    f.write(text_content)
                    f.write('\n\n')
    except Exception as e:
        print(f"Error occurred while saving the Markdown file: {e}")
        sys.exit(1)

    print(f'Processing complete, saved to {output_file}')

    if trans_to:
        # This script has no translation step and never produces a translated
        # file, so say so instead of referencing a path that does not exist.
        print(f'Translation to {trans_to} is not implemented in this script; no translated doc was saved.')