def convert_from_path(
    pdf_path,
    dpi=200,
    output_folder=None,
    first_page=None,
    last_page=None,
    fmt="ppm",
    jpegopt=None,
    thread_count=None,
    userpw=None,
    use_cropbox=False,
    strict=False,
    transparent=False,
    single_file=False,
    output_file=uuid_generator(),
    poppler_path=None,
    grayscale=False,
    size=None,
    paths_only=False,
    use_pdftocairo=False,
    timeout=None,
    hide_annotations=False,
):
    """
    Description: Convert a PDF file to images, one poppler process per page,
    with the pages converted concurrently and a tqdm progress bar.
    Parameters:
        pdf_path -> Path to the PDF that you want to convert
        dpi -> Image quality in DPI (default 200)
        output_folder -> Write the resulting images to a folder (instead of directly in memory)
        first_page -> First page to process
        last_page -> Last page to process before stopping
        fmt -> Output image format
        jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
        thread_count -> How many threads we are allowed to spawn for processing
                        (defaults to os.cpu_count(), capped at the number of pages)
        userpw -> PDF's password
        use_cropbox -> Use cropbox instead of mediabox
        strict -> When a Syntax Error is thrown, it will be raised as an Exception
        transparent -> Output with a transparent background instead of a white one.
        single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
        output_file -> What is the output filename or generator
        poppler_path -> Path to look for poppler binaries
        grayscale -> Output grayscale image(s)
        size -> Size of the resulting image(s), uses the Pillow (width, height) standard
        paths_only -> Don't load image(s), return paths instead (requires output_folder)
        use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
        timeout -> Raise PDFPopplerTimeoutError after the given time
        hide_annotations -> Pass -hide-annotations to pdftoppm (not supported with pdftocairo)
    Returns:
        A list of images (or of file paths when paths_only is set), in page order.
    Raises:
        PDFPopplerTimeoutError -> a poppler process exceeded `timeout`
        PDFSyntaxError -> poppler reported a syntax error and `strict` is set
        NotImplementedError -> `hide_annotations` requested while pdftocairo is used
    """

    if use_pdftocairo and fmt == "ppm":
        fmt = "png"

    # We make sure that if passed arguments are Path objects, they're converted to strings
    if isinstance(pdf_path, pathlib.PurePath):
        pdf_path = pdf_path.as_posix()

    if isinstance(output_folder, pathlib.PurePath):
        output_folder = output_folder.as_posix()

    if isinstance(poppler_path, pathlib.PurePath):
        poppler_path = poppler_path.as_posix()

    page_count = pdfinfo_from_path(pdf_path, userpw, poppler_path=poppler_path)["Pages"]

    # We start by getting the output format, the buffer processing function
    # and whether the format itself requires pdftocairo
    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
        fmt, grayscale
    )

    # We use pdftocairo if the format requires it OR we need a transparent output
    use_pdfcairo = (
        use_pdftocairo
        or use_pdfcairo_format
        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
    )

    poppler_version_major, poppler_version_minor = _get_poppler_version(
        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
    )

    # Options that the detected poppler version is too old for are dropped silently
    if poppler_version_major == 0 and poppler_version_minor <= 57:
        jpegopt = None

    if poppler_version_major == 0 and poppler_version_minor <= 83:
        hide_annotations = False

    # If output_file isn't a generator, it will be turned into one
    if not isinstance(output_file, (types.GeneratorType, ThreadSafeGenerator)):
        if single_file:
            output_file = iter([output_file])
        else:
            output_file = counter_generator(output_file)

    # os.cpu_count() may return None when the count cannot be determined,
    # so fall back to a single worker instead of comparing None with an int.
    if thread_count is None:
        thread_count = os.cpu_count() or 1

    if thread_count < 1:
        thread_count = 1

    if first_page is None or first_page < 1:
        first_page = 1

    if last_page is None or last_page > page_count:
        last_page = page_count

    if first_page > last_page:
        return []

    # Fail once, up front, rather than once per page inside the worker threads
    if use_pdfcairo and hide_annotations:
        raise NotImplementedError("Hide annotations flag not implemented in pdftocairo.")

    # Recalculate page count based on first and last page
    page_count = last_page - first_page + 1

    if thread_count > page_count:
        thread_count = page_count

    # Add poppler path to LD_LIBRARY_PATH
    env = os.environ.copy()
    if poppler_path is not None:
        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")

    startupinfo = None
    if platform.system() == "Windows":
        # this startupinfo structure prevents a console window from popping up on Windows
        startupinfo = subprocess.STARTUPINFO()
        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW

    converter = _get_command_path(
        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path
    )

    auto_temp_dir = False
    if output_folder is None and use_pdfcairo:
        auto_temp_dir = True
        output_folder = tempfile.mkdtemp()

    def process_single_page(current_page: int, thread_output_file: str) -> List:
        """Run one poppler process for a single page and collect its output."""
        args = [converter] + _build_command(
            ["-r", str(dpi), pdf_path],
            output_folder,
            current_page,
            current_page,
            parsed_fmt,
            jpegopt,
            thread_output_file,
            userpw,
            use_cropbox,
            transparent,
            single_file,
            grayscale,
            size,
            hide_annotations,
        )

        proc = Popen(args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo)

        try:
            data, err = proc.communicate(timeout=timeout)
        except TimeoutExpired:
            proc.kill()
            # Reap the killed process so its pipes are drained and closed
            proc.communicate()
            raise PDFPopplerTimeoutError("Run poppler poppler timeout.")

        if b"Syntax Error" in err and strict:
            raise PDFSyntaxError(err.decode("utf8", "ignore"))

        if output_folder is not None:
            # The per-page output name doubles as the prefix to look up on disk
            return _load_from_output_folder(
                output_folder,
                thread_output_file,
                final_extension,
                paths_only,
                in_memory=auto_temp_dir,
            )
        return parse_buffer_func(data)

    # Pair every page with its own output name. zip() stops at the shorter
    # input, so only the pages that actually received a name are converted
    # (e.g. a single name when single_file is used with a string output_file).
    page_numbers = range(first_page, first_page + page_count)
    jobs = list(zip(page_numbers, output_file))

    try:
        images_by_page = {}
        with tqdm(
            total=len(jobs), desc="Converting pdf document to jpg images"
        ) as pbar:
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=thread_count
            ) as executor:
                futures = {
                    executor.submit(process_single_page, page, name): page
                    for page, name in jobs
                }
                for future in concurrent.futures.as_completed(futures):
                    images_by_page[futures[future]] = future.result()
                    pbar.update(1)

        # Futures complete in arbitrary order; restore page order here
        images = list(
            chain.from_iterable(images_by_page[page] for page, _ in jobs)
        )
    finally:
        # Remove the temporary folder even when a page conversion failed
        if auto_temp_dir:
            shutil.rmtree(output_folder)

    return images
def _build_command(
    args,
    output_folder,
    first_page,
    last_page,
    fmt,
    jpegopt,
    output_file,
    userpw,
    use_cropbox,
    transparent,
    single_file,
    grayscale,
    size,
    hide_annotations,
):
    """
    Description: Append the poppler command-line options for one conversion to `args`
    Parameters:
        args -> List of arguments to extend; it is modified in place and returned
        output_folder -> When not None, the output prefix `output_folder/output_file` is added
        first_page -> Value for -f (skipped when None)
        last_page -> Value for -l (skipped when None)
        fmt -> Parsed output format; anything other than pgm/ppm is passed as `-<fmt>`
        jpegopt -> Dict of jpeg options, only used for the jpeg/jpg formats
        output_file -> Output file name (prefix) inside output_folder
        userpw -> Value for -upw (skipped when None)
        use_cropbox -> Adds -cropbox
        transparent -> Adds -transp for the formats listed in TRANSPARENT_FILE_TYPES
        single_file -> Adds -singlefile
        grayscale -> Adds -gray
        size -> None, a number, a 1-tuple or a (width, height) tuple
        hide_annotations -> Adds -hide-annotations
    Raises:
        ValueError -> `size` is none of the accepted shapes
    """
    if use_cropbox:
        args.append("-cropbox")

    if hide_annotations:
        args.append("-hide-annotations")

    if transparent and fmt in TRANSPARENT_FILE_TYPES:
        args.append("-transp")

    if first_page is not None:
        args.extend(["-f", str(first_page)])

    if last_page is not None:
        args.extend(["-l", str(last_page)])

    if fmt not in ["pgm", "ppm"]:
        args.append("-" + fmt)

    if fmt in ["jpeg", "jpg"] and jpegopt:
        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])

    if single_file:
        args.append("-singlefile")

    if output_folder is not None:
        args.append(os.path.join(output_folder, output_file))

    if userpw is not None:
        args.extend(["-upw", userpw])

    if grayscale:
        args.append("-gray")

    if size is None:
        pass
    elif isinstance(size, tuple) and len(size) == 2:
        # A missing dimension is passed as -1 on the command line
        for flag, dimension in zip(("-scale-to-x", "-scale-to-y"), size):
            args.extend([flag, str(-1 if dimension is None else int(dimension))])
    elif isinstance(size, tuple) and len(size) == 1:
        args.extend(["-scale-to", str(int(size[0]))])
    elif isinstance(size, (int, float)):
        args.extend(["-scale-to", str(int(size))])
    else:
        # Report the offending value; the placeholder used to be left unformatted
        raise ValueError("Size {} is not a tuple or an integer".format(size))

    return args
stdout=PIPE, stderr=PIPE) + + try: + data, err = proc.communicate(timeout=timeout) + except TimeoutExpired: + proc.kill() + outs, errs = proc.communicate() + raise PDFPopplerTimeoutError("Run poppler poppler timeout.") + + try: + # TODO: Make this more robust + version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".") + return int(version[0]), int(version[1]) + except: + # Lowest version that includes pdftocairo (2011) + return 0, 17 + + +def pdfinfo_from_path( + pdf_path, userpw=None, poppler_path=None, rawdates=False, timeout=None +): + try: + command = [_get_command_path("pdfinfo", poppler_path), pdf_path] + + if userpw is not None: + command.extend(["-upw", userpw]) + + if rawdates: + command.extend(["-rawdates"]) + + # Add poppler path to LD_LIBRARY_PATH + env = os.environ.copy() + if poppler_path is not None: + env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "") + proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE) + + try: + out, err = proc.communicate(timeout=timeout) + except TimeoutExpired: + proc.kill() + outs, errs = proc.communicate() + raise PDFPopplerTimeoutError("Run poppler poppler timeout.") + + d = {} + for field in out.decode("utf8", "ignore").split("\n"): + sf = field.split(":") + key, value = sf[0], ":".join(sf[1:]) + if key != "": + d[key] = ( + int(value.strip()) + if key in PDFINFO_CONVERT_TO_INT + else value.strip() + ) + + if "Pages" not in d: + raise ValueError + + return d + + except OSError: + raise PDFInfoNotInstalledError( + "Unable to get page count. Is poppler installed and in PATH?" 
def pdfinfo_from_bytes(
    pdf_file, userpw=None, poppler_path=None, rawdates=False, timeout=None
):
    """
    Description: Run pdfinfo on a PDF that is held in memory
    Parameters:
        pdf_file -> Bytes representing the PDF file
        userpw -> PDF's password
        poppler_path -> Path to look for poppler binaries
        rawdates -> Pass -rawdates to pdfinfo
        timeout -> Raise PDFPopplerTimeoutError after the given time
    Returns:
        Whatever pdfinfo_from_path returns for the temporary copy of the PDF
    """
    # pdfinfo works on files, so the bytes are written to a temporary file first
    fh, temp_filename = tempfile.mkstemp()
    try:
        with open(temp_filename, "wb") as f:
            f.write(pdf_file)
            f.flush()
        return pdfinfo_from_path(
            temp_filename,
            userpw=userpw,
            poppler_path=poppler_path,
            rawdates=rawdates,
            # Previously dropped, which made the caller's timeout a no-op
            timeout=timeout,
        )
    finally:
        # mkstemp hands back an open descriptor that we own; close it and
        # delete the file whether or not pdfinfo succeeded
        os.close(fh)
        os.remove(temp_filename)
args.output_dir if args.output_dir else book_name - for i, page in enumerate(pages): + for i, page_path in tqdm(enumerate(page_paths), desc="Uploading", total=len(page_paths)): number_str = "_{num:0{width}}.jpg".format(num=i+1, width=n) - output_name = os.path.join("tmp_pdf_to_jpg", book_name + number_str) - page.save(output_name, 'JPEG') - gcs_dir = args.output_dir if args.output_dir else book_name - blob = bucket.blob(os.path.join(gcs_dir, book_name + number_str)) - print("Uploading file to Google Cloud bucket: {}".format(output_name)) - blob.upload_from_filename(output_name) + gcs_path = join(gcs_dir, book_name + number_str) + blob = bucket.blob(gcs_path) + blob.upload_from_filename(page_path) if __name__ == "__main__": @@ -40,6 +44,7 @@ def convert_to_jpg(args): os.mkdir("tmp_pdf_to_jpg") try: convert_to_jpg(args) - except: - pass - shutil.rmtree("tmp_pdf_to_jpg") \ No newline at end of file + except Exception as e: + raise e + finally: + shutil.rmtree("tmp_pdf_to_jpg") diff --git a/requirements.txt b/requirements.txt index b0a6e7e..354ab22 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ google-cloud-vision google-cloud-storage tqdm +pdf2image diff --git a/transcribe.sh b/transcribe.sh index 6acada2..7c59de8 100644 --- a/transcribe.sh +++ b/transcribe.sh @@ -1,39 +1,16 @@ # Data from https://drive.google.com/drive/u/3/folders/14x2_3ZS-XguLldWWZW5uGBYViX8kJIPw -# Make sure to use your own bucket, you will not have access to this one -bucket_name="ocr-tutorial-20220901" +# Install necessary requirements +pip -q -q install -r requirements.txt --upgrade +# We need to install poppler for converting pdfs to images +conda install -c conda-forge poppler -cloud_paths=( - "1-Jinpa-Thupten_2004_Theg-pa-chen-po-blo-sbyong-brgya-rtsa_LOTC-1 (1).pdf" - "2-Jinpa_2005_Bka-gdams-glegs-bam-las-btus-pa_i-chos-skor_LOTC-2.pdf" - "3-Jinpa_2005_rdzogs-pa-chen-po-sems-nyid-ngal-gso_i-_grel-pa-shing-rta-chen-po_LOTC-3.pdf" - 
"4-Jinpa_2004_dpal-sakya-pa_i-lam-_bras-kyi-chos-skor-gces-btus_LOTC-4.pdf" - "5-Jinpa_2008_mnyam-med-bka_-brgyud-lugs-kyi-phyag-rgya-chen-po-dang-_brel-ba_i-chos-skor_LOTC-5.pdf" - "6-Jinpa_2005_dpal-dge-ldan-pa_i-lam-rim-dang-snyan-brgyud-kyi-chos-skor_LOTC-6.pdf" - "7-Jinpa-JO-NANG-CHOS-SKOR-RI-CHOS-NGES-DON-RGYA-MTSO-LoTC-VOL-7.pdf" - "8-Jinpa-ZHI-BYED-DANG-GCOD-YUL-SOGS-GDAMS-NGAG-THOR-BU_I-CHOS-SKOR-LoTC.pdf" - "9-Jinpa-gyung-drung-bon-gyi-mdo-sngags-sems-gsum-gyi-gzhung.pdf" - "10-Jinpa-bstan-pa-la-_jug-pa_i-rim-pa-son-pa_i-gzhung-gces-btus.pdf" - "11-Jinpa-RGYAL-SRAS-KYI-SPYOD-PA-LA-_JUG-PA_I-CHOS-SKOR.pdf" - "13-Jinpa-RNAL-_BYOR-BLA-MED-KYI-SGRUB-THABS-DANG-BSKYED-RIM-GYI-KHRID-YIG-KHAG-LoTC-VOL-13.pdf" - "14-Jinpa-DUS-_KHOR-_GREL-CHEN-DRI-MED-_OD-KYI-RGYAN-LoTC-VOL-14.pdf" - "15-Jinpa-RDZOGS-RIM-RIM-LNGA-GSAL-SGRON-LoTC-VOL-15-Scan.pdf" - "17-Jinpa-2007_bde-gshegs-snying-po-rigs-kyi-chos-skor_Institute-for-Tibetan-Classics.pdf" - "19-Jinpa-DBU-MA-DGONGS-PA-RAB-GSAL-LoTC-VOL-19-Scan.pdf" - "20-Jinpa-DPAL-SA-SKYA-PA_I-TSAD-MA-RIG-PA_I-GZHUNG-GCES-BTUS-LoTC-VOL-20-Scan.pdf" - "21-Jinpa-DPAL-DGE-LDAN-PA_I-TSAD-MA-RIG-PA_I-GZHUNG-GCES-BTUS-LoTC-VOL-21-Scan.pdf" - "23-Jinpa-CHOS-MNGON-PA-MDZOD-KYI-_GREL-PA-MNGON-PA_I-RGYAN-LoTC-VOL-23-Scan.pdf" - "24-Jinpa-GRUB-MTHA-THUB-BSTAN-LHUN-PO_I-MDZES-RGYAN-LoTC-VOL-24-Scan.pdf" - "25-Jinpa-GRUB-MTHA-SHEL-GYI-ME-LONG-LoTC-VOL-25-Scan.pdf" - "27-Jinpa-LEGS-BSHAD-LUGS-KYI-BSLAB-BYA_I-DPE-TSOGS-LoTC-VOL-27-Scan.pdf" - "28-Jingpa-BDE-GSHEGS-SNYING-PO-RIGS-KYI-CHOS-SKOR-LoTC-VOL-28-Scan.pdf" - "29-Jinpa-RTSIS-KYI-MAN-NGAG-NYIN-BYED-SNANG-BA_I-RANAM-_GREL-GSER-GYI-SHING-RTA-LoTC-VOL-29-Scan.pdf" - "31-Jingpa-BOD-KYI-LHA-MO_I-KHRAB-GZHUNG-CHE-KHAG-GCES-BTUS-LoTC-VOL-31-Scan.pdf" - "32-Jinpa-MKHAS-PA-LDE_US-MDZAD-PA_I-RGYA-BOD-KYI-CHOS-_BYUNG-RGYAS-PA-LoTC-VOL-32-Scan.pdf" -) +# Make sure to use your own bucket, you will not have access to this one +cloud_bucket_name="ocr-tutorial-20220901" -for cloud_path in ${cloud_paths[*]} 
+for local_path in data/*.pdf do - python transcribe_bo-fo.py --output-dir results --filepath "$cloud_path" --bucket "$bucket_name" + python pdf_to_jpg_dir.py --filepath "$local_path" --bucket $cloud_bucket_name + python transcribe_image_dir.py --filepath "${local_path#*/}" --bucket $cloud_bucket_name --output-dir results done diff --git a/transcribe_image_dir.py b/transcribe_image_dir.py index 416cc74..a0390c2 100644 --- a/transcribe_image_dir.py +++ b/transcribe_image_dir.py @@ -1,6 +1,8 @@ import argparse import os +from tqdm import trange + from google.cloud import storage from google.cloud import vision @@ -14,31 +16,47 @@ def detect_document_tibetan(args): """Detects text in all images in a folder located in Google Cloud Storage. """ book_name = os.path.split(args.filepath)[-1] - + folder_name = os.path.splitext(book_name)[0] + storage_client = storage.Client() bucket = storage_client.get_bucket(args.bucket) - blob_list = sorted([blob.name for blob in bucket.list_blobs(prefix=args.filepath+"/")]) + blob_list = sorted([blob.name for blob in bucket.list_blobs(prefix=folder_name+"/")]) - outputs = [] + # Setup calls to transcription + client = vision.ImageAnnotatorClient() + image_context = vision.ImageContext(language_hints=["bo"]) + feature = vision.Feature( + type=vision.Feature.Type.DOCUMENT_TEXT_DETECTION, + model="builtin/weekly") - for name in blob_list: + def create_annotate_image_request(name: str) -> vision.AnnotateImageRequest: + # Build Image URI in GCS gcs_uri = "gs://" + args.bucket + "/" + name - print("Running document text detection on: {}".format(gcs_uri)) - client = vision.ImageAnnotatorClient() - image = vision.types.Image() - image_context = vision.types.ImageContext(language_hints=["bo"]) + image = vision.Image() image.source.image_uri = gcs_uri - response = client.document_text_detection(image=image, image_context=image_context) - text = response.full_text_annotation.text - outputs.append(text) + # Create image annotation request + 
annotate_image_request = vision.AnnotateImageRequest( + image=image, image_context=image_context, features=[feature] + ) + + return annotate_image_request + + annotate_image_requests = list(map(create_annotate_image_request, blob_list)) + + batch_size = 10 + outputs = [] + for i in trange(0, len(annotate_image_requests), batch_size, desc="Running document text detection"): + batch_annotate_image_requests = annotate_image_requests[i: i+batch_size] + response = client.batch_annotate_images(requests=batch_annotate_image_requests) + outputs.extend([r.full_text_annotation.text for r in response.responses]) output_name = os.path.join(args.output_dir, book_name+".txt") if args.output_dir else book_name+".txt" print("Writing output file to: {}".format(output_name)) - with open(output_name, "w") as f: + with open(output_name, "w", encoding="utf-8") as f: f.write("".join(outputs)) - + if __name__ == "__main__": args = parser.parse_args()