diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6e2b19b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+tmp_pdf_to_jpg
diff --git a/README.md b/README.md
index ad81c42..210d93f 100644
--- a/README.md
+++ b/README.md
@@ -82,7 +82,6 @@ In either of these scripts, you can optionally specify an output directory for t
 ```bash
 export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials
 
-pip install -r requirements.txt --upgrade
 ./transcribe.sh
 ```
 
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..cec9082
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,3 @@
+*
+
+!.gitignore
diff --git a/pdf2image_with_logging.py b/pdf2image_with_logging.py
new file mode 100644
index 0000000..0dbdf41
--- /dev/null
+++ b/pdf2image_with_logging.py
@@ -0,0 +1,546 @@
+"""
+    pdf2image is a light wrapper for the poppler-utils tools that can convert your
+    PDFs into Pillow images.
+"""
+from typing import List
+
+import os
+import platform
+import tempfile
+import types
+import shutil
+import pathlib
+import subprocess
+from subprocess import Popen, PIPE, TimeoutExpired
+import concurrent.futures
+from PIL import Image
+
+from itertools import chain
+
+from tqdm import tqdm
+
+from pdf2image.generators import uuid_generator, counter_generator, ThreadSafeGenerator
+
+from pdf2image.parsers import (
+    parse_buffer_to_pgm,
+    parse_buffer_to_ppm,
+    parse_buffer_to_jpeg,
+    parse_buffer_to_png,
+)
+
+from pdf2image.exceptions import (
+    PopplerNotInstalledError,
+    PDFInfoNotInstalledError,
+    PDFPageCountError,
+    PDFSyntaxError,
+    PDFPopplerTimeoutError,
+)
+
+TRANSPARENT_FILE_TYPES = ["png", "tiff"]
+PDFINFO_CONVERT_TO_INT = ["Pages"]
+
+
+def convert_from_path(
+    pdf_path,
+    dpi=200,
+    output_folder=None,
+    first_page=None,
+    last_page=None,
+    fmt="ppm",
+    jpegopt=None,
+    thread_count=None,
+    userpw=None,
+    use_cropbox=False,
+    strict=False,
+    transparent=False,
+    single_file=False,
+    output_file=uuid_generator(),
+    poppler_path=None,
+    grayscale=False,
+    size=None,
+    paths_only=False,
+    use_pdftocairo=False,
+    timeout=None,
+    hide_annotations=False,
+):
+    """
+        Description: Convert PDF to Image, raising when one of the error conditions is reached
+        Parameters:
+            pdf_path -> Path to the PDF that you want to convert
+            dpi -> Image quality in DPI (default 200)
+            output_folder -> Write the resulting images to a folder (instead of directly in memory)
+            first_page -> First page to process
+            last_page -> Last page to process before stopping
+            fmt -> Output image format
+            jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
+            thread_count -> How many threads we are allowed to spawn (default: os.cpu_count())
+            userpw -> PDF's password
+            use_cropbox -> Use cropbox instead of mediabox
+            strict -> When a Syntax Error is thrown, it will be raised as an Exception
+            transparent -> Output with a transparent background instead of a white one.
+            single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
+            output_file -> What is the output filename or generator
+            poppler_path -> Path to look for poppler binaries
+            grayscale -> Output grayscale image(s)
+            size -> Size of the resulting image(s), uses the Pillow (width, height) standard
+            paths_only -> Don't load image(s), return paths instead (requires output_folder)
+            use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
+            timeout -> Raise PDFPopplerTimeoutError after the given time
+            hide_annotations -> Pass -hide-annotations to pdftoppm (not implemented for pdftocairo)
+    """
+
+    if use_pdftocairo and fmt == "ppm":
+        fmt = "png"
+
+    # We make sure that if passed arguments are Path objects, they're converted to strings
+    if isinstance(pdf_path, pathlib.PurePath):
+        pdf_path = pdf_path.as_posix()
+
+    if isinstance(output_folder, pathlib.PurePath):
+        output_folder = output_folder.as_posix()
+
+    if isinstance(poppler_path, pathlib.PurePath):
+        poppler_path = poppler_path.as_posix()
+
+    page_count = pdfinfo_from_path(pdf_path, userpw, poppler_path=poppler_path)["Pages"]
+    # We start by getting the output format, the buffer processing function and if we need pdftocairo
+    parsed_fmt, final_extension, parse_buffer_func, use_pdfcairo_format = _parse_format(
+        fmt, grayscale
+    )
+
+    # We use pdftocairo if the format requires it OR we need a transparent output
+    use_pdfcairo = (
+        use_pdftocairo
+        or use_pdfcairo_format
+        or (transparent and parsed_fmt in TRANSPARENT_FILE_TYPES)
+    )
+
+    poppler_version_major, poppler_version_minor = _get_poppler_version(
+        "pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
+    )
+
+    if poppler_version_major == 0 and poppler_version_minor <= 57:
+        jpegopt = None
+
+    if poppler_version_major == 0 and poppler_version_minor <= 83:
+        hide_annotations = False
+
+    # If output_file isn't a generator, it will be turned into one
+    if not isinstance(output_file, types.GeneratorType) and not isinstance(
+        output_file, ThreadSafeGenerator
+    ):
+        if single_file:
+            output_file = iter([output_file])
+        else:
+            output_file = counter_generator(output_file)
+
+    # START thread count calculation modification
+    # The original default thread count allocation is 1
+    if thread_count is None:
+        # os.cpu_count() returns None when the CPU count cannot be determined
+        thread_count = os.cpu_count() or 1
+    # END thread count calculation modification
+
+    if thread_count < 1:
+        thread_count = 1
+
+    if first_page is None or first_page < 1:
+        first_page = 1
+
+    if last_page is None or last_page > page_count:
+        last_page = page_count
+
+    if first_page > last_page:
+        return []
+
+    auto_temp_dir = False
+    if output_folder is None and use_pdfcairo:
+        auto_temp_dir = True
+        output_folder = tempfile.mkdtemp()
+
+    # Recalculate page count based on first and last page
+    page_count = last_page - first_page + 1
+
+    if thread_count > page_count:
+        thread_count = page_count
+
+    # Add poppler path to LD_LIBRARY_PATH
+    env = os.environ.copy()
+    if poppler_path is not None:
+        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+
+    # Spawn the process and save its uuid
+    startupinfo = None
+    if platform.system() == 'Windows':
+        # this startupinfo structure prevents a console window from popping up on Windows
+        startupinfo = subprocess.STARTUPINFO()
+        startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
+
+    # START faster multiprocessing code
+
+    def process_single_page(current_page: int, thread_output_file: str) -> List:
+        uid = thread_output_file
+
+        # Build the command accordingly
+        args = _build_command(
+            ["-r", str(dpi), pdf_path],
+            output_folder,
+            current_page,
+            current_page,
+            parsed_fmt,
+            jpegopt,
+            thread_output_file,
+            userpw,
+            use_cropbox,
+            transparent,
+            single_file,
+            grayscale,
+            size,
+            hide_annotations,
+        )
+
+        if use_pdfcairo:
+            if hide_annotations:
+                raise NotImplementedError("Hide annotations flag not implemented in pdftocairo.")
+            args = [_get_command_path("pdftocairo", poppler_path)] + args
+        else:
+            args = [_get_command_path("pdftoppm", poppler_path)] + args
+
+        proc = Popen(args, env=env, stdout=PIPE, stderr=PIPE, startupinfo=startupinfo)
+
+        try:
+            data, err = proc.communicate(timeout=timeout)
+        except TimeoutExpired:
+            proc.kill()
+            proc.communicate()  # reap the killed process
+            raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
+
+        if b"Syntax Error" in err and strict:
+            raise PDFSyntaxError(err.decode("utf8", "ignore"))
+
+        if output_folder is not None:
+            current_images = _load_from_output_folder(
+                output_folder, uid, final_extension, paths_only, in_memory=auto_temp_dir
+            )
+        else:
+            current_images = parse_buffer_func(data)
+
+        return current_images
+
+    page_numbers = list(range(first_page, first_page + page_count))
+    nested_images_by_page_number = {}
+    pbar = tqdm(total=page_count, desc="Converting pdf document to jpg images")
+    try:
+        with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as executor:
+            # zip() stops at the shorter input (e.g. a plain single_file name yields one entry)
+            futures = {
+                executor.submit(process_single_page, page_number, thread_output_file): page_number
+                for page_number, thread_output_file in zip(page_numbers, output_file)
+            }
+            for future in concurrent.futures.as_completed(futures):
+                nested_images_by_page_number[futures[future]] = future.result()
+                pbar.update(1)
+    finally:
+        # Always close the progress bar and remove our temp dir, even when a page fails
+        pbar.close()
+        if auto_temp_dir:
+            shutil.rmtree(output_folder)
+
+    # END faster multiprocessing code
+
+    # Flatten in page order, using only the pages that were actually converted
+    converted_pages = sorted(nested_images_by_page_number)
+    return list(chain.from_iterable(nested_images_by_page_number[i] for i in converted_pages))
+
+
+def convert_from_bytes(
+    pdf_file,
+    dpi=200,
+    output_folder=None,
+    first_page=None,
+    last_page=None,
+    fmt="ppm",
+    jpegopt=None,
+    thread_count=1,
+    userpw=None,
+    use_cropbox=False,
+    strict=False,
+    transparent=False,
+    single_file=False,
+    output_file=uuid_generator(),
+    poppler_path=None,
+    grayscale=False,
+    size=None,
+    paths_only=False,
+    use_pdftocairo=False,
+    timeout=None,
+    hide_annotations=False,
+):
+    """
+        Description: Convert PDF to Image will throw whenever one of the condition is reached
+        Parameters:
+            pdf_file -> Bytes representing the PDF file
+            dpi -> Image quality in DPI
+            output_folder -> Write the resulting images to a folder (instead of directly in memory)
+            first_page -> First page to process
+            last_page -> Last page to process before stopping
+            fmt -> Output image format
+            jpegopt -> jpeg options `quality`, `progressive`, and `optimize` (only for jpeg format)
+            thread_count -> How many threads we are allowed to spawn for processing
+            userpw -> PDF's password
+            use_cropbox -> Use cropbox instead of mediabox
+            strict -> When a Syntax Error is thrown, it will be raised as an Exception
+            transparent -> Output with a transparent background instead of a white one.
+            single_file -> Uses the -singlefile option from pdftoppm/pdftocairo
+            output_file -> What is the output filename or generator
+            poppler_path -> Path to look for poppler binaries
+            grayscale -> Output grayscale image(s)
+            size -> Size of the resulting image(s), uses the Pillow (width, height) standard
+            paths_only -> Don't load image(s), return paths instead (requires output_folder)
+            use_pdftocairo -> Use pdftocairo instead of pdftoppm, may help performance
+            timeout -> Raise PDFPopplerTimeoutError after the given time
+            hide_annotations -> Forwarded unchanged to convert_from_path
+    """
+    fh, temp_filename = tempfile.mkstemp()
+    try:
+        with open(temp_filename, "wb") as f:
+            f.write(pdf_file)
+        # The file is closed (and so flushed) before poppler reads it
+        return convert_from_path(
+            temp_filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            first_page=first_page,
+            last_page=last_page,
+            fmt=fmt,
+            jpegopt=jpegopt,
+            thread_count=thread_count,
+            userpw=userpw,
+            use_cropbox=use_cropbox,
+            strict=strict,
+            transparent=transparent,
+            single_file=single_file,
+            output_file=output_file,
+            poppler_path=poppler_path,
+            grayscale=grayscale,
+            size=size,
+            paths_only=paths_only,
+            use_pdftocairo=use_pdftocairo,
+            timeout=timeout,
+            hide_annotations=hide_annotations,
+        )
+    finally:
+        os.close(fh)
+        os.remove(temp_filename)
+
+
+def _build_command(
+    args,
+    output_folder,
+    first_page,
+    last_page,
+    fmt,
+    jpegopt,
+    output_file,
+    userpw,
+    use_cropbox,
+    transparent,
+    single_file,
+    grayscale,
+    size,
+    hide_annotations,
+):
+    """
+        Description: Append the pdftoppm/pdftocairo options for one conversion to args
+        Returns: the same args list, extended in place
+        Raises: ValueError when size is not None, a 1- or 2-tuple, or a number
+    """
+    if use_cropbox:
+        args.append("-cropbox")
+
+    if hide_annotations:
+        args.append("-hide-annotations")
+
+    if transparent and fmt in TRANSPARENT_FILE_TYPES:
+        args.append("-transp")
+
+    if first_page is not None:
+        args.extend(["-f", str(first_page)])
+
+    if last_page is not None:
+        args.extend(["-l", str(last_page)])
+
+    if fmt not in ["pgm", "ppm"]:
+        args.append("-" + fmt)
+
+    if fmt in ["jpeg", "jpg"] and jpegopt:
+        args.extend(["-jpegopt", _parse_jpegopt(jpegopt)])
+
+    if single_file:
+        args.append("-singlefile")
+
+    if output_folder is not None:
+        args.append(os.path.join(output_folder, output_file))
+
+    if userpw is not None:
+        args.extend(["-upw", userpw])
+
+    if grayscale:
+        args.append("-gray")
+
+    if size is None:
+        pass
+    elif isinstance(size, tuple) and len(size) == 2:
+        # A None dimension is passed to poppler as -1
+        width, height = (-1 if dim is None else int(dim) for dim in size)
+        args.extend(["-scale-to-x", str(width), "-scale-to-y", str(height)])
+    elif isinstance(size, tuple) and len(size) == 1:
+        args.extend(["-scale-to", str(int(size[0]))])
+    elif isinstance(size, (int, float)):
+        args.extend(["-scale-to", str(int(size))])
+    else:
+        raise ValueError("Size {} is not a tuple or an integer".format(size))
+
+    return args
+
+
+def _parse_format(fmt, grayscale=False):
+    """Map fmt to (poppler format, extension, buffer parser, needs pdftocairo); unknown -> ppm."""
+    fmt = fmt.lower()
+    fmt = fmt[1:] if fmt.startswith(".") else fmt
+    if fmt in ("jpeg", "jpg"):
+        return "jpeg", "jpg", parse_buffer_to_jpeg, False
+    if fmt == "png":
+        return "png", "png", parse_buffer_to_png, False
+    if fmt in ("tif", "tiff"):
+        return "tiff", "tif", None, True
+    if fmt == "ppm" and grayscale:
+        return "pgm", "pgm", parse_buffer_to_pgm, False
+    # Unable to parse the format so we'll use the default
+    return "ppm", "ppm", parse_buffer_to_ppm, False
+
+
+def _parse_jpegopt(jpegopt):
+    """Join a jpegopt dict into "key=value,..." for -jpegopt (True -> y, False -> n)."""
+    def _flag(value):
+        if value is True:
+            return "y"
+        if value is False:
+            return "n"
+        return value
+    return ",".join("{}={}".format(key, _flag(value)) for key, value in jpegopt.items())
+
+
+def _get_command_path(command, poppler_path=None):
+    """Return command with ".exe" appended on Windows, joined onto poppler_path when given."""
+    executable = command + ".exe" if platform.system() == "Windows" else command
+
+    if poppler_path is None:
+        return executable
+
+    return os.path.join(poppler_path, executable)
+
+
+def _get_poppler_version(command, poppler_path=None, timeout=None):
+    """Return (major, minor) parsed from the `command -v` output, or (0, 17) if unparsable."""
+    command = [_get_command_path(command, poppler_path), "-v"]
+    env = os.environ.copy()
+    if poppler_path is not None:
+        env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+    proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
+
+    try:
+        _, err = proc.communicate(timeout=timeout)
+    except TimeoutExpired:
+        proc.kill()
+        proc.communicate()  # reap the killed process
+        raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
+
+    try:
+        # TODO: Make this more robust (takes the last word of the first stderr line)
+        version = err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")
+        return int(version[0]), int(version[1])
+    except (IndexError, ValueError):
+        # Lowest version that includes pdftocairo (2011)
+        return 0, 17
+
+
+def pdfinfo_from_path(
+    pdf_path, userpw=None, poppler_path=None, rawdates=False, timeout=None
+):
+    """
+        Description: Run pdfinfo on pdf_path and return its "key: value" output as a dict
+        Raises: PDFInfoNotInstalledError, PDFPageCountError, PDFPopplerTimeoutError
+    """
+    err = b""  # stays empty if we fail before pdfinfo produced any stderr
+    try:
+        command = [_get_command_path("pdfinfo", poppler_path), pdf_path]
+        if userpw is not None:
+            command.extend(["-upw", userpw])
+        if rawdates:
+            command.extend(["-rawdates"])
+
+        # Add poppler path to LD_LIBRARY_PATH
+        env = os.environ.copy()
+        if poppler_path is not None:
+            env["LD_LIBRARY_PATH"] = poppler_path + ":" + env.get("LD_LIBRARY_PATH", "")
+        proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
+
+        try:
+            out, err = proc.communicate(timeout=timeout)
+        except TimeoutExpired:
+            proc.kill()
+            proc.communicate()  # reap the killed process
+            raise PDFPopplerTimeoutError("Run poppler poppler timeout.")
+
+        d = {}
+        for field in out.decode("utf8", "ignore").split("\n"):
+            # Split on the first colon only so colons inside a value are kept
+            key, _, value = field.partition(":")
+            if key != "":
+                value = value.strip()
+                d[key] = int(value) if key in PDFINFO_CONVERT_TO_INT else value
+
+        if "Pages" not in d:
+            raise ValueError
+
+        return d
+
+    except OSError:
+        raise PDFInfoNotInstalledError(
+            "Unable to get page count. Is poppler installed and in PATH?"
+        )
+    except ValueError:
+        raise PDFPageCountError(
+            "Unable to get page count.\n%s" % err.decode("utf8", "ignore")
+        )
+
+
+def pdfinfo_from_bytes(
+    pdf_file, userpw=None, poppler_path=None, rawdates=False, timeout=None
+):
+    """Write pdf_file (bytes) to a temp file and return pdfinfo_from_path() for it."""
+    fh, temp_filename = tempfile.mkstemp()
+    try:
+        with open(temp_filename, "wb") as f:
+            f.write(pdf_file)
+        return pdfinfo_from_path(temp_filename, userpw=userpw, rawdates=rawdates,
+                                 poppler_path=poppler_path, timeout=timeout)
+    finally:
+        os.close(fh)
+        os.remove(temp_filename)
+
+
+def _load_from_output_folder(
+    output_folder, output_file, ext, paths_only, in_memory=False
+):
+    """Collect, in sorted name order, the files named output_file*.ext as paths or images."""
+    images = []
+    for name in sorted(os.listdir(output_folder)):
+        if not name.startswith(output_file) or name.split(".")[-1] != ext:
+            continue
+        path = os.path.join(output_folder, name)
+        images.append(path if paths_only else Image.open(path))
+        if in_memory and not paths_only:
+            images[-1].load()  # load the image now (in_memory requested)
+    return images
diff --git a/pdf_to_jpg_dir.py b/pdf_to_jpg_dir.py
index 42481cf..2dd6c22 100644
--- a/pdf_to_jpg_dir.py
+++ b/pdf_to_jpg_dir.py
@@ -1,10 +1,12 @@
 import argparse
 import math
 import os
-import re
 import shutil
+from posixpath import join
 
-from pdf2image import convert_from_path
+from tqdm import tqdm
+
+from pdf2image_with_logging import convert_from_path
 from google.cloud import storage
 
 parser = argparse.ArgumentParser()
@@ -17,22 +19,24 @@ def convert_to_jpg(args):
     print("Converting file {} to jpg".format(args.filepath))
     path_book_name = os.path.splitext(args.filepath)[0]
     book_name = os.path.split(path_book_name)[-1]
-    pages = convert_from_path(args.filepath, 500)   
 
-    n = int(math.ceil(math.log(len(pages)) / math.log(10)))
+    page_paths = convert_from_path(
+        args.filepath, 500, output_folder="tmp_pdf_to_jpg", fmt="jpg", paths_only=True
+    )
+    # Pad to the digit count of the last page number so sorted blob names keep page order
+    n = len(str(len(page_paths)))
 
     storage_client = storage.Client()
-    bucket = storage_client.get_bucket(bucket_name=args.bucket)
+    bucket = storage_client.get_bucket(args.bucket)
+
+    gcs_dir = args.output_dir if args.output_dir else book_name
 
-    for i, page in enumerate(pages):
+    for i, page_path in tqdm(enumerate(page_paths), desc="Uploading", total=len(page_paths)):
         number_str = "_{num:0{width}}.jpg".format(num=i+1, width=n)
-        output_name = os.path.join("tmp_pdf_to_jpg", book_name + number_str)
-        page.save(output_name, 'JPEG')
 
-        gcs_dir = args.output_dir if args.output_dir else book_name
-        blob = bucket.blob(os.path.join(gcs_dir, book_name + number_str))
-        print("Uploading file to Google Cloud bucket: {}".format(output_name))
-        blob.upload_from_filename(output_name)
+        gcs_path = join(gcs_dir, book_name + number_str)
+        blob = bucket.blob(gcs_path)
+        blob.upload_from_filename(page_path)
 
 
 if __name__ == "__main__":
@@ -40,6 +44,7 @@ def convert_to_jpg(args):
     os.mkdir("tmp_pdf_to_jpg")
     try:
         convert_to_jpg(args)
-    except:
-        pass
-    shutil.rmtree("tmp_pdf_to_jpg")
\ No newline at end of file
+    except Exception as e:
+        raise e
+    finally:
+        shutil.rmtree("tmp_pdf_to_jpg")
diff --git a/requirements.txt b/requirements.txt
index b0a6e7e..354ab22 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 google-cloud-vision
 google-cloud-storage
 tqdm
+pdf2image
diff --git a/transcribe.sh b/transcribe.sh
index 6acada2..7c59de8 100644
--- a/transcribe.sh
+++ b/transcribe.sh
@@ -1,39 +1,16 @@
 # Data from https://drive.google.com/drive/u/3/folders/14x2_3ZS-XguLldWWZW5uGBYViX8kJIPw
 
-# Make sure to use your own bucket, you will not have access to this one
-bucket_name="ocr-tutorial-20220901"
+# Install necessary requirements
+pip -q -q install -r requirements.txt --upgrade
 
+# We need to install poppler for converting pdfs to images
+conda install -c conda-forge poppler
 
-cloud_paths=(
-    "1-Jinpa-Thupten_2004_Theg-pa-chen-po-blo-sbyong-brgya-rtsa_LOTC-1 (1).pdf"
-    "2-Jinpa_2005_Bka-gdams-glegs-bam-las-btus-pa_i-chos-skor_LOTC-2.pdf"
-    "3-Jinpa_2005_rdzogs-pa-chen-po-sems-nyid-ngal-gso_i-_grel-pa-shing-rta-chen-po_LOTC-3.pdf"
-    "4-Jinpa_2004_dpal-sakya-pa_i-lam-_bras-kyi-chos-skor-gces-btus_LOTC-4.pdf"
-    "5-Jinpa_2008_mnyam-med-bka_-brgyud-lugs-kyi-phyag-rgya-chen-po-dang-_brel-ba_i-chos-skor_LOTC-5.pdf"
-    "6-Jinpa_2005_dpal-dge-ldan-pa_i-lam-rim-dang-snyan-brgyud-kyi-chos-skor_LOTC-6.pdf"
-    "7-Jinpa-JO-NANG-CHOS-SKOR-RI-CHOS-NGES-DON-RGYA-MTSO-LoTC-VOL-7.pdf"
-    "8-Jinpa-ZHI-BYED-DANG-GCOD-YUL-SOGS-GDAMS-NGAG-THOR-BU_I-CHOS-SKOR-LoTC.pdf"
-    "9-Jinpa-gyung-drung-bon-gyi-mdo-sngags-sems-gsum-gyi-gzhung.pdf"
-    "10-Jinpa-bstan-pa-la-_jug-pa_i-rim-pa-son-pa_i-gzhung-gces-btus.pdf"
-    "11-Jinpa-RGYAL-SRAS-KYI-SPYOD-PA-LA-_JUG-PA_I-CHOS-SKOR.pdf"
-    "13-Jinpa-RNAL-_BYOR-BLA-MED-KYI-SGRUB-THABS-DANG-BSKYED-RIM-GYI-KHRID-YIG-KHAG-LoTC-VOL-13.pdf"
-    "14-Jinpa-DUS-_KHOR-_GREL-CHEN-DRI-MED-_OD-KYI-RGYAN-LoTC-VOL-14.pdf"
-    "15-Jinpa-RDZOGS-RIM-RIM-LNGA-GSAL-SGRON-LoTC-VOL-15-Scan.pdf"
-    "17-Jinpa-2007_bde-gshegs-snying-po-rigs-kyi-chos-skor_Institute-for-Tibetan-Classics.pdf"
-    "19-Jinpa-DBU-MA-DGONGS-PA-RAB-GSAL-LoTC-VOL-19-Scan.pdf"
-    "20-Jinpa-DPAL-SA-SKYA-PA_I-TSAD-MA-RIG-PA_I-GZHUNG-GCES-BTUS-LoTC-VOL-20-Scan.pdf"
-    "21-Jinpa-DPAL-DGE-LDAN-PA_I-TSAD-MA-RIG-PA_I-GZHUNG-GCES-BTUS-LoTC-VOL-21-Scan.pdf"
-    "23-Jinpa-CHOS-MNGON-PA-MDZOD-KYI-_GREL-PA-MNGON-PA_I-RGYAN-LoTC-VOL-23-Scan.pdf"
-    "24-Jinpa-GRUB-MTHA-THUB-BSTAN-LHUN-PO_I-MDZES-RGYAN-LoTC-VOL-24-Scan.pdf"
-    "25-Jinpa-GRUB-MTHA-SHEL-GYI-ME-LONG-LoTC-VOL-25-Scan.pdf"
-    "27-Jinpa-LEGS-BSHAD-LUGS-KYI-BSLAB-BYA_I-DPE-TSOGS-LoTC-VOL-27-Scan.pdf"
-    "28-Jingpa-BDE-GSHEGS-SNYING-PO-RIGS-KYI-CHOS-SKOR-LoTC-VOL-28-Scan.pdf"
-    "29-Jinpa-RTSIS-KYI-MAN-NGAG-NYIN-BYED-SNANG-BA_I-RANAM-_GREL-GSER-GYI-SHING-RTA-LoTC-VOL-29-Scan.pdf"
-    "31-Jingpa-BOD-KYI-LHA-MO_I-KHRAB-GZHUNG-CHE-KHAG-GCES-BTUS-LoTC-VOL-31-Scan.pdf"
-    "32-Jinpa-MKHAS-PA-LDE_US-MDZAD-PA_I-RGYA-BOD-KYI-CHOS-_BYUNG-RGYAS-PA-LoTC-VOL-32-Scan.pdf"
-)
+# Make sure to use your own bucket, you will not have access to this one
+cloud_bucket_name="ocr-tutorial-20220901"
 
-for cloud_path in ${cloud_paths[*]}
+for local_path in data/*.pdf
 do
-    python transcribe_bo-fo.py --output-dir results --filepath "$cloud_path" --bucket "$bucket_name"
+    python pdf_to_jpg_dir.py --filepath "$local_path" --bucket "$cloud_bucket_name"
+    python transcribe_image_dir.py --filepath "${local_path#*/}" --bucket "$cloud_bucket_name" --output-dir results
 done
diff --git a/transcribe_image_dir.py b/transcribe_image_dir.py
index 416cc74..a0390c2 100644
--- a/transcribe_image_dir.py
+++ b/transcribe_image_dir.py
@@ -1,6 +1,8 @@
 import argparse
 import os
 
+from tqdm import trange
+
 from google.cloud import storage
 from google.cloud import vision
 
@@ -14,31 +16,47 @@ def detect_document_tibetan(args):
     """Detects text in all images in a folder located in Google Cloud Storage.
     """
     book_name = os.path.split(args.filepath)[-1]
-    
+    folder_name = os.path.splitext(book_name)[0]
+
     storage_client = storage.Client()
     bucket = storage_client.get_bucket(args.bucket)
-    blob_list = sorted([blob.name for blob in bucket.list_blobs(prefix=args.filepath+"/")])
+    blob_list = sorted([blob.name for blob in bucket.list_blobs(prefix=folder_name+"/")])
 
-    outputs = []
+    # Setup calls to transcription
+    client = vision.ImageAnnotatorClient()
+    image_context = vision.ImageContext(language_hints=["bo"])
+    feature = vision.Feature(
+        type=vision.Feature.Type.DOCUMENT_TEXT_DETECTION,
+        model="builtin/weekly")
 
-    for name in blob_list:
+    def create_annotate_image_request(name: str) -> vision.AnnotateImageRequest:
+        # Build Image URI in GCS
         gcs_uri = "gs://" + args.bucket + "/" + name
-        print("Running document text detection on: {}".format(gcs_uri))
-        client = vision.ImageAnnotatorClient()
-        image = vision.types.Image()
-        image_context = vision.types.ImageContext(language_hints=["bo"])
+        image = vision.Image()
         image.source.image_uri = gcs_uri
 
-        response = client.document_text_detection(image=image, image_context=image_context)
-        text = response.full_text_annotation.text
-        outputs.append(text)
+        # Create image annotation request
+        annotate_image_request = vision.AnnotateImageRequest(
+            image=image, image_context=image_context, features=[feature]
+        )
+
+        return annotate_image_request
+
+    annotate_image_requests = list(map(create_annotate_image_request, blob_list))
+
+    batch_size = 10
+    outputs = []
+    for i in trange(0, len(annotate_image_requests), batch_size, desc="Running document text detection"):
+        batch_annotate_image_requests = annotate_image_requests[i: i+batch_size]
+        response = client.batch_annotate_images(requests=batch_annotate_image_requests)
+        outputs.extend([r.full_text_annotation.text for r in response.responses])
 
     output_name = os.path.join(args.output_dir, book_name+".txt") if args.output_dir else book_name+".txt"
 
     print("Writing output file to: {}".format(output_name))
-    with open(output_name, "w") as f:
+    with open(output_name, "w", encoding="utf-8") as f:
         f.write("".join(outputs))
-        
+
 
 if __name__ == "__main__":
     args = parser.parse_args()