# make_dataset_json.py

import json
import os
from tqdm import tqdm
import re

# Sample filenames with numeric stems, deliberately out of numeric order.
# NOTE(review): not referenced anywhere in the visible code; presumably a
# leftover example for trying out the numeric sort key -- confirm before
# removing.
files = [
    '1.png',
    '12.png',
    '23.png',
    '102.png',
    '2.png',
]

def extract_number(filename):
    """Return the first run of digits found in *filename* as an int.

    Only the first digit run counts: ``'img_007_v2.jpg'`` yields ``7``.
    A name containing no digits at all yields ``0``.
    """
    digits = re.search(r'\d+', filename)
    if digits is None:
        # No numeric part anywhere in the name.
        return 0
    return int(digits.group())

# Input/output locations. The name `decription_path` (sic) is kept as-is so
# that anything else referring to this module-level name keeps working.
images_path = 'data/Zverev/'
decription_path = 'data/Zverev/description.txt'
jsonl_file_path = 'data/Zverev/metadata.jsonl'

# Collect the .jpg files and order them by the number embedded in each name.
# The filename itself is the tie-breaker: os.listdir returns entries in
# arbitrary order, so without it files sharing a numeric key (e.g. '1.jpg'
# and '01.jpg', or names without digits) could be paired with different
# descriptions from one run to the next.
images = sorted(
    (f for f in os.listdir(images_path) if f.endswith('.jpg')),
    key=lambda f: (extract_number(f), f),
)
print(images)

# One description per line, matched to the sorted images by position.
# Line terminators are stripped so they do not end up inside the 'text'
# field of the emitted JSON. The encoding is pinned so the result does not
# depend on the platform's default locale.
with open(decription_path, 'r', encoding='utf-8') as file:
    lines = [line.rstrip('\r\n') for line in file]

print(len(images),
      len(lines))
# Explicit check instead of `assert`: asserts are removed under `python -O`,
# and zip() below would then silently drop the unmatched tail.
if len(images) != len(lines):
    raise ValueError(
        f'Found {len(images)} images in {images_path!r} but '
        f'{len(lines)} description lines in {decription_path!r}'
    )

# Build one record per (image, description) pair; `total` lets tqdm show
# real progress since zip() has no length.
data = [
    {'file_name': img, 'text': desc}
    for img, desc in tqdm(zip(images, lines), total=len(images))
]

# Write the records as JSON Lines: one JSON object per line.
with open(jsonl_file_path, "w", encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in data)