# make_dataset_json.py
# Forked from guoyww/AnimateDiff.
import json
import os
from tqdm import tqdm
import re
# Presumably sample filenames for trying out the numeric sort key (e.g. '2.png'
# should order before '12.png'); this list is not referenced again in this file.
files = ['1.png', '12.png', '23.png', '102.png', '2.png']
def extract_number(filename: str) -> int:
    """Return the first run of decimal digits in *filename* as an integer.

    Intended as a sort key so that file names order numerically
    ('2.png' before '12.png') instead of lexicographically.

    Args:
        filename: File name to inspect.

    Returns:
        The integer value of the first digit run found in the name, or 0
        when the name contains no digits.
    """
    # Use a regular expression to extract the numeric part of the filename
    match = re.search(r'\d+', filename)
    if match:
        return int(match.group())
    return 0  # Return 0 if no numeric part is found
# Build metadata.jsonl for an image folder: every .jpg found in images_path
# (ordered by the number in its file name) is paired with the line at the same
# position in the description file, and one {"file_name", "text"} JSON object
# per image is written to jsonl_file_path.
images_path = 'data/Zverev/'
# NOTE: the variable name keeps its original spelling.
decription_path = 'data/Zverev/description.txt'
jsonl_file_path = 'data/Zverev/metadata.jsonl'

images = [f for f in os.listdir(images_path) if f.endswith('.jpg')]
# Sort the list of files based on the extracted numbers
images = sorted(images, key=extract_number)
print(images)

# Read the captions as UTF-8 instead of relying on the platform default.
with open(decription_path, 'r', encoding='utf-8') as file:
    # Drop the line terminator so it does not end up inside the caption text.
    lines = [line.rstrip('\n') for line in file]

print(len(images),
      len(lines))
# Raise explicitly rather than assert: asserts are removed under `python -O`,
# and zip() below would then silently truncate to the shorter sequence.
if len(images) != len(lines):
    raise ValueError(
        f"found {len(images)} images but {len(lines)} description lines; "
        "each image needs exactly one description line"
    )

# total= lets tqdm show real progress; zip() on its own has no length.
data = [
    {
        'file_name': img,
        'text': desc,
    }
    for img, desc in tqdm(zip(images, lines), total=len(images))
]

with open(jsonl_file_path, "w", encoding='utf-8') as jsonl_file:
    for item in data:
        json_line = json.dumps(item)
        jsonl_file.write(json_line + '\n')