-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata-processing.py
106 lines (96 loc) · 2.84 KB
/
data-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: UTF-8 -*-
import os
import csv
import codecs
import random
import string
import multiprocessing
from tqdm import tqdm
from PIL import Image
# File that check_pic appends to: one source path per image that failed.
failed_f = 'failed.lst'
# Image index written by get_image_list and read back by get_image_dict.
valid_f = 'data.lst'
# Per-image label indices written by get_image_dict.
label_f = 'label_num.lst'
# Label-name to integer-index mapping written by get_image_dict.
map_f = 'label_map.lst'
# Directory check_pic saves the converted JPEG files into.
save_dir = '../data'
# Root directory get_image_list walks to find the input images.
data_dir = '../garbage'
def _record_failure(path):
    """Append ``path`` as one line to the failure list file ``failed_f``."""
    # UTF-8 keeps non-ASCII paths writable whatever the locale encoding is.
    with open(failed_f, 'a', encoding='utf-8') as f:
        f.write(path + '\n')


def check_pic(path, random_id):
    """Re-encode the image at ``path`` as an RGB JPEG named after ``random_id``.

    The image is opened with PIL, converted to RGB and saved as
    ``<save_dir>/<random_id>.jpg``. When any of these three steps fails, a
    warning is printed, ``path`` is appended to ``failed_f`` and ``False``
    is returned instead of raising.

    Args:
        path: Path of the source image file.
        random_id: File name stem (without extension) for the saved JPEG.

    Returns:
        True if the JPEG was written, False if any step failed.
    """
    basename = os.path.basename(path)

    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort behaviour for bad images while still letting
    # KeyboardInterrupt / SystemExit stop a long run.
    try:
        pil_image = Image.open(path)
    except Exception:
        print("Warning: Failed to parse image {}".format(basename))
        _record_failure(path)
        return False

    # Image.open keeps the underlying file open; the ``with`` block releases
    # it as soon as the RGB copy exists (or the conversion has failed).
    with pil_image:
        try:
            pil_image_rgb = pil_image.convert('RGB')
        except Exception:
            print("Warning: Failed to convert image {} to RGB".format(basename))
            _record_failure(path)
            return False

    try:
        pil_image_rgb.save(
            os.path.join(save_dir, random_id + '.jpg'),
            format='JPEG',
            quality=100,
        )
    except Exception:
        print("Warning: Failed to save image {}".format(path))
        _record_failure(path)
        return False

    return True
def gen_random_id():
    """Return a random 8-character identifier.

    The characters are drawn without repetition from the ASCII letters and
    digits, using the module-level ``random`` generator.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.sample(alphabet, 8)
    return ''.join(picked)
def get_image_list():
    """Convert every file under ``data_dir`` and write the image index.

    Each file found by walking ``data_dir`` is given a fresh random id and
    passed to ``check_pic``. For every file that converts successfully one
    line is written to ``valid_f``::

        <random_id>:<source path>:<label>,<label>,...,

    The labels are the names of the directories between ``data_dir`` and the
    file, outermost first; a file directly inside ``data_dir`` gets no labels.
    """
    # Written as UTF-8 because get_image_dict reads this file back as UTF-8.
    with open(valid_f, 'w', encoding='utf-8') as f:
        for root, _dirs, files in os.walk(data_dir):
            # Derive the labels from the position relative to data_dir so the
            # result does not depend on how many components data_dir has or
            # on the platform's path separator.
            rel_dir = os.path.relpath(root, data_dir)
            labels = [] if rel_dir == os.curdir else rel_dir.split(os.sep)
            # Every label is followed by a comma, including the last one.
            label_field = ''.join(label + ',' for label in labels)

            for _file in tqdm(files):
                path = os.path.join(root, _file)
                random_id = gen_random_id()
                if check_pic(path, random_id):
                    f.write(random_id + ':' + path + ':' + label_field + '\n')
def get_image_dict():
    """Turn the image index in ``valid_f`` into numeric label files.

    Each non-empty input line has the form
    ``<id>:<path>:<label>,<label>,...,``. Every distinct label is given an
    integer index, assigned in sorted label order so that the mapping is the
    same on every run. Two files are written:

    * ``map_f``: one ``<label> <index>`` line per distinct label.
    * ``label_f``: one ``<id> <index>,<index>,...,`` line per image id.

    The number of distinct labels is printed.
    """
    label_dict = {}
    all_labels = set()
    with open(valid_f, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing on item[2].
                continue
            item = line.split(':')
            image_id = item[0]
            # The label field ends with a comma, so the last piece is empty.
            labels = item[2].split(',')[:-1]
            label_dict[image_id] = labels
            all_labels.update(labels)

    # Set iteration order for strings varies between interpreter runs;
    # sorting makes the label -> index assignment reproducible.
    unique_label = sorted(all_labels)
    print("Total unique label = {}".format(len(unique_label)))

    label_index = {label: i for i, label in enumerate(unique_label)}

    # Output files are opened only after the input was read successfully, so
    # a failed read does not leave them truncated.
    with open(map_f, 'w', encoding='utf-8') as mf:
        for i, label in enumerate(unique_label):
            mf.write(label + ' ' + str(i) + '\n')

    with open(label_f, 'w', encoding='utf-8') as lf:
        for image_id, labels in label_dict.items():
            index_field = ''.join(str(label_index[l]) + ',' for l in labels)
            lf.write(image_id + ' ' + index_field + '\n')
# Script entry point. The order matters: get_image_dict reads the index file
# (valid_f) that get_image_list writes.
if __name__ == '__main__':
    get_image_list()
    get_image_dict()