"""
"Semantic Style Transfer and Turning Two-Bit Doodles into Fine Artworks"
https://arxiv.org/pdf/1603.01768.pdf
"""
import skimage.io
import skimage.transform
import tensorflow as tf
import numpy as np
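# Note: this script uses TensorFlow 1.x APIs throughout (placeholders,
# tf.reset_default_graph, tf.contrib.opt) and will not run unmodified on TF 2.x.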
"""
Model Creation Functions
"""
def VGG19(img, vgg_path='vgg19.npy'):
    """
    Description:
        Creates a pre-trained VGG19 model without the Fully Connected layers.
        This allows the input image to have a variable size.
        vgg19.npy can be downloaded from https://github.com/machrisaa/tensorflow-vgg
    Args:
        img: TF Tensor with shape [N,H,W,C]. Values are scaled to 0.0-1.0
        vgg_path: path to vgg19.npy
    Returns:
        vgg: dictionary of layers in the VGG19 model
    """
    # Scale img to 0.0-255.0 and split into R, G, B channels
    red, green, blue = tf.split(img * 255.0, num_or_size_splits=3, axis=3)
    # Normalize the input image by subtracting the per-channel mean.
    # Gather the channels in B, G, R order (the order VGG19 was trained with)
    vgg_mean = [103.939, 116.779, 123.68]
    bgr = tf.concat(axis=3, values=[
        blue - vgg_mean[0],
        green - vgg_mean[1],
        red - vgg_mean[2],
    ])
    # Load the pre-trained VGG19 parameters
    # (allow_pickle is needed to load object arrays on NumPy >= 1.16.3)
    vgg_params = np.load(vgg_path, encoding='latin1', allow_pickle=True).item()
    # Do not include the FC layers, to allow for variable sized images
    vgg_layers = ["conv1_1", "conv1_2", "pool1",
                  "conv2_1", "conv2_2", "pool2",
                  "conv3_1", "conv3_2", "conv3_3", "conv3_4", "pool3",
                  "conv4_1", "conv4_2", "conv4_3", "conv4_4", "pool4",
                  "conv5_1", "conv5_2", "conv5_3", "conv5_4", "pool5"]
    # Initialize a dictionary to store the layers
    vgg = {}
    # Create the VGG19 model and load the pre-trained parameters
    curr = bgr
    for layer in vgg_layers:
        if layer[:4] == "conv":
            curr = tf.nn.conv2d(curr, filter=vgg_params[layer][0],
                                strides=[1, 1, 1, 1], padding='SAME')
            curr += vgg_params[layer][1]
            curr = tf.nn.relu(curr)
        elif layer[:4] == "pool":
            curr = tf.nn.max_pool(curr, ksize=[1, 2, 2, 1],
                                  strides=[1, 2, 2, 1], padding='SAME')
        vgg[layer] = curr
    return vgg
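# A minimal usage sketch (assumes vgg19.npy has been downloaded from the link
# above into the working directory):
#   x = tf.placeholder(tf.float32, shape=[1, None, None, 3])
#   net = VGG19(x)
#   net["conv3_1"]  # Tensor of shape [1, ceil(H/4), ceil(W/4), 256]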
def FeatureLayer(layer):
    """
    Description:
        Flattens 2D feature maps into an array of column vectors
    Args:
        layer: TF Tensor with shape [N,H,W,C] (expecting N=1 image)
    Returns:
        Flattened layer with shape [D,C] where:
            D = NxHxW, the number of features in each column vector
            C = the number of column vectors
    """
    shape = layer.get_shape().as_list()
    return tf.reshape(layer, shape=[-1, shape[3]])
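# e.g. a conv3_1 activation of shape [1,56,56,256] flattens to [3136,256]:
# each of the 256 channels becomes one column vector of 56*56 = 3136 features.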
def StyleLayer(layer):
    """
    Description:
        Calculates a Gram matrix from an array of column vectors
        (uses all features in a column vector)
    Args:
        layer: TF Tensor with shape [D,C] (expecting FeatureLayer output)
    Returns:
        Gram matrix with shape [C,C]
    """
    return tf.matmul(a=layer, b=layer, transpose_a=True)
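# This is the Gram matrix G = F^T F of the feature array F: entry G[i,j] is the
# dot product of channels i and j (the NumPy equivalent of np.dot(F.T, F)).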
def StyleLayerPatches(layer, patches):
    """
    Description:
        For each patch, extracts a subset of features from the array of
        column vectors before calculating a Gram matrix
    Args:
        layer: TF Tensor with shape [D,C] (expecting FeatureLayer output)
        patches: shape [M,P] where M is the number of patches and P is the
            no. of pixels in a patch. Values should be in the range 0..D-1,
            which are idxs into the first dimension of layer
    Returns:
        Gram matrices with shape [M,C,C]
    """
    # idxs has shape [M,P]
    idxs = tf.constant(patches, dtype=tf.int64)
    # layer_patches has shape [M,P,C]
    layer_patches = tf.gather(layer, indices=idxs)
    # M may be quite large, so to avoid OOM issues we do M separate Gram
    # matrix calculations instead of one big batched calculation
    layer_patches = tf.split(layer_patches, num_or_size_splits=patches.shape[0])
    # merge the M [1,C,C] Gram matrices into shape [M,C,C]
    return tf.concat([StyleLayer(p) for p in layer_patches], axis=0)
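# e.g. a [3136,256] feature layer with a [16,144] patch-index array
# (16 non-overlapping 12x12 patches) yields Gram matrices of shape [16,256,256].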
"""
Helper Functions
"""
def NHWC(img):
    """
    Args:
        img: an image loaded using skimage with shape [H,W,C]
    Returns:
        img cast to float32 and reshaped to [N,H,W,C]
    """
    return np.expand_dims(a=img, axis=0).astype(np.float32)
def GeneratePatchIdxs(img_shape, stride=3, patch_size=3):
    """
    Description:
        Generates idxs which can be used to extract patch_size x patch_size
        patches from a flattened image. Note that the image is not padded.
        e.g. image [[0 1 2 3 4]    flattened image [0 1 2 3 4 5 6 7 8 9]
                    [5 6 7 8 9]]
        2x2 patches with a stride of 2 result in idxs [[0,1,5,6],[2,3,7,8]]
    Args:
        img_shape: array of 3 values [H,W,C]
        stride: no. of pixels between patches (stride=patch_size for non-overlapping)
        patch_size: length of a side of the square patch
    Returns:
        idxs corresponding to patches in the flattened image, with shape [N,P]
        where N = number of patches and P = patch_size x patch_size,
        along with the number of patch rows and patch columns
    """
    # determine how many patches fit along the image height and width
    # (number of patches = rows*cols)
    rows = int(np.ceil((img_shape[0] - patch_size + 1) / stride))
    cols = int(np.ceil((img_shape[1] - patch_size + 1) / stride))
    # create a 2D array the same size as the image, with each cell containing
    # the value corresponding to its idx when flattened
    idxs = np.arange(0, img_shape[0] * img_shape[1])
    idxs = np.reshape(idxs, newshape=[img_shape[0], img_shape[1]])
    # loop through every patch: extract the idxs and flatten them to a 1D vector
    # patch_idxs has shape [N,P] where N = rows x cols, P = patch_size x patch_size
    patch_idxs = [idxs[i*stride:i*stride+patch_size, j*stride:j*stride+patch_size].reshape([-1])
                  for i in range(rows)
                  for j in range(cols)]
    # convert to a numpy array
    patch_idxs = np.array(patch_idxs, dtype=np.int64)
    return patch_idxs, rows, cols
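# Sanity check matching the docstring example (pure NumPy, safe to run):
#   GeneratePatchIdxs([2, 5, 1], stride=2, patch_size=2)[0]
#   -> array([[0, 1, 5, 6],
#             [2, 3, 7, 8]])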
def PatchMatching(content_map, style_map, patch_size=3):
    """
    Description:
        For every patch in content_map, finds the Nearest Neighbour patch in
        style_map. The similarity measure is cosine similarity.
        e.g. content_map [[0 1 2 3 4]    style_map [[0 1 2 3]
                          [5 6 7 8 9]]              [4 5 6 7]]
        assume patch 1 in content_map is most similar to patch 2 in style_map,
        and patch 2 in content_map is most similar to patch 1 in style_map:
        returns [[0,1,5,6],[2,3,7,8]] [[2,3,6,7],[0,1,4,5]]
    Args:
        content_map: an image with shape [H1,W1,C]
        style_map: an image with shape [H2,W2,C]
        patch_size: length of a side of the square patch
    Returns:
        a tuple of 2 arrays, both with shape [N1,P] where:
            N1 = number of patches in content_map
            P = patch_size x patch_size
        array1 = idxs of every patch in content_map
        array2 = idxs of the Nearest Neighbour patch in style_map corresponding
            to the patch in array1 at the same index
        Note that patches in style_map may appear multiple times or not at all
    """
    # flatten the maps from [H,W,C] to [HxW,C]
    content_map_flat = np.reshape(content_map, [-1, content_map.shape[2]])
    style_map_flat = np.reshape(style_map, [-1, style_map.shape[2]])
    # generate the idxs to extract the patches from the maps
    # c_patch_idxs has shape [N1,P] where N1 = c_rows x c_cols, P = patch_size x patch_size
    # s_patch_idxs has shape [N2,P] where N2 = s_rows x s_cols, P = patch_size x patch_size
    c_patch_idxs, c_rows, c_cols = GeneratePatchIdxs(content_map.shape, stride=patch_size, patch_size=patch_size)
    s_patch_idxs, s_rows, s_cols = GeneratePatchIdxs(style_map.shape, stride=1, patch_size=patch_size)
    # extract the patches from the maps, then flatten the channels of each patch
    # c_patches has shape [N1,PxC]
    # s_patches has shape [N2,PxC]
    c_patches = content_map_flat[c_patch_idxs, :]
    c_patches = np.reshape(c_patches, [c_rows*c_cols, -1])
    s_patches = style_map_flat[s_patch_idxs, :]
    s_patches = np.reshape(s_patches, [s_rows*s_cols, -1])
    # calculate the Euclidean norm of each patch
    # c_patches_norm has shape [N1,1]
    # s_patches_norm has shape [N2,1]
    c_patches_norm = np.linalg.norm(c_patches, axis=1, keepdims=True)
    s_patches_norm = np.linalg.norm(s_patches, axis=1, keepdims=True)
    # normalize each patch, making it a unit vector
    c_patches = np.divide(c_patches, c_patches_norm)
    s_patches = np.divide(s_patches, s_patches_norm)
    # calculate the cosine similarity between every patch in c_patches and
    # every patch in s_patches
    # similarity has shape [N1,N2]
    similarity = np.dot(c_patches, np.transpose(s_patches))
    # every time a patch in s_patches is chosen we discount it, to reduce
    # the chance of it being picked again
    discount = np.zeros(s_rows*s_cols, dtype=np.float32)
    # an array to store the Nearest Neighbour patches
    s_nn_patch_idxs = np.zeros_like(c_patch_idxs, dtype=np.int64)
    # loop through every c_patch
    for i in range(c_rows*c_cols):
        # apply the discount so frequently chosen patches appear less similar
        scores = similarity[i, :] - discount
        max_similarity = np.max(scores)
        # find the Nearest Neighbour patch, breaking near-ties at random
        nn = np.random.choice(np.where(scores > max_similarity - 0.0002)[0])
        # reduce the chance of that patch being picked again
        discount[nn] += 0.002
        s_nn_patch_idxs[i, :] = s_patch_idxs[nn, :]
    return (c_patch_idxs, s_nn_patch_idxs)
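# A usage sketch with small random maps (hypothetical shapes):
#   c_map = np.random.rand(24, 24, 3)   # 6x6 = 36 non-overlapping 4x4 patches
#   s_map = np.random.rand(20, 20, 3)
#   c_idxs, s_idxs = PatchMatching(c_map, s_map, patch_size=4)
#   # both c_idxs and s_idxs have shape (36, 16)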
"""
Parameters
"""
# Define the layers of the VGG19 model to use for style
style_layers = ["conv2_1", "conv3_1"]
# Load the style image & scale values from 0.0-255.0 to 0.0-1.0
style_img = skimage.io.imread("images\\Monet.jpg") / 255.0
# Load the semantic map images
content_map = skimage.io.imread("images\\Coastline_sem.png") / 255.0
style_map = skimage.io.imread("images\\Monet_sem.png") / 255.0
# Length of a side of the square patch
patch_size = 12
"""
Nearest Neighbour Patch Matching
"""
print("Matching Patches in Semantic Maps")
# Dictionary to store the patch idxs
content_map_idxs = {}
style_map_idxs = {}
# unique no. of times layers in style layers have been pooled
# e.g. both conv2_1 and conv2_2 have been pooled only once
pool_layers = set([int(layer[4]) for layer in style_layers])
content_map_shape = np.array(content_map.shape)
style_map_shape = np.array(style_map.shape)
for pool in range(1,6):
if pool in pool_layers:
# Divides the content map into non overlapping patches and for each
# patch find the most similar patch in style map
# content_map_idxs[pool] has shape [N,P] where N is number of patches, P is patch area
# style_map_idxs[pool] has shape [N,P] where N is number of patches, P is patch area
# style_map_idxs[pool][i] is the most similar patch to content_map_idxs[pool][i]
content_map_idxs[pool], style_map_idxs[pool] = PatchMatching(
skimage.transform.resize(content_map, content_map_shape),
skimage.transform.resize(style_map, style_map_shape),
patch_size)
# resize the images the same way as VGG19's max pool layers
content_map_shape[0:2] = (content_map_shape[0:2] + 1) // 2
style_map_shape[0:2] = (style_map_shape[0:2] + 1) // 2
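# e.g. for a 200x300 content map with style_layers = ["conv2_1", "conv3_1"]:
# matching runs at pool=2 on the 100x150 maps and at pool=3 on the 50x75 maps,
# mirroring the spatial sizes that conv2_1 and conv3_1 see inside VGG19.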
"""
Evaluate Style Image
"""
print("Building Model to Evaluate Style Image")
tf.reset_default_graph()
# Build a Model with variable sized image
img = tf.placeholder(tf.float32, shape=[1,None,None,3])
vgg = VGG19(tf.Variable(img, dtype=tf.float32, validate_shape=False))
# for every layer in style layers, create a style layer
for layer in style_layers:
pool = int(layer[4])
vgg["feat_" + layer] = FeatureLayer(vgg[layer])
vgg["style_" + layer] = StyleLayerPatches(vgg["feat_" + layer], style_map_idxs[pool])
# Dictionaries to store Style Layer outputs
style = {}
with tf.Session() as sess:
print("Evaluating Style Image")
# initialize img as Style Image
sess.run(tf.global_variables_initializer(), feed_dict={img: NHWC(style_img)})
for layer in style_layers:
name = "style_" + layer
style[name] = vgg[name].eval()
"""
Style Transfer Model
"""
print("Building Model to Transfer Style")
tf.reset_default_graph()
# Build a Model with Content Map (alternatively can use random noise)
# We use Variable instead of placeholder as we wish to make use of Tensorflow's optimizers
img = tf.Variable(NHWC(content_map))
vgg = VGG19(img)
# for every layer in style layers, create a style layer
for layer in style_layers:
pool = int(layer[4])
vgg["feat_" + layer] = FeatureLayer(vgg[layer])
vgg["style_" + layer] = StyleLayerPatches(vgg["feat_" + layer], content_map_idxs[pool])
"""
Loss Function
"""
style_loss = 0
for layer in style_layers:
    name = "style_" + layer
    style_loss += tf.reduce_mean(tf.square(style[name] - vgg[name]))
style_loss /= len(style_layers)  # each style layer has equal weight
# Total variation loss (encourages adjacent pixels to have a similar colour)
tv_loss = tf.reduce_mean(((img[:, 1:, :-1, :] - img[:, :-1, :-1, :])**2 +
                          (img[:, :-1, 1:, :] - img[:, :-1, :-1, :])**2)**1.25)
loss = 25*style_loss + 1*tv_loss
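# In math form, the total-variation term above is roughly
#   tv_loss = mean(((img[y+1,x] - img[y,x])^2 + (img[y,x+1] - img[y,x])^2)^1.25),
# i.e. squared vertical and horizontal pixel differences raised to the
# power 1.25 before averaging.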
"""
Backprop
"""
print("Stylising")
# Backpropagate error to img using scipy's L-BFGS optimizer
train_step =tf.contrib.opt.ScipyOptimizerInterface(loss,
var_list=[img],
options={"maxiter":200, "disp" : True})
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
train_step.minimize(sess,
fetches=[loss],
loss_callback=lambda x : print("loss: %f"%x))
# img is TF Tensor with shape [N,H,W,C]. Discard the first dimension
stylised_img = img.eval()[0]
# Clip the values to 0.0-1.0
stylised_img = np.clip(stylised_img, 0, 1)
# Show img
skimage.io.imshow(stylised_img)