 36 |  36 |   {
 37 |  37 |   "cell_type": "code",
 38 |  38 |   "execution_count": null,
 39 |     | - "outputs": [],
 40 |     | - "source": [
 41 |     | - "katib_python_sdk = \"kubeflow-katib\""
 42 |     | - ],
    |  39 | + "id": "1fedadf1",
 43 |  40 |   "metadata": {
 44 |  41 |   "collapsed": false,
    |  42 | + "editable": true,
    |  43 | + "jupyter": {
    |  44 | + "outputs_hidden": false
    |  45 | + },
 45 |  46 |   "pycharm": {
 46 |  47 |   "name": "#%%\n"
 47 |     | - }
 48 |     | - }
 49 |     | - },
 50 |     | - {
 51 |     | - "cell_type": "code",
 52 |     | - "execution_count": null,
    |  48 | + },
    |  49 | + "slideshow": {
    |  50 | + "slide_type": ""
    |  51 | + },
    |  52 | + "tags": [
    |  53 | + "parameters"
    |  54 | + ]
    |  55 | + },
 53 |  56 |   "outputs": [],
 54 |  57 |   "source": [
 55 |     | - "!pip install {katib_python_sdk}\n",
 56 |     | - "!pip install tensorflow\n",
 57 |     | - "!pip install kubeflow-training"
 58 |     | - ],
 59 |     | - "metadata": {
 60 |     | - "collapsed": false,
 61 |     | - "pycharm": {
 62 |     | - "name": "#%%\n"
 63 |     | - }
 64 |     | - }
    |  58 | + "katib_python_sdk = \"kubeflow-katib\"\n",
    |  59 | + "namespace = \"default\" "
    |  60 | + ]
 65 |  61 |   },
 66 |  62 |   {
 67 |  63 |   "cell_type": "code",
 68 |  64 |   "execution_count": null,
 69 |     | - "id": "5de885ca-e96a-4d59-9e78-75f6fc6f5ce7",
    |  65 | + "id": "518cfe2d",
 70 |  66 |   "metadata": {
 71 |     | - "editable": true,
    |  67 | + "collapsed": false,
    |  68 | + "jupyter": {
    |  69 | + "outputs_hidden": false
    |  70 | + },
 72 |  71 |   "pycharm": {
 73 |  72 |   "name": "#%%\n"
 74 |     | - },
 75 |     | - "slideshow": {
 76 |     | - "slide_type": ""
 77 |     | - },
 78 |     | - "tags": []
    |  73 | + }
 79 |  74 |   },
 80 |  75 |   "outputs": [],
 81 |  76 |   "source": [
  …
 87 |  82 |   {
 88 |  83 |   "cell_type": "code",
 89 |  84 |   "execution_count": null,
 90 |     | - "id": "15ea90f2-ad99-4bf4-ae07-cc35bcbe6884",
    |  85 | + "id": "7211d9e4",
 91 |  86 |   "metadata": {
    |  87 | + "collapsed": false,
 92 |  88 |   "editable": true,
    |  89 | + "jupyter": {
    |  90 | + "outputs_hidden": false
    |  91 | + },
 93 |  92 |   "pycharm": {
 94 |  93 |   "name": "#%%\n"
 95 |  94 |   },
 96 |  95 |   "slideshow": {
 97 |  96 |   "slide_type": ""
 98 |  97 |   },
 99 |     | - "tags": [
100 |     | - "parameters"
101 |     | - ]
    |  98 | + "tags": []
102 |  99 |   },
103 | 100 |   "outputs": [],
104 | 101 |   "source": [
105 |     | - "# Experiment namespace\n",
106 |     | - "namespace = \"default\" "
    | 102 | + "!pip install {katib_python_sdk}\n",
    | 103 | + "!pip install tensorflow\n",
    | 104 | + "!pip install kubeflow-training"
107 | 105 |   ]
108 | 106 |   },
109 | 107 |   {
110 | 108 |   "cell_type": "markdown",
111 |     | - "id": "881aae2f-d08e-4439-bef9-1684ff87556d",
112 |     | - "metadata": {
113 |     | - "pycharm": {
114 |     | - "name": "#%% md\n"
115 |     | - },
116 |     | - "tags": []
117 |     | - },
118 | 109 |   "source": [
119 | 110 |   "## Create Train Script for CNN Model\n",
120 | 111 |   "\n",
121 | 112 |   "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). "
122 |     | - ]
123 |     | - },
124 |     | - {
125 |     | - "cell_type": "code",
126 |     | - "execution_count": 1,
127 |     | - "id": "98a77484-cee4-4d77-bd76-915c469a242b",
    | 113 | + ],
128 | 114 |   "metadata": {
129 |     | - "editable": true,
130 |     | - "execution": {
131 |     | - "iopub.execute_input": "2022-09-12T18:46:59.051290Z",
132 |     | - "iopub.status.busy": "2022-09-12T18:46:59.050544Z",
133 |     | - "iopub.status.idle": "2022-09-12T18:46:59.249456Z",
134 |     | - "shell.execute_reply": "2022-09-12T18:46:59.248292Z",
135 |     | - "shell.execute_reply.started": "2022-09-12T18:46:59.051211Z"
136 |     | - },
    | 115 | + "collapsed": false,
137 | 116 |   "pycharm": {
138 |     | - "name": "#%%\n"
139 |     | - },
140 |     | - "slideshow": {
141 |     | - "slide_type": ""
142 |     | - },
143 |     | - "tags": []
144 |     | - },
145 |     | - "outputs": [],
146 |     | - "source": [
147 |     | - "def train_mnist_model(parameters):\n",
148 |     | - "    import tensorflow as tf\n",
149 |     | - "    import numpy as np\n",
150 |     | - "    import logging\n",
151 |     | - "\n",
152 |     | - "    logging.basicConfig(\n",
153 |     | - "        format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
154 |     | - "        datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
155 |     | - "        level=logging.INFO,\n",
156 |     | - "    )\n",
157 |     | - "    logging.info(\"--------------------------------------------------------------------------------------\")\n",
158 |     | - "    logging.info(f\"Input Parameters: {parameters}\")\n",
159 |     | - "    logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
160 |     | - "\n",
161 |     | - "\n",
162 |     | - "    # Get HyperParameters from the input params dict.\n",
163 |     | - "    lr = float(parameters[\"lr\"])\n",
164 |     | - "    num_epoch = int(parameters[\"num_epoch\"])\n",
165 |     | - "\n",
166 |     | - "    # Set dist parameters and strategy.\n",
167 |     | - "    is_dist = parameters[\"is_dist\"]\n",
168 |     | - "    num_workers = parameters[\"num_workers\"]\n",
169 |     | - "    batch_size_per_worker = 64\n",
170 |     | - "    batch_size_global = batch_size_per_worker * num_workers\n",
171 |     | - "    strategy = tf.distribute.MultiWorkerMirroredStrategy(\n",
172 |     | - "        communication_options=tf.distribute.experimental.CommunicationOptions(\n",
173 |     | - "            implementation=tf.distribute.experimental.CollectiveCommunication.RING\n",
174 |     | - "        )\n",
175 |     | - "    )\n",
176 |     | - "\n",
177 |     | - "    # Callback class for logging training.\n",
178 |     | - "    # Katib parses metrics in this format: <metric-name>=<metric-value>.\n",
179 |     | - "    class CustomCallback(tf.keras.callbacks.Callback):\n",
180 |     | - "        def on_epoch_end(self, epoch, logs=None):\n",
181 |     | - "            logging.info(\n",
182 |     | - "                \"Epoch {}/{}. accuracy={:.4f} - loss={:.4f}\".format(\n",
183 |     | - "                    epoch+1, num_epoch, logs[\"accuracy\"], logs[\"loss\"]\n",
184 |     | - "                )\n",
185 |     | - "            )\n",
186 |     | - "\n",
187 |     | - "    # Prepare MNIST Dataset.\n",
188 |     | - "    def mnist_dataset(batch_size):\n",
189 |     | - "        (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\n",
190 |     | - "        x_train = x_train / np.float32(255)\n",
191 |     | - "        y_train = y_train.astype(np.int64)\n",
192 |     | - "        train_dataset = (\n",
193 |     | - "            tf.data.Dataset.from_tensor_slices((x_train, y_train))\n",
194 |     | - "            .shuffle(60000)\n",
195 |     | - "            .repeat()\n",
196 |     | - "            .batch(batch_size)\n",
197 |     | - "        )\n",
198 |     | - "        return train_dataset\n",
199 |     | - "\n",
200 |     | - "    # Build and compile CNN Model.\n",
201 |     | - "    def build_and_compile_cnn_model():\n",
202 |     | - "        model = tf.keras.Sequential(\n",
203 |     | - "            [\n",
204 |     | - "                tf.keras.layers.InputLayer(input_shape=(28, 28)),\n",
205 |     | - "                tf.keras.layers.Reshape(target_shape=(28, 28, 1)),\n",
206 |     | - "                tf.keras.layers.Conv2D(32, 3, activation=\"relu\"),\n",
207 |     | - "                tf.keras.layers.Flatten(),\n",
208 |     | - "                tf.keras.layers.Dense(128, activation=\"relu\"),\n",
209 |     | - "                tf.keras.layers.Dense(10),\n",
210 |     | - "            ]\n",
211 |     | - "        )\n",
212 |     | - "        model.compile(\n",
213 |     | - "            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
214 |     | - "            optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n",
215 |     | - "            metrics=[\"accuracy\"],\n",
216 |     | - "        )\n",
217 |     | - "        return model\n",
218 |     | - "    \n",
219 |     | - "    # Download Dataset.\n",
220 |     | - "    dataset = mnist_dataset(batch_size_global)\n",
221 |     | - "\n",
222 |     | - "    # For dist strategy we should build model under scope().\n",
223 |     | - "    if is_dist:\n",
224 |     | - "        logging.info(\"Running Distributed Training\")\n",
225 |     | - "        logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
226 |     | - "        with strategy.scope():\n",
227 |     | - "            model = build_and_compile_cnn_model()\n",
228 |     | - "    else:\n",
229 |     | - "        logging.info(\"Running Single Worker Training\")\n",
230 |     | - "        logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
231 |     | - "        model = build_and_compile_cnn_model()\n",
232 |     | - "    \n",
233 |     | - "    # Start Training.\n",
234 |     | - "    model.fit(\n",
235 |     | - "        dataset,\n",
236 |     | - "        epochs=num_epoch,\n",
237 |     | - "        steps_per_epoch=70,\n",
238 |     | - "        callbacks=[CustomCallback()],\n",
239 |     | - "        verbose=0,\n",
240 |     | - "    )"
241 |     | - ]
    | 117 | + "name": "#%% md\n"
    | 118 | + }
    | 119 | + }
242 | 120 |   },
243 | 121 |   {
244 | 122 |   "cell_type": "markdown",