diff --git a/01 Wetterskip_water_land V03 (180418).ipynb b/01 Wetterskip_water_land V03 (180418).ipynb new file mode 100644 index 0000000..129c248 --- /dev/null +++ b/01 Wetterskip_water_land V03 (180418).ipynb @@ -0,0 +1,487 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wetterskip Fryslan\n", + "### Onderscheid water van niet-water obv RGB image en Class image\n", + "\n", + "#### Idee\n", + "- gebruik een sliding window met variabel formaat\n", + "- zet class image (zwart wit foto) om naar een vector. Het resultaat is dus 1 lange rij met eenen (water) en nullen (geen water).\n", + "- zet feature image (kleuren foto) om naar een matrix met evenveel rijen als de class vector. Deze feature matrix bevat uiteindelijk in elke rij, per layer (RGB) het aantal pixels van het sliding window. Dus stel het window is 2 bij 2 en de RGB foto heeft 3 lagen dan heeft elke rij in de matrix 2 maal 2 maal 3 = 12 kolommen. \n", + "\n", + "#### Dit script\n", + "- Data preparatie: \n", + " - lees class en feature images in array\n", + " - Met sliding windows techniek om feature en class maken\n", + "- Splitsen data in een train en testset (validatie wordt in ander script gedaan)\n", + "- Train model op :\n", + " - Naive Bayes\n", + " - Logistic Regression\n", + " - Random Forrest\n", + "\n", + "#### Todo\n", + "- Labelling afmaken\n", + " - Basis opzet is klaar\n", + " - class '2' verwijderen uit test en train set. Dit betreft categorie 'onbekend'\n", + " - 180418 OD: Categorie 'onbekend' verwijderd. Indien threshold waarde voor water niet wordt gehaald, dan 'niet water'=0\n", + "- Koppelen aan werkelijke (RGB) foto \n", + " - Basis opzet is klaar\n", + " - 180418 OD: originele tiff sample bestanden kunnen niet worden ingelezen met PIL omdat deze een afwijkende encoding hebben. Daarom deze met image editor omgezet naar .png.\n", + "- Test/train set maken\n", + " - Basis opzet is klaar\n", + " - 180418 OD: nu nog nieuwe foto als validatie set gebruiken \n", + "- Model kiezen \n", + " - Eerste test met logistic regression gedaan \n", + " - Naive base classificatie als baseline ingevoegd) \n", + " - RandomForrestClassifier geeft betere score op testset. \n", + " - Na iedere fit, met pickle het getrainde model opslaan voor nieuwe trainingsdoeleinden. \n", + "- Parameters tunen\n", + "\n", + "#### Opmerkingen\n", + "- Paul: ik heb nog wat ruzie met het openen van de tif bestanden. Om die reden omgezet naar jpg. Bij voorkeur wordt dit nog aangepast. \n", + "- 180418 OD: jpg geen favoriet formaat ivm lossy compression. PNG is een goed alternatief voor TIF" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bibliotheken laden" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pickle\n", + "from PIL import Image\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.naive_bayes import GaussianNB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Open Class Bestand" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# open class bestand in ZWART-WIT (convert('l'))\n", + "ClassImg = Image.open(\"labels.tif\").convert('L')\n", + "# zet om naar array (matrix)\n", + "ClassArray = np.array(ClassImg)\n", + "# zet breedte en hoogte om in variabelen\n", + "ClassImageWidth, ClassImageHeight = ClassArray.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[255, 255, 255, ..., 255, 255, 255],\n", + " [255, 255, 255, ..., 255, 255, 255],\n", + " [255, 255, 255, ..., 255, 255, 255],\n", + " ...,\n", + " [ 0, 0, 0, ..., 255, 255, 255],\n", + " [ 0, 0, 0, ..., 255, 255, 255],\n", + " [ 0, 0, 0, ..., 255, 255, 255]], dtype=uint8)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ClassArray" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# vervang alle waarden van 255 (wit) door 1 en alle andere door 0 (zwart)\n", + "ClassArray[ClassArray < 255] = 1 # zwart/water\n", + "ClassArray[ClassArray == 255] = 0 # wit/geen water" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# grootte van het sliding window\n", + "WindowLength = 5\n", + "WindowSize = WindowLength**2\n", + "# variable ter bepaling van de hoeveelheid pixels naar rechts en naar beneden (de 'slide')\n", + "Step = 1 \n", + "# tijdelijke teller\n", + "Counter = 0 \n", + "# drempel waarboven een window als water wordt geclassificeerd\n", + "Threshold = .8 \n", + "\n", + "# bepaal hoe groot de output vector wordt en creeer een lege array:\n", + "ClassArrayResult = np.zeros(shape=(((ClassImageWidth - WindowLength) * (ClassImageHeight - WindowLength))))\n", + "\n", + "# voor elke rij\n", + "for i in range(0,ClassImageWidth-WindowLength,Step):\n", + " # voor elke kolom\n", + " for j in range(0, ClassImageHeight-WindowLength,Step):\n", + " # wanneer de som van de waarden in het window gelijk is aan het totale aantal pixels in het window dan\n", + " # zitten er dus alleen maar 1'en in --> dus water\n", + " if (ClassArray[i:i + WindowLength, j:j + WindowLength]==1).sum() == (WindowSize):\n", + " ClassArrayResult[Counter] = 1\n", + " # wanneer de som van de waarden in het window gelijk is aan nul dan zitten er dus geen 1'en in --> dus geen water\n", + " elif (ClassArray[i:i + WindowLength, j:j + WindowLength]==1).sum() == 0:\n", + " ClassArrayResult[Counter] = 0\n", + " # wanneer het aantal 1'en gedeeld door het aantal pixels groter is dan de toegestane drempel, dan alsnog \n", + " # classificeren als 'water'\n", + " elif ((ClassArray[i:i + WindowLength, j:j + WindowLength]==1).sum() / (WindowSize)) > Threshold:\n", + " ClassArrayResult[Counter] = 1\n", + " # in de resterende gevallen is het onbekend\n", + " else:\n", + " ClassArrayResult[Counter] = 0\n", + " \n", + " Counter += 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Openen Feature foto (RGB)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# open feature foto (RGB).\n", + "FeatureImg = Image.open(\"feature1.png\").convert('RGB')\n", + "# zet om naar array (matrix)\n", + "FeatureArray = np.array(FeatureImg)\n", + "# zet breedte, hoogte en diepte om in variabelen\n", + "FeatureImageWidth, FeatureImageHeight, FeatureImageLayers = FeatureArray.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# bepaal het formaat van de output matrix en creeer een lege array:\n", + "FeatureArrayResult = np.zeros(shape=(((FeatureImageWidth - WindowLength) * (FeatureImageHeight - WindowLength)), (WindowSize)*FeatureImageLayers))\n", + "\n", + "# voor elke layer\n", + "for z in range(0, FeatureImageLayers):\n", + " Counter = 0\n", + " # voor elke rij\n", + " for i in range(0, FeatureImageWidth-WindowLength, Step):\n", + " # voor elke kolom\n", + " for j in range(0, FeatureImageHeight-WindowLength, Step):\n", + " # zet de betreffende waarden per layer in feature matrix\n", + " FeatureArrayResult[Counter, (z*(WindowSize)):(z*(WindowSize)+(WindowSize))] = (FeatureArray[i:i + WindowLength, j:j + WindowLength, z]).flatten() \n", + " Counter += 1\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Splitsen data in Train- en Testset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# maak train en test set\n", + "X_train, X_test, y_train, y_test = train_test_split(FeatureArrayResult, ClassArrayResult, random_state=0) \n", + "# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Naive Bayes (baseline)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train model met Naive Bayes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# train een model op basis van Gaussian Naive Bayes\n", + "gnb = GaussianNB()\n", + "gnb_clf = gnb.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# save het getrainde Naive Bayes model naar schijf\n", + "filename = 'GaussianNaiveBayes_model.sav'\n", + "pickle.dump(gnb_clf, open(filename, 'wb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Voorspel met Gaussian Naive Bayes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.94 0.87 0.90 183679\n", + " 1.0 0.69 0.83 0.75 63828\n", + "\n", + "avg / total 0.87 0.86 0.86 247507\n", + "\n" + ] + } + ], + "source": [ + "# bekijk resultaten\n", + "gnb_predictions = gnb_clf.predict(X_test)\n", + "print(classification_report(y_test, gnb_predictions))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train Logistic Regression model" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# train model (Logistic regression in dit geval, eerst zonder parameter tuning)\n", + "classifier = LogisticRegression() \n", + "classifier.fit(X_train,y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# save het getrainde LogisticRegression model naar schijf\n", + "filename = 'Logistic_Regression_model.sav'\n", + "pickle.dump(classifier, open(filename, 'wb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Voorspelling Logistic Regression model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.98 0.98 0.98 183679\n", + " 1.0 0.94 0.94 0.94 63828\n", + "\n", + "avg / total 0.97 0.97 0.97 247507\n", + "\n" + ] + } + ], + "source": [ + "# bekijk resultaten\n", + "predictions = classifier.predict(X_test)\n", + "print(classification_report(y_test, predictions))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random Forrest" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train Random Forrest model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", + " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", + " oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# train model RandomForrest \n", + "rf_clf = RandomForestClassifier()\n", + "rf_clf.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "filename = 'RandomForrestClassifier_model.sav'\n", + "pickle.dump(rf_clf, open(filename, 'wb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Voorspel uitkomt op basis van Random Forrest" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0.0 0.99 1.00 0.99 183679\n", + " 1.0 0.99 0.97 0.98 63828\n", + "\n", + "avg / total 0.99 0.99 0.99 247507\n", + "\n" + ] + } + ], + "source": [ + "rf_pred = rf_clf.predict(X_test)\n", + "print(classification_report(y_test, rf_pred))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}