diff --git a/neuralredis.c b/neuralredis.c index bfabd09..18dbf13 100644 --- a/neuralredis.c +++ b/neuralredis.c @@ -64,7 +64,7 @@ uint64_t NRNextId = 1; /* Next neural network unique ID. */ #define NR_FLAG_BACKTRACK (1<<6) /* Auto stop with backtracking. */ /* Flags to persist when saving the NN. */ -#define NR_FLAG_TO_PRESIST (NR_FLAG_REGRESSOR| \ +#define NR_FLAG_TO_PERSIST (NR_FLAG_REGRESSOR| \ NR_FLAG_CLASSIFIER| \ NR_FLAG_NORMALIZE| \ NR_FLAG_OF_DETECTED) @@ -91,7 +91,7 @@ typedef struct { uint64_t training_max_ms; /* Max time of a single training. */ uint32_t flags; /* NR_FLAG_... */ uint32_t epochs; /* Number of training epochs so far. */ - struct Ann *nn; /* Neural network structure. */ + AnnRprop *nn; /* Neural network structure. */ NRDataset dataset; /* Training dataset. */ NRDataset test; /* Testing dataset. */ float dataset_error; /* Average error in the training dataset. */ @@ -104,7 +104,7 @@ typedef struct { float *onorm; /* Outputs normalization factors. */ } NRTypeObject; -struct { +typedef struct { RedisModuleString *key; /* Key name of the NN we are training. Set to NULL for unused slots. */ int db_id; /* DB ID where the key is. */ @@ -115,7 +115,7 @@ struct { float test_error; /* Test error in the last cycle. */ float class_error; /* Percentage of wrong classifications. */ int curcycle; /* Current cycle. */ -} typedef NRPendingTraining; +} NRPendingTraining; /* We take an array with NNs currently training in other threads. * Every time an NN command is called, we try to see if there are @@ -152,8 +152,8 @@ NRTypeObject *createNRTypeObject(int flags, int *layers, int numlayers, int dset o->nn = AnnCreateNet(numlayers,layers); o->dataset.maxlen = dset_len; o->test.maxlen = test_len; - int ilen = INPUT_UNITS(o->nn); - int olen = OUTPUT_UNITS(o->nn); + int ilen = ANN_INPUT_UNITS(o->nn); + int olen = ANN_OUTPUT_UNITS(o->nn); o->inorm = RedisModule_Calloc(1,sizeof(float)*ilen); o->onorm = RedisModule_Calloc(1,sizeof(float)*olen); for (int j = 0; j < ilen; j++) o->inorm[j] = 1; @@ -216,8 +216,8 @@ void NRTypeInsertData(NRTypeObject *o, float *inputs, float *outputs, /* Append if there is room or substitute with a random entry. */ size_t idx; - int j, numin = INPUT_UNITS(o->nn), - numout = OUTPUT_UNITS(o->nn); + int j, numin = ANN_INPUT_UNITS(o->nn), + numout = ANN_OUTPUT_UNITS(o->nn); if (target->maxlen == target->len) { idx = rand() % target->maxlen; @@ -275,8 +275,8 @@ NRTypeObject *NRClone(NRTypeObject *o, int newid) { copy->dataset = o->dataset; copy->test = o->test; - int ilen = INPUT_UNITS(o->nn); - int olen = OUTPUT_UNITS(o->nn); + int ilen = ANN_INPUT_UNITS(o->nn); + int olen = ANN_OUTPUT_UNITS(o->nn); copy->dataset.inputs = RedisModule_Alloc(sizeof(float)*ilen*o->dataset.len); copy->dataset.outputs = RedisModule_Alloc(sizeof(float)*olen*o->dataset.len); copy->test.inputs = RedisModule_Alloc(sizeof(float)*ilen*o->test.len); @@ -318,8 +318,8 @@ void NRTransferWeights(RedisModuleCtx *ctx, NRTypeObject *dst, NRTypeObject *src dst->test_class_error = src->test_class_error; dst->flags |= src->flags & NR_FLAG_TO_TRANSFER; - int ilen = INPUT_UNITS(src->nn); - int olen = OUTPUT_UNITS(src->nn); + int ilen = ANN_INPUT_UNITS(src->nn); + int olen = ANN_OUTPUT_UNITS(src->nn); memcpy(dst->inorm,src->inorm,sizeof(float)*ilen); memcpy(dst->onorm,src->onorm,sizeof(float)*olen); } @@ -361,8 +361,8 @@ void *NRTrainingThreadMain(void *arg) { * (NR_FLAG_CLASSIFIER), no output normalization will be done since * the data is already in 0/1 format. 
*/ if ((nr->flags & NR_FLAG_NORMALIZE) && nr->dataset.len) { - int ilen = INPUT_UNITS(nr->nn); - int olen = OUTPUT_UNITS(nr->nn); + int ilen = ANN_INPUT_UNITS(nr->nn); + int olen = ANN_OUTPUT_UNITS(nr->nn); float *imax = nr->inorm; float *omax = nr->onorm; float *inputs = nr->dataset.inputs; @@ -410,7 +410,7 @@ void *NRTrainingThreadMain(void *arg) { } } - struct Ann *saved = NULL; /* Saved to recover on overfitting. */ + AnnRprop *saved = NULL; /* Saved to recover on overfitting. */ float saved_error; /* The test error of the saved NN. */ float saved_train_error; /* The training dataset error of the saved NN */ float saved_class_error; /* The % of classification errors of saved NN */ @@ -424,7 +424,7 @@ void *NRTrainingThreadMain(void *arg) { 0, training_iterations, nr->dataset.len, - NN_ALGO_BPROP); + ANN_ALGO_BPROP); cycle_time = NRMilliseconds() - cycle_start; nr->training_total_steps += nr->dataset.len*training_iterations; @@ -747,7 +747,7 @@ int NRGenericRun_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int "Use this command with a classifier network"); - int ilen = INPUT_UNITS(nr->nn); + int ilen = ANN_INPUT_UNITS(nr->nn); if (argc != ilen+2) return RedisModule_ReplyWithError(ctx, "ERR number of arguments does not " @@ -760,19 +760,19 @@ int NRGenericRun_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int "ERR invalid neural network input: must be a valid float " "precision floating point number"); if (nr->flags & NR_FLAG_NORMALIZE) input /= nr->inorm[j]; - INPUT_NODE(nr->nn,j) = input; + ANN_INPUT_NODE(nr->nn,j) = input; } AnnSimulate(nr->nn); /* Output the raw net output or the class ID if the network * is a classifier and the command invoked was NR.CLASS. */ - int olen = OUTPUT_UNITS(nr->nn); + int olen = ANN_OUTPUT_UNITS(nr->nn); if (output_class) { - float max = OUTPUT_NODE(nr->nn,0); + float max = ANN_OUTPUT_NODE(nr->nn,0); int max_class = 0; for(int j = 1; j < olen; j++) { - float output = OUTPUT_NODE(nr->nn,j); + float output = ANN_OUTPUT_NODE(nr->nn,j); if (output > max) { max = output; max_class = j; @@ -782,7 +782,7 @@ int NRGenericRun_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int } else { RedisModule_ReplyWithArray(ctx,olen); for(int j = 0; j < olen; j++) { - float output = OUTPUT_NODE(nr->nn,j); + float output = ANN_OUTPUT_NODE(nr->nn,j); if (!(nr->flags & NR_FLAG_CLASSIFIER) && (nr->flags & NR_FLAG_NORMALIZE)) { @@ -816,8 +816,8 @@ int NRObserve_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int ar return RedisModule_ReplyWithError(ctx,REDISMODULE_ERRORMSG_WRONGTYPE); NRTypeObject *nr = RedisModule_ModuleTypeGetValue(key); - int ilen = INPUT_UNITS(nr->nn); - int olen = OUTPUT_UNITS(nr->nn); + int ilen = ANN_INPUT_UNITS(nr->nn); + int olen = ANN_OUTPUT_UNITS(nr->nn); int oargs = (nr->flags & NR_FLAG_CLASSIFIER) ? 1 : olen; int target = NR_INSERT_NO_TARGET; @@ -1017,9 +1017,9 @@ int NRInfo_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) RedisModule_ReplyWithLongLong(ctx,!!(nr->flags & NR_FLAG_TRAINING)); RedisModule_ReplyWithSimpleString(ctx,"layout"); - RedisModule_ReplyWithArray(ctx,LAYERS(nr->nn)); - for (int i = LAYERS(nr->nn)-1; i >= 0; i--) { - int units = UNITS(nr->nn,i); + RedisModule_ReplyWithArray(ctx,ANN_LAYERS(nr->nn)); + for (int i = ANN_LAYERS(nr->nn)-1; i >= 0; i--) { + int units = ANN_UNITS(nr->nn,i); if (i != 0) units--; /* Don't count the bias unit. 
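 * Hidden and input layers carry one extra internal bias unit, so a network
 * whose user-visible layout is, say, 3 inputs, 5 hidden units and 2 outputs
 * has ANN_UNITS() of 4, 6 and 2; subtracting one here makes the layout
 * reply show the original 3/5/2 figures (the sizes in this example are
 * illustrative only).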
*/ RedisModule_ReplyWithLongLong(ctx,units); } @@ -1111,8 +1111,8 @@ int NRGetdata_RedisCommand(RedisModuleCtx *ctx, RedisModuleString **argv, int ar NRTypeObject *nr = RedisModule_ModuleTypeGetValue(key); - int ilen = INPUT_UNITS(nr->nn); - int olen = OUTPUT_UNITS(nr->nn); + int ilen = ANN_INPUT_UNITS(nr->nn); + int olen = ANN_OUTPUT_UNITS(nr->nn); NRDataset *target = NULL; long long idx; @@ -1173,15 +1173,15 @@ void NRTypeRdbSave(RedisModuleIO *rdb, void *value) { NRTypeObject *nr = value; /* Save the neural network layout. */ - RedisModule_SaveUnsigned(rdb,LAYERS(nr->nn)); - for (int j = 0; j < LAYERS(nr->nn); j++) { - int units = UNITS(nr->nn,j); + RedisModule_SaveUnsigned(rdb,ANN_LAYERS(nr->nn)); + for (int j = 0; j < ANN_LAYERS(nr->nn); j++) { + int units = ANN_UNITS(nr->nn,j); if (j != 0) units--; /* Don't count the bias unit. */ RedisModule_SaveUnsigned(rdb,units); } /* Save the object metadata. */ - RedisModule_SaveUnsigned(rdb,nr->flags & NR_FLAG_TO_PRESIST); + RedisModule_SaveUnsigned(rdb,nr->flags & NR_FLAG_TO_PERSIST); RedisModule_SaveUnsigned(rdb,nr->id); RedisModule_SaveUnsigned(rdb,nr->training_total_steps); RedisModule_SaveUnsigned(rdb,nr->training_total_ms); @@ -1193,8 +1193,8 @@ void NRTypeRdbSave(RedisModuleIO *rdb, void *value) { /* Save the neural network weights and biases. We start * at layer 1 since the first layer are just outputs. */ - for (int j = 1; j < LAYERS(nr->nn); j++) { - int weights = WEIGHTS(nr->nn,j); + for (int j = 1; j < ANN_LAYERS(nr->nn); j++) { + int weights = ANN_WEIGHTS(nr->nn,j); for (int i = 0; i < weights; i++) RedisModule_SaveFloat(rdb,nr->nn->layer[j].weight[i]); for (int i = 0; i < weights; i++) @@ -1204,8 +1204,8 @@ void NRTypeRdbSave(RedisModuleIO *rdb, void *value) { } /* Save the normalization vectors. */ - uint32_t ilen = INPUT_UNITS(nr->nn); - uint32_t olen = OUTPUT_UNITS(nr->nn); + uint32_t ilen = ANN_INPUT_UNITS(nr->nn); + uint32_t olen = ANN_OUTPUT_UNITS(nr->nn); for (uint32_t j = 0; j < ilen; j++) RedisModule_SaveFloat(rdb,nr->inorm[j]); for (uint32_t j = 0; j < olen; j++) RedisModule_SaveFloat(rdb,nr->onorm[j]); @@ -1261,8 +1261,8 @@ void *NRTypeRdbLoad(RedisModuleIO *rdb, int encver) { nr->test_class_error = RedisModule_LoadFloat(rdb); /* Load the neural network weights. */ - for (int j = 1; j < LAYERS(nr->nn); j++) { - int weights = WEIGHTS(nr->nn,j); + for (int j = 1; j < ANN_LAYERS(nr->nn); j++) { + int weights = ANN_WEIGHTS(nr->nn,j); for (int i = 0; i < weights; i++) nr->nn->layer[j].weight[i] = RedisModule_LoadFloat(rdb); for (int i = 0; i < weights; i++) @@ -1272,8 +1272,8 @@ void *NRTypeRdbLoad(RedisModuleIO *rdb, int encver) { } /* Load the normalization vector. 
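 * One factor per input unit followed by one per output unit, in the same
 * order NRTypeRdbSave() wrote them; when NR_FLAG_NORMALIZE is set these are
 * the values raw inputs are divided by before AnnSimulate() runs.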
*/ - uint32_t ilen = INPUT_UNITS(nr->nn); - uint32_t olen = OUTPUT_UNITS(nr->nn); + uint32_t ilen = ANN_INPUT_UNITS(nr->nn); + uint32_t olen = ANN_OUTPUT_UNITS(nr->nn); for (uint32_t j = 0; j < ilen; j++) nr->inorm[j] = RedisModule_LoadFloat(rdb); for (uint32_t j = 0; j < olen; j++) diff --git a/nn.c b/nn.c index 2476c06..1f4b542 100644 --- a/nn.c +++ b/nn.c @@ -35,6 +35,12 @@ #include #include +#include "nn.h" + +/* +There is a known problem with memory alignment when using AVX and AVX-512: +aligned loads/stores work on some machines but fault on others, so the +unaligned intrinsics are still used for those targets. +*/ #if defined(USE_AVX512) #define USING_SIMD #include @@ -45,17 +51,19 @@ typedef __m512 simdf_t; #define simdf_zero() _mm512_setzero_ps() #define simdf_set1f(x) _mm512_set1_ps(x) #define simdf_loadu(x) _mm512_loadu_ps(x) +#define simdf_load(x) _mm512_loadu_ps(x) //we are still using unaligned here #define simdf_mul(a,b) _mm512_mul_ps(a,b) #define simdf_add(a,b) _mm512_add_ps(a,b) #define simdf_storeu(a,b) _mm512_storeu_ps(a,b) +#define simdf_store(a,b) _mm512_storeu_ps(a,b) //we are still using unaligned here //let the compiler optimize this #define simdf_sum(x) (x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7] + \ - x[8] + x[9] + x[10] + x[11] + x[12] + x[13] + x[14] + x[15]) + x[8] + x[9] + x[10] + x[11] + x[12] + x[13] + x[14] + x[15]) #define simdf_show(x) printf("%d : %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f\n", \ - __LINE__, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], \ - x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]); + __LINE__, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], \ + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]); #endif #if defined(USE_AVX) @@ -68,15 +76,17 @@ typedef __m256 simdf_t; #define simdf_zero() _mm256_setzero_ps() #define simdf_set1f(x) _mm256_set1_ps(x) #define simdf_loadu(x) _mm256_loadu_ps(x) +#define simdf_load(x) _mm256_loadu_ps(x) //we are still using unaligned here #define simdf_mul(a,b) _mm256_mul_ps(a,b) #define simdf_add(a,b) _mm256_add_ps(a,b) #define simdf_storeu(a,b) _mm256_storeu_ps(a,b) +#define simdf_store(a,b) _mm256_storeu_ps(a,b) //we are still using unaligned here //let the compiler optimize this #define simdf_sum(x) (x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7]) #define simdf_show(x) printf("%d : %f, %f, %f, %f, %f, %f, %f, %f\n", \ - __LINE__, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]); + __LINE__, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]); #endif #if defined(USE_SSE) @@ -89,9 +99,11 @@ typedef __m128 simdf_t; #define simdf_zero() _mm_setzero_ps() #define simdf_set1f(x) _mm_set1_ps(x) #define simdf_loadu(x) _mm_loadu_ps(x) +#define simdf_load(x) _mm_load_ps(x) #define simdf_mul(a,b) _mm_mul_ps(a,b) #define simdf_add(a,b) _mm_add_ps(a,b) #define simdf_storeu(a,b) _mm_storeu_ps(a,b) +#define simdf_store(a,b) _mm_store_ps(a,b) //let the compiler optimize this #define simdf_sum(x) (x[0] + x[1] + x[2] + x[3]) @@ -103,15 +115,17 @@ typedef __m128 simdf_t; #define USING_SIMD #include typedef float32x4_t simdf_t; #define SIMDF_SIZE 4 #define simdf_zero() vdupq_n_f32(0.0f) #define simdf_set1f(x) vdupq_n_f32(x); #define simdf_loadu(x) vld1q_f32(x) +#define simdf_load(x) vld1q_f32(x) #define simdf_mul(a,b) vmulq_f32(a,b) #define simdf_add(a,b) vaddq_f32(a,b) #define simdf_storeu(a,b) vst1q_f32((float32_t*)a,b) +#define simdf_store(a,b) vst1q_f32((float32_t*)a,b) //let the compiler optimize this #define simdf_sum(x) (x[0] + x[1] + x[2] + x[3]) @@
-119,20 +133,64 @@ typedef float32x4_t simdf_t; #define simdf_show(x) printf("%d : %f, %f, %f, %f\n", __LINE__, x[0], x[1], x[2], x[3]); #endif -#include "nn.h" +#ifndef SIMDF_SIZE +#define SIMDF_SIZE 1 +#endif // SIMDF_SIZE + +#define ANN_SIZEOF_ann_float_t sizeof(ann_float_t) +#define ANN_ALIGN_BASE (SIMDF_SIZE * ANN_SIZEOF_ann_float_t) +#define ANN_ALIGN_ROUND(x) ((x%ANN_ALIGN_BASE) ? (((x/ANN_ALIGN_BASE)+1)*ANN_ALIGN_BASE) : (size_t)x) + +#ifndef HAS_ANN_MALLOC +#define ann_malloc(x) malloc(x) +#define ann_free(x) free(x) +#else +extern void *ann_malloc(size_t sz); +extern void ann_free(void *ptr); +#endif +/* +void *nnpmalloc(int line, size_t sz) { + printf("%d : %zu : %zu\n", line, sz, ANN_ALIGN_ROUND(sz)); + return malloc(sz); +} +#define ann_malloc(x) nnpmalloc(__LINE__, x) +*/ /* Node Transfer Function */ -float sigmoid(float x) { - return (float)1/(1+exp(-x)); +ann_float_t AnnTransferFunctionSigmoid(ann_float_t x) { + return ((ann_float_t)1)/(1+exp(-x)); } -float relu(float x) { +ann_float_t AnnTransferFunctionRelu(ann_float_t x) { return (x > 0) ? x : 0; } +ann_float_t AnnTransferFunctionTanh(ann_float_t x) { + return tanh(x); +} + +/* +ann_float_t AnnDerivativeIdentity(ann_float_t x) { + return 1; +} +*/ + +ann_float_t AnnDerivativeSigmoid(ann_float_t x) { + return x*(1-x); +} + +ann_float_t AnnDerivativeTanh(ann_float_t x) { + return (1-x)*(1+x); +} + +ann_float_t AnnDerivativeRelu(ann_float_t x) { + return (x > 0) ? 1 : 0; +} + /* Reset layer data to zero-units */ -void AnnResetLayer(struct AnnLayer *layer) { +void AnnResetLayer(AnnLayer *layer) { layer->units = 0; + layer->units_aligned = 0; layer->output = NULL; layer->error = NULL; layer->weight = NULL; @@ -143,24 +201,26 @@ void AnnResetLayer(struct AnnLayer *layer) { } /* Allocate and return an initialized N-layers network */ -struct Ann *AnnAlloc(int layers) { - struct Ann *net; +AnnRprop *AnnAlloc(int layers) { + AnnRprop *net; int i; /* Alloc the net structure */ - if ((net = malloc(sizeof(*net))) == NULL) + if ((net = ann_malloc(sizeof(*net))) == NULL) return NULL; /* Alloc layers */ - if ((net->layer = malloc(sizeof(struct AnnLayer)*layers)) == NULL) { - free(net); + if ((net->layer = ann_malloc(sizeof(AnnLayer)*layers)) == NULL) { + ann_free(net); return NULL; } net->layers = layers; net->flags = 0; - net->rprop_nminus = DEFAULT_RPROP_NMINUS; - net->rprop_nplus = DEFAULT_RPROP_NPLUS; - net->rprop_maxupdate = DEFAULT_RPROP_MAXUPDATE; - net->rprop_minupdate = DEFAULT_RPROP_MINUPDATE; + net->rprop_nminus = ANN_DEFAULT_RPROP_NMINUS; + net->rprop_nplus = ANN_DEFAULT_RPROP_NPLUS; + net->rprop_maxupdate = ANN_DEFAULT_RPROP_MAXUPDATE; + net->rprop_minupdate = ANN_DEFAULT_RPROP_MINUPDATE; + net->node_transf_func = AnnTransferFunctionSigmoid; + net->derivative_func = AnnDerivativeSigmoid; /* Init layers */ for (i = 0; i < layers; i++) AnnResetLayer(&net->layer[i]); @@ -168,115 +228,114 @@ struct Ann *AnnAlloc(int layers) { } /* Free a single layer */ -void AnnFreeLayer(struct AnnLayer *layer) +void AnnFreeLayer(AnnLayer *layer) { - free(layer->output); - free(layer->error); - free(layer->weight); - free(layer->gradient); - free(layer->pgradient); - free(layer->delta); - free(layer->sgradient); + ann_free(layer->output); + ann_free(layer->error); + ann_free(layer->weight); + ann_free(layer->gradient); + ann_free(layer->pgradient); + ann_free(layer->delta); + ann_free(layer->sgradient); AnnResetLayer(layer); } /* Free the target net */ -void AnnFree(struct Ann *net) +void AnnFree(AnnRprop *net) { int i; /* Free layer data */ 
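/* A worked example of the alignment rounding used by AnnInitLayer() below
 * (figures assume USE_SSE, i.e. SIMDF_SIZE == 4, and 4-byte floats):
 *
 *   units = 5           -> 5 * sizeof(ann_float_t) = 20 bytes
 *   ANN_ALIGN_ROUND(20) -> 32 (next multiple of ANN_ALIGN_BASE = 16)
 *   units_aligned       -> 32 / sizeof(ann_float_t) = 8
 *
 * so each row of the per-layer weight/gradient arrays is padded to 8 floats
 * and the ANN_* accessor macros use units_aligned as the row stride. */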
for (i = 0; i < net->layers; i++) AnnFreeLayer(&net->layer[i]); /* Free allocated layers structures */ - free(net->layer); + ann_free(net->layer); /* And the main structure itself */ - free(net); + ann_free(net); } /* Init a layer of the net with the specified number of units. * Return non-zero on out of memory. */ -int AnnInitLayer(struct Ann *net, int i, int units, int bias) { +int AnnInitLayer(AnnRprop *net, int i, int units, int bias) { if (bias) units++; /* Take count of the bias unit */ - net->layer[i].output = malloc(sizeof(float)*units); - net->layer[i].error = malloc(sizeof(float)*units); + int ann_float_t_units = ANN_ALIGN_ROUND(units*ANN_SIZEOF_ann_float_t); + int units_aligned = ann_float_t_units/ANN_SIZEOF_ann_float_t; + int ann_float_t_units_units = 0; + AnnLayer *layer = &ANN_LAYER(net, i); + layer->units = units; + layer->units_aligned = units_aligned; + layer->output = ann_malloc(ann_float_t_units); + layer->error = ann_malloc(ann_float_t_units); if (i) { /* not for output layer */ - net->layer[i].weight = - malloc(sizeof(float)*units*net->layer[i-1].units); - net->layer[i].gradient = - malloc(sizeof(float)*units*net->layer[i-1].units); - net->layer[i].pgradient = - malloc(sizeof(float)*units*net->layer[i-1].units); - net->layer[i].delta = - malloc(sizeof(float)*units*net->layer[i-1].units); - net->layer[i].sgradient = - malloc(sizeof(float)*units*net->layer[i-1].units); - } - net->layer[i].units = units; + ann_float_t_units_units = ann_float_t_units*ANN_LAYER(net, i-1).units; + layer->weight = ann_malloc(ann_float_t_units_units); + layer->gradient = ann_malloc(ann_float_t_units_units); + layer->pgradient = ann_malloc(ann_float_t_units_units); + layer->delta = ann_malloc(ann_float_t_units_units); + layer->sgradient = ann_malloc(ann_float_t_units_units); + } /* Check for out of memory conditions */ - if (net->layer[i].output == NULL || - net->layer[i].error == NULL || - (i && net->layer[i].weight == NULL) || - (i && net->layer[i].gradient == NULL) || - (i && net->layer[i].pgradient == NULL) || - (i && net->layer[i].sgradient == NULL) || - (i && net->layer[i].delta == NULL)) + if (layer->output == NULL || + layer->error == NULL || + (i && layer->weight == NULL) || + (i && layer->gradient == NULL) || + (i && layer->pgradient == NULL) || + (i && layer->sgradient == NULL) || + (i && layer->delta == NULL)) { - AnnFreeLayer(&net->layer[i]); - AnnResetLayer(&net->layer[i]); + AnnFreeLayer(layer); + AnnResetLayer(layer); return 1; } /* Set all the values to zero */ - memset(net->layer[i].output, 0, sizeof(float)*units); - memset(net->layer[i].error, 0, sizeof(float)*units); + memset(layer->output, 0, ann_float_t_units); + memset(layer->error, 0, ann_float_t_units); if (i) { - memset(net->layer[i].weight, 0, - sizeof(float)*units*net->layer[i-1].units); - memset(net->layer[i].gradient, 0, - sizeof(float)*units*net->layer[i-1].units); - memset(net->layer[i].pgradient, 0, - sizeof(float)*units*net->layer[i-1].units); - memset(net->layer[i].delta, 0, - sizeof(float)*units*net->layer[i-1].units); - memset(net->layer[i].sgradient, 0, - sizeof(float)*units*net->layer[i-1].units); + memset(layer->weight, 0, ann_float_t_units_units); + memset(layer->gradient, 0, ann_float_t_units_units); + memset(layer->pgradient, 0, ann_float_t_units_units); + memset(layer->delta, 0, ann_float_t_units_units); + memset(layer->sgradient, 0, ann_float_t_units_units); } /* Set the bias unit output to 1 */ - if (bias) net->layer[i].output[units-1] = 1; + if (bias) layer->output[units-1] = 1; return 0; } /* 
Clone a network. On out of memory NULL is returned. */ -struct Ann *AnnClone(struct Ann* net) { - struct Ann* copy; +AnnRprop *AnnClone(const AnnRprop* net) { + AnnRprop* copy; int j; - if ((copy = AnnAlloc(LAYERS(net))) == NULL) return NULL; - for (j = 0; j < LAYERS(net); j++) { - struct AnnLayer *ldst, *lsrc; - int units = UNITS(net,j); + if ((copy = AnnAlloc(ANN_LAYERS(net))) == NULL) return NULL; + for (j = 0; j < ANN_LAYERS(net); j++) { + AnnLayer *ldst; + const AnnLayer *lsrc; + int units = ANN_UNITS(net,j); int bias = j > 0; if (AnnInitLayer(copy, j, units-bias, bias)) { AnnFree(copy); return NULL; } + int ann_float_t_units = units*ANN_SIZEOF_ann_float_t; lsrc = &net->layer[j]; ldst = ©->layer[j]; if (lsrc->output) - memcpy(ldst->output, lsrc->output, sizeof(float)*units); + memcpy(ldst->output, lsrc->output, ann_float_t_units); if (lsrc->error) - memcpy(ldst->error, lsrc->error, sizeof(float)*units); + memcpy(ldst->error, lsrc->error, ann_float_t_units); if (j) { - int weights = WEIGHTS(net,j); + int weights = ANN_WEIGHTS(net,j); + ann_float_t_units = weights*ANN_SIZEOF_ann_float_t; if (lsrc->weight) - memcpy(ldst->weight, lsrc->weight, sizeof(float)*weights); + memcpy(ldst->weight, lsrc->weight, ann_float_t_units); if (lsrc->gradient) - memcpy(ldst->gradient, lsrc->gradient, sizeof(float)*weights); + memcpy(ldst->gradient, lsrc->gradient, ann_float_t_units); if (lsrc->pgradient) - memcpy(ldst->pgradient, lsrc->pgradient, sizeof(float)*weights); + memcpy(ldst->pgradient, lsrc->pgradient, ann_float_t_units); if (lsrc->delta) - memcpy(ldst->delta, lsrc->delta, sizeof(float)*weights); + memcpy(ldst->delta, lsrc->delta, ann_float_t_units); if (lsrc->sgradient) - memcpy(ldst->sgradient, lsrc->sgradient, sizeof(float)*weights); + memcpy(ldst->sgradient, lsrc->sgradient, ann_float_t_units); } } copy->rprop_nminus = net->rprop_nminus; @@ -284,14 +343,16 @@ struct Ann *AnnClone(struct Ann* net) { copy->rprop_maxupdate = net->rprop_maxupdate; copy->rprop_minupdate = net->rprop_minupdate; copy->flags = net->flags; + copy->node_transf_func = net->node_transf_func; + copy->derivative_func = net->derivative_func; return copy; } /* Create a N-layer input/hidden/output net. * The units array should specify the number of * units in every layer from the output to the input layer. */ -struct Ann *AnnCreateNet(int layers, int *units) { - struct Ann *net; +AnnRprop *AnnCreateNet(int layers, int *units) { + AnnRprop *net; int i; if ((net = AnnAlloc(layers)) == NULL) return NULL; @@ -302,17 +363,17 @@ struct Ann *AnnCreateNet(int layers, int *units) { } } AnnSetRandomWeights(net); - AnnSetDeltas(net, RPROP_INITIAL_DELTA); - LEARN_RATE(net) = DEFAULT_LEARN_RATE; + AnnSetDeltas(net, ANN_RPROP_INITIAL_DELTA); + ANN_LEARN_RATE(net) = ANN_DEFAULT_LEARN_RATE; return net; } /* Return the total number of weights this NN has. 
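 * For example AnnCreateNet3(2,2,2) reports 12: each of the two hidden units
 * has two input weights plus a bias weight, and each of the two output
 * units has two hidden weights plus a bias weight, i.e. 2*3 + 2*3 = 12.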
*/ -size_t AnnCountWeights(struct Ann *net) { +size_t AnnCountWeights(AnnRprop *net) { size_t weights = 0; - for (int i = net->layers-1; i > 0; i--) { - int nextunits = net->layer[i-1].units; - int units = net->layer[i].units; + for (int i = ANN_LAYERS(net)-1; i > 0; i--) { + int nextunits = ANN_UNITS(net, i-1); + int units = ANN_UNITS(net, i); if (i > 1) nextunits--; /* we don't output on bias units */ weights += units*nextunits; } @@ -320,7 +381,7 @@ size_t AnnCountWeights(struct Ann *net) { } /* Create a 4-layer input/hidden/output net */ -struct Ann *AnnCreateNet4(int iunits, int hunits, int hunits2, int ounits) { +AnnRprop *AnnCreateNet4(int iunits, int hunits, int hunits2, int ounits) { int units[4]; units[0] = ounits; @@ -331,7 +392,7 @@ struct Ann *AnnCreateNet4(int iunits, int hunits, int hunits2, int ounits) { } /* Create a 3-layer input/hidden/output net */ -struct Ann *AnnCreateNet3(int iunits, int hunits, int ounits) { +AnnRprop *AnnCreateNet3(int iunits, int hunits, int ounits) { int units[3]; units[0] = ounits; @@ -342,7 +403,7 @@ struct Ann *AnnCreateNet3(int iunits, int hunits, int ounits) { /* Create a 2-layer "linear" network. */ -struct Ann *AnnCreateNet2(int iunits, int ounits) { +AnnRprop *AnnCreateNet2(int iunits, int ounits) { int units[2]; units[0] = ounits; @@ -351,107 +412,159 @@ struct Ann *AnnCreateNet2(int iunits, int ounits) { } -void AnnSimulate(struct Ann *net) { +void AnnSimulate(AnnRprop *net) { int i, j, k; - for (i = net->layers-1; i > 0; i--) { - int nextunits = net->layer[i-1].units; - int units = net->layer[i].units; + for (i = ANN_LAYERS(net)-1; i > 0; i--) { + AnnLayer *layer = &ANN_LAYER(net, i); + int nextunits = ANN_UNITS(net, i-1); + int units_aligned = layer->units_aligned; + int units = layer->units; if (i > 1) nextunits--; /* dont output on bias units */ +#ifdef USING_SIMD + int xps, psteps = units/SIMDF_SIZE; +#endif // USING_SIMD for (j = 0; j < nextunits; j++) { - float A = 0; /* Activation final value. */ - float *w = net->layer[i].weight + j*units; - float *o = net->layer[i].output; + ann_float_t A = 0; /* Activation final value. */ + ann_float_t *w = layer->weight + j*units_aligned; + ann_float_t *o = layer->output; k = 0; #ifdef USING_SIMD - int psteps = units/SIMDF_SIZE; - simdf_t sumA = simdf_zero(); - for (int x = 0; x < psteps; x++) { - simdf_t weights = simdf_loadu(w); - simdf_t outputs = simdf_loadu(o); - simdf_t prod = simdf_mul(weights,outputs); - sumA = simdf_add(sumA, prod); - w += SIMDF_SIZE; - o += SIMDF_SIZE; + if(psteps) + { + simdf_t sumA = simdf_zero(); + for (xps = 0; xps < psteps; xps++) { + simdf_t weights = simdf_load(w); + simdf_t outputs = simdf_load(o); + simdf_t prod = simdf_mul(weights,outputs); + sumA = simdf_add(sumA, prod); + w += SIMDF_SIZE; + o += SIMDF_SIZE; + } + A += simdf_sum(sumA); + k += psteps*SIMDF_SIZE; } - A += simdf_sum(sumA); - k += SIMDF_SIZE*psteps; #endif /* Handle final piece shorter than SIMDF_SIZE . 
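 * Taken together, the SIMD body above and this scalar tail compute the
 * plain dot product A = sum_k w[k]*o[k] over the incoming connections; the
 * unit output is then set to the sigmoid of A.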
*/ for (; k < units; k++) { A += (*w++) * (*o++); } - OUTPUT(net, i-1, j) = sigmoid(A); + //ANN_OUTPUT(net, i-1, j) = (*net->node_transf_func)(A); //sigmoid(A); + ANN_OUTPUT(net, i-1, j) = 1.0/(1.0+exp(-A)); } } } /* Create a Tcl procedure that simulates the neural network */ -void Ann2Tcl(struct Ann *net) { +void Ann2Tcl(const AnnRprop *net) { int i, j, k; printf("proc ann input {\n"); printf(" set output {"); - for (i = 0; i < OUTPUT_UNITS(net); i++) { + for (i = 0; i < ANN_OUTPUT_UNITS(net); i++) { printf("0 "); } printf("}\n"); - for (i = net->layers-1; i > 0; i--) { - int nextunits = net->layer[i-1].units; - int units = net->layer[i].units; - if (i > 1) nextunits--; /* dont output on bias units */ + printf(" proc sigmoid x {return [expr {1/(1+exp(-$x))}]}\n"); + for(i=0, k=ANN_INPUT_UNITS(net); i < k; ++i) { + printf(" set input_%d [lindex $input %d]\n", i, i); + } + for (i = ANN_LAYERS(net)-1; i > 0; i--) { + int nextunits = ANN_UNITS(net, i-1); + int units = ANN_UNITS(net, i); + //if (i > 1) nextunits--; /* dont output on bias units */ for (j = 0; j < nextunits; j++) { - float W; + ann_float_t W; if (i == 1) { printf(" lset output %d ", j); } else { printf(" set O_%d_%d", i-1, j); } - printf(" [expr { \\\n"); + printf(" [sigmoid [expr { \\\n"); for (k = 0; k < units; k++) { - W = WEIGHT(net, i, k, j); + W = ANN_WEIGHT(net, i, k, j); if (i > 1 && k == units-1) { printf(" (%.9f)", W); - } else if (i == net->layers-1) { - printf(" (%.9f*[lindex $input %d])", W, k); + } else if (i == ANN_LAYERS(net)-1) { + printf(" (%.9f*$input_%d)", W, k); } else { printf(" (%.9f*$O_%d_%d)", W, i, k); } if ((k+1) < units) printf("+ \\\n"); } - printf("}]\n"); + printf("}]]\n"); + } + } + printf(" return $output\n"); + printf("}\n"); +} + +/* Create a Javascript procedure that simulates the neural network */ +void Ann2Js(const AnnRprop *net) { + int i, j, k; + + printf("function ann( input ) {\n"); + printf(" var output = ["); + for (i = 0; i < ANN_OUTPUT_UNITS(net); i++) { + if(i) printf(", "); + printf("0"); + } + printf("];\n"); + printf(" var sigmoid = function(x) {return 1.0/(1.0+Math.exp(-x));};\n"); + for(i=0, k=ANN_INPUT_UNITS(net); i < k; ++i) { + printf(" var input_%d = input[%d];\n", i, i); + } + for (i = ANN_LAYERS(net)-1; i > 0; i--) { + int nextunits = ANN_UNITS(net, i-1); + int units = ANN_UNITS(net, i); + //if (i > 1) nextunits--; /* dont output on bias units */ + for (j = 0; j < nextunits; j++) { + ann_float_t W; if (i == 1) { - printf(" lset output %d [expr {1/(1+exp(-[lindex $output %d]))}]\n", j, j); + printf(" output[%d]", j); } else { - printf(" lset O_%d_%d [expr {1/(1+exp(-$O_%d_%d))}]\n", i-1, j, i-1, j); + printf(" var O_%d_%d", i-1, j); + } + printf(" = sigmoid(\n"); + for (k = 0; k < units; k++) { + W = ANN_WEIGHT(net, i, k, j); + if (i > 1 && k == units-1) { + printf(" (%.9f)", W); + } else if (i == ANN_LAYERS(net)-1) { + printf(" (%.9f*input_%d)", W, k); + } else { + printf(" (%.9f*O_%d_%d)", W, i, k); + } + if ((k+1) < units) printf("+\n"); } + printf(");\n"); } } - printf(" return $output\n"); + printf(" return output;\n"); printf("}\n"); } /* Print a network representation */ -void AnnPrint(struct Ann *net) { +void AnnPrint(const AnnRprop *net) { int i, j, k; - for (i = 0; i < LAYERS(net); i++) { + for (i = 0; i < ANN_LAYERS(net); i++) { char *layertype = "Hidden"; if (i == 0) layertype = "Output"; - if (i == LAYERS(net)-1) layertype = "Input"; - printf("%s layer %d, units %d\n", layertype, i, UNITS(net,i)); + if (i == ANN_LAYERS(net)-1) layertype = "Input"; + printf("%s 
layer %d, units %d\n", layertype, i, ANN_UNITS(net,i)); if (i) { /* Don't compute the bias unit as a target. */ - int targets = UNITS(net,i-1) - (i-1>0); + int targets = ANN_UNITS(net,i-1) - (i-1>0); /* Weights */ printf("\tW"); - for (j = 0; j < UNITS(net, i); j++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { printf("("); for (k = 0; k < targets; k++) { - printf("%f", WEIGHT(net,i,j,k)); + printf("%f", ANN_WEIGHT(net,i,j,k)); if (k != targets-1) printf(" "); } printf(") "); @@ -459,10 +572,10 @@ void AnnPrint(struct Ann *net) { printf("\n"); /* Gradients */ printf("\tg"); - for (j = 0; j < UNITS(net, i); j++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { printf("["); for (k = 0; k < targets; k++) { - printf("%f", GRADIENT(net,i,j,k)); + printf("%f", ANN_GRADIENT(net,i,j,k)); if (k != targets-1) printf(" "); } printf("] "); @@ -470,10 +583,10 @@ void AnnPrint(struct Ann *net) { printf("\n"); /* SGradients */ printf("\tG"); - for (j = 0; j < UNITS(net, i); j++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { printf("["); for (k = 0; k < targets; k++) { - printf("%f", SGRADIENT(net,i,j,k)); + printf("%f", ANN_SGRADIENT(net,i,j,k)); if (k != targets-1) printf(" "); } printf("] "); @@ -481,10 +594,10 @@ void AnnPrint(struct Ann *net) { printf("\n"); /* Gradients at t-1 */ printf("\tP"); - for (j = 0; j < UNITS(net, i); j++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { printf("["); for (k = 0; k < targets; k++) { - printf("%f", PGRADIENT(net,i,j,k)); + printf("%f", ANN_PGRADIENT(net,i,j,k)); if (k != targets-1) printf(" "); } printf("] "); @@ -492,23 +605,23 @@ void AnnPrint(struct Ann *net) { printf("\n"); /* Delta */ printf("\tD"); - for (j = 0; j < UNITS(net, i); j++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { printf("|"); for (k = 0; k < targets; k++) { - printf("%f", DELTA(net,i,j,k)); + printf("%f", ANN_DELTA(net,i,j,k)); if (k != targets-1) printf(" "); } printf("| "); } printf("\n"); } - for (j = 0; j < UNITS(net,i); j++) { - printf("\tO: %f ", OUTPUT(net,i,j)); + for (j = 0; j < ANN_UNITS(net,i); j++) { + printf("\tO: %f ", ANN_OUTPUT(net,i,j)); } printf("\n"); printf("\tE /"); - for (j = 0; j < UNITS(net,i); j++) { - printf("%f ", ERROR(net,i,j)); + for (j = 0; j < ANN_UNITS(net,i); j++) { + printf("%f ", ANN_ERROR(net,i,j)); } printf("/\n"); } @@ -517,28 +630,28 @@ void AnnPrint(struct Ann *net) { /* Calcuate the global error of the net. This is just the * Root Mean Square (RMS) error, which is half the sum of the squared * errors. */ -float AnnGlobalError(struct Ann *net, float *desired) { - float e, t; - int i, outputs = OUTPUT_UNITS(net); +ann_float_t AnnGlobalError(AnnRprop *net, ann_float_t *desired) { + ann_float_t e, t; + int i, outputs = ANN_OUTPUT_UNITS(net); e = 0; for (i = 0; i < outputs; i++) { - t = desired[i] - OUTPUT_NODE(net,i); + t = desired[i] - ANN_OUTPUT_NODE(net,i); e += t*t; /* No need for fabs(t), t*t will always be positive. 
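 * The value returned below is therefore 0.5 * sum_i (desired[i] - output[i])^2,
 * i.e. half of the summed squared error over the output units for a single
 * example.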
*/ } return .5*e; } /* Set the network input */ -void AnnSetInput(struct Ann *net, float *input) +void AnnSetInput(AnnRprop *net, ann_float_t *input) { - int i, inputs = INPUT_UNITS(net); + int i, inputs = ANN_INPUT_UNITS(net); - for (i = 0; i < inputs; i++) INPUT_NODE(net,i) = input[i]; + for (i = 0; i < inputs; i++) ANN_INPUT_NODE(net,i) = input[i]; } /* Simulate the net, and return the global error */ -float AnnSimulateError(struct Ann *net, float *input, float *desired) { +ann_float_t AnnSimulateError(AnnRprop *net, ann_float_t *input, ann_float_t *desired) { AnnSetInput(net, input); AnnSimulate(net); return AnnGlobalError(net, desired); @@ -546,12 +659,12 @@ float AnnSimulateError(struct Ann *net, float *input, float *desired) { /* Compute the error vector y-t in the output unit. This error depends * on the loss function we use. */ -void AnnCalculateOutputError(struct Ann *net, float *desired) { - int units = OUTPUT_UNITS(net); - float factor = (float)2/units; +void AnnCalculateOutputError(AnnRprop *net, ann_float_t *desired) { + int units = ANN_OUTPUT_UNITS(net); + ann_float_t factor = (ann_float_t)2/units; + AnnLayer *layer = &ANN_LAYER(net, 0); for (int j = 0; j < units; j++) { - net->layer[0].error[j] = - factor * (net->layer[0].output[j] - desired[j]); + layer->error[j] = factor * (layer->output[j] - desired[j]); } } @@ -563,35 +676,35 @@ void AnnCalculateOutputError(struct Ann *net, float *desired) { * points (E1, with the real weight, and E2 with the weight W = W + 0.1), * than the approximation of the gradient is G = (E2-E1)/0.1. */ #define GTRIVIAL_DELTA 0.001 -void AnnCalculateGradientsTrivial(struct Ann *net, float *desired) { - int j, i, layers = LAYERS(net); +void AnnCalculateGradientsTrivial(AnnRprop *net, ann_float_t *desired) { + int j, i, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); + int weights = ANN_WEIGHTS(net,j); for (i = 0; i < weights; i++) { - float t, e1, e2; + ann_float_t t, e1, e2; + AnnLayer *layer = &ANN_LAYER(net,j); /* Calculate the value of the error function * in this point. */ AnnSimulate(net); e1 = AnnGlobalError(net, desired); - t = net->layer[j].weight[i]; + t = layer->weight[i]; /* Calculate the error a bit on the right */ - net->layer[j].weight[i] += GTRIVIAL_DELTA; + layer->weight[i] += GTRIVIAL_DELTA; AnnSimulate(net); e2 = AnnGlobalError(net, desired); /* Restore the original weight */ - net->layer[j].weight[i] = t; + layer->weight[i] = t; /* Calculate the gradient */ - net->layer[j].gradient[i] = (e2-e1)/GTRIVIAL_DELTA; + layer->gradient[i] = (e2-e1)/GTRIVIAL_DELTA; } } } /* Calculate gradients using the back propagation algorithm */ -void AnnCalculateGradients(struct Ann *net, float *desired) { - int j, layers = LAYERS(net)-1; +void AnnCalculateGradients(AnnRprop *net, ann_float_t *desired) { + int j, layers = ANN_LAYERS(net)-1; /* Populate the error vector net->layer[0]->error according * to the loss function. */ @@ -600,19 +713,25 @@ void AnnCalculateGradients(struct Ann *net, float *desired) { /* Back-propagate the error and compute the gradient * for every weight in the net. 
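 * For every unit i of the current layer the loop below computes
 * error_signal = error[i] * derivative(output[i]), stores
 * gradient[k] = error_signal * output_prev[k] for each incoming weight, and
 * accumulates error_prev[k] += error_signal * weight[k] so that the previous
 * layer can repeat the same step.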
*/ for (j = 0; j < layers; j++) { - struct AnnLayer *layer = &net->layer[j]; - struct AnnLayer *prev_layer = &net->layer[j+1]; + AnnLayer *layer = &ANN_LAYER(net, j); + AnnLayer *prev_layer = &ANN_LAYER(net, j+1); int i, units = layer->units; int prevunits = prev_layer->units; + int prevunits_aligned = prev_layer->units_aligned; +#ifdef USING_SIMD + int xps, psteps = prevunits/SIMDF_SIZE; + simdf_t es; +#endif // USING_SIMD /* Skip bias units, they have no connections with the previous * layers. */ if (j > 1) units--; /* Reset the next layer errors array */ - for (i = 0; i < prevunits; i++) prev_layer->error[i] = 0; + //for (i = 0; i < prevunits; i++) prev_layer->error[i] = 0; + memset(prev_layer->error, 0, ANN_SIZEOF_ann_float_t*prevunits); /* For every node in this layer ... */ for (i = 0; i < units; i++) { - float error_signal, ei, oi, derivative; + ann_float_t error_signal, ei, oi, derivative; int k; /* Compute gradient. */ @@ -627,31 +746,34 @@ void AnnCalculateGradients(struct Ann *net, float *desired) { * tanh: (1-oi)*(1+oi), that's 1-(oi*oi) * relu: (oi > 0) ? 1 : 0 */ - derivative = oi*(1-oi); + //derivative = oi*(1-oi); + derivative = (*net->derivative_func)(oi); error_signal = ei*derivative; /* For every weight between this node and * the previous layer's nodes: */ - float *g = prev_layer->gradient + i*prevunits; - float *w = prev_layer->weight + i*prevunits; - float *o = prev_layer->output; - float *e = prev_layer->error; + ann_float_t *g = prev_layer->gradient + i*prevunits_aligned; + ann_float_t *w = prev_layer->weight + i*prevunits_aligned; + ann_float_t *o = prev_layer->output; + ann_float_t *e = prev_layer->error; /* 1. Calculate the gradient */ k = 0; #ifdef USING_SIMD - simdf_t es = simdf_set1f(error_signal); - - int psteps = prevunits/SIMDF_SIZE; - for (int x = 0; x < psteps; x++) { - simdf_t outputs = simdf_loadu(o); - //simdf_t gradients = simdf_mul(es,outputs); - simdf_storeu(g,simdf_mul(es,outputs)); - o += SIMDF_SIZE; - g += SIMDF_SIZE; + if(psteps) + { + es = simdf_set1f(error_signal); +//printf("%d : %ld\n", __LINE__, ((long)o & 15)); + for (xps = 0; xps < psteps; xps++) { + simdf_t outputs = simdf_load(o); + simdf_t gradients = simdf_mul(es,outputs); + simdf_store(g, gradients); + o += SIMDF_SIZE; + g += SIMDF_SIZE; + } + k += psteps*SIMDF_SIZE; } - k += SIMDF_SIZE*psteps; #endif /* Handle final piece shorter than SIMDF_SIZE . */ for (; k < prevunits; k++) *g++ = error_signal*(*o++); @@ -659,15 +781,18 @@ void AnnCalculateGradients(struct Ann *net, float *desired) { /* 2. And back-propagate the error to the previous layer */ k = 0; #ifdef USING_SIMD - for (int x = 0; x < psteps; x++) { - simdf_t weights = simdf_loadu(w); - simdf_t errors = simdf_loadu(e); - //simdf_t prod = simdf_mul(es, weights); - simdf_storeu(e, simdf_add( simdf_mul(es, weights), errors)); - e += SIMDF_SIZE; - w += SIMDF_SIZE; + if(psteps) + { + for (xps = 0; xps < psteps; xps++) { + simdf_t weights = simdf_load(w); + simdf_t errors = simdf_load(e); + simdf_t prod = simdf_mul(es, weights); + simdf_store(e, simdf_add(prod , errors)); + e += SIMDF_SIZE; + w += SIMDF_SIZE; + } + k += psteps*SIMDF_SIZE; } - k += SIMDF_SIZE*psteps; #endif /* Handle final piece shorter than SIMDF_SIZE . 
*/ for (; k < prevunits; k++) { @@ -678,77 +803,92 @@ void AnnCalculateGradients(struct Ann *net, float *desired) { } /* Set the delta values of the net to a given value */ -void AnnSetDeltas(struct Ann *net, float val) { - int j, layers = LAYERS(net); +void AnnSetDeltas(AnnRprop *net, ann_float_t val) { + int j, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); + int weights = ANN_WEIGHTS(net,j); int i; - for (i = 0; i < weights; i++) net->layer[j].delta[i] = val; + AnnLayer *layer = &ANN_LAYER(net, j); + for (i = 0; i < weights; i++) layer->delta[i] = val; } } /* Set the sgradient values to zero */ -void AnnResetSgradient(struct Ann *net) { - int j, layers = LAYERS(net); +void AnnResetSgradient(AnnRprop *net) { + int j, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); - memset(net->layer[j].sgradient, 0, sizeof(float)*weights); + int weights = ANN_WEIGHTS(net, j); + memset(ANN_LAYER(net, j).sgradient, 0, ANN_SIZEOF_ann_float_t*weights); } } /* Set random weights in the range -0.05,+0.05 */ -void AnnSetRandomWeights(struct Ann *net) { +void AnnSetRandomWeights(AnnRprop *net) { int i, j, k; - for (i = 1; i < LAYERS(net); i++) { - for (k = 0; k < UNITS(net, i-1); k++) { - for (j = 0; j < UNITS(net, i); j++) { - WEIGHT(net,i,j,k) = -0.05+.1*(rand()/(RAND_MAX+1.0)); + for (i = 1; i < ANN_LAYERS(net); i++) { + for (k = 0; k < ANN_UNITS(net, i-1); k++) { + for (j = 0; j < ANN_UNITS(net, i); j++) { + ANN_WEIGHT(net,i,j,k) = -0.05+.1*(rand()/(RAND_MAX+1.0)); } } } } /* Scale the net weights of the given factor */ -void AnnScaleWeights(struct Ann *net, float factor) { - int j, layers = LAYERS(net); +void AnnScaleWeights(AnnRprop *net, ann_float_t factor) { + int j, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); + int weights = ANN_WEIGHTS(net,j); int i; + AnnLayer *layer = &ANN_LAYER(net, j); for (i = 0; i < weights; i++) - net->layer[j].weight[i] *= factor; + layer->weight[i] *= factor; } } /* Update the sgradient, that's the sum of the weight's gradient for every * element of the training set. This is used for the RPROP algorithm * that works with the sign of the derivative for the whole set. */ -void AnnUpdateSgradient(struct Ann *net) { - int j, i, layers = LAYERS(net); +void AnnUpdateSgradient(AnnRprop *net) { + int j, i, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); - /* In theory this is a good target for SSE "ADDPS" instructions, - * however modern compilers figure out this automatically. */ - for (i = 0; i < weights; i++) - net->layer[j].sgradient[i] += net->layer[j].gradient[i]; + int weights = ANN_WEIGHTS(net,j); + ann_float_t *sg = net->layer[j].sgradient; + ann_float_t *g = net->layer[j].gradient; + i = 0; +#ifdef USING_SIMD + int psteps = weights/SIMDF_SIZE; + if(psteps) + { + int xps; + for (xps = 0; xps < psteps; xps++) { + simdf_t sgradient = simdf_load(sg); + simdf_t gradient = simdf_load(g); + simdf_store(sg, simdf_add( sgradient, gradient)); + sg += SIMDF_SIZE; + g += SIMDF_SIZE; + } + i += psteps*SIMDF_SIZE; + } +#endif + /* Handle final piece shorter than SIMDF_SIZE . 
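 * In scalar form this whole function is simply sgradient[i] += gradient[i]
 * for every weight: the set-wise gradient sum whose sign drives the RPROP
 * update in AnnAdjustWeightsResilientBP() below.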
*/ + for (; i < weights; i++) + (*sg++) += (*g++); } } /* Helper function for RPROP, returns -1 if n < 0, +1 if n > 0, 0 if n == 0 */ -float sign(float n) { - if (n > 0) return +1; - if (n < 0) return -1; - return 0; +static inline ann_float_t sign(ann_float_t n) { + if (n > 0) return +1.0; + if (n < 0) return -1.0; + return 0.0; } /* The core of the RPROP algorithm. @@ -756,42 +896,42 @@ float sign(float n) { * Note that: * sgradient is the set-wise gradient. * delta is the per-weight update value. */ -void AnnAdjustWeightsResilientBP(struct Ann *net) { - int j, i, layers = LAYERS(net); +void AnnAdjustWeightsResilientBP(AnnRprop *net) { + int j, i, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1) - (j-1>0); + int weights = ANN_WEIGHTS(net,j) - (j-1>0); + AnnLayer *layer = &ANN_LAYER(net, j); for (i = 0; i < weights; i++) { - float t = net->layer[j].pgradient[i] * - net->layer[j].sgradient[i]; - float delta = net->layer[j].delta[i]; + ann_float_t sgradient = layer->sgradient[i]; + ann_float_t t = layer->pgradient[i] * sgradient; + ann_float_t delta = layer->delta[i]; if (t > 0) { - delta = MIN(delta*RPROP_NPLUS(net),RPROP_MAXUPDATE(net)); - float wdelta = -sign(net->layer[j].sgradient[i]) * delta; - net->layer[j].weight[i] += wdelta; - net->layer[j].delta[i] = delta; - net->layer[j].pgradient[i] = net->layer[j].sgradient[i]; + delta = ANN_MIN(delta*ANN_RPROP_NPLUS(net),ANN_RPROP_MAXUPDATE(net)); + ann_float_t wdelta = -sign(sgradient) * delta; + layer->weight[i] += wdelta; + layer->delta[i] = delta; + layer->pgradient[i] = sgradient; } else if (t < 0) { - float past_wdelta = -sign(net->layer[j].pgradient[i]) * delta; - delta = MAX(delta*RPROP_NMINUS(net),RPROP_MINUPDATE(net)); - net->layer[j].weight[i] -= past_wdelta; - net->layer[j].delta[i] = delta; - net->layer[j].pgradient[i] = 0; + ann_float_t past_wdelta = -sign(layer->pgradient[i]) * delta; + delta = ANN_MAX(delta*ANN_RPROP_NMINUS(net),ANN_RPROP_MINUPDATE(net)); + layer->weight[i] -= past_wdelta; + layer->delta[i] = delta; + layer->pgradient[i] = 0; } else { /* t == 0 */ - float wdelta = -sign(net->layer[j].sgradient[i]) * delta; - net->layer[j].weight[i] += wdelta; - net->layer[j].pgradient[i] = net->layer[j].sgradient[i]; + ann_float_t wdelta = -sign(sgradient) * delta; + layer->weight[i] += wdelta; + layer->pgradient[i] = sgradient; } } } } /* Resilient Backpropagation Epoch */ -float AnnResilientBPEpoch(struct Ann *net, float *input, float *desired, int setlen) { - float error = 0; - int j, inputs = INPUT_UNITS(net), outputs = OUTPUT_UNITS(net); +ann_float_t AnnResilientBPEpoch(AnnRprop *net, ann_float_t *input, ann_float_t *desired, int setlen) { + ann_float_t error = 0; + int j, inputs = ANN_INPUT_UNITS(net), outputs = ANN_OUTPUT_UNITS(net); AnnResetSgradient(net); for (j = 0; j < setlen; j++) { @@ -807,34 +947,34 @@ float AnnResilientBPEpoch(struct Ann *net, float *input, float *desired, int set /* Update the deltas using the gradient descend algorithm. * Gradients should be already computed with AnnCalculateGraidents(). 
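 * Each delta simply accumulates the per-example gradient; AnnAdjustWeights()
 * then applies weight -= learn_rate/setlen * delta, i.e. plain batch
 * gradient descent.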
*/ -void AnnUpdateDeltasGD(struct Ann *net) { - int j, i, layers = LAYERS(net); +void AnnUpdateDeltasGD(AnnRprop *net) { + int j, i, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); + int weights = ANN_WEIGHTS(net,j); + AnnLayer *layer = &ANN_LAYER(net, j); for (i = 0; i < weights; i++) - net->layer[j].delta[i] += net->layer[j].gradient[i]; + layer->delta[i] += layer->gradient[i]; } } /* Adjust net weights using the (already) calculated deltas. */ -void AnnAdjustWeights(struct Ann *net, int setlen) { - int j, i, layers = LAYERS(net); +void AnnAdjustWeights(AnnRprop *net, int setlen) { + int j, i, layers = ANN_LAYERS(net); for (j = 1; j < layers; j++) { - int units = UNITS(net, j); - int weights = units * UNITS(net,j-1); + int weights = ANN_WEIGHTS(net,j); + AnnLayer *layer = &ANN_LAYER(net, j); for (i = 0; i < weights; i++) { - net->layer[j].weight[i] -= LEARN_RATE(net)/setlen*net->layer[j].delta[i]; + layer->weight[i] -= ANN_LEARN_RATE(net)/setlen*layer->delta[i]; } } } /* Gradient Descend training */ -float AnnGDEpoch(struct Ann *net, float *input, float *desidered, int setlen) { - float error = 0; - int j, inputs = INPUT_UNITS(net), outputs = OUTPUT_UNITS(net); +ann_float_t AnnGDEpoch(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, int setlen) { + ann_float_t error = 0; + int j, inputs = ANN_INPUT_UNITS(net), outputs = ANN_OUTPUT_UNITS(net); for (j = 0; j < setlen; j++) { AnnSetDeltas(net, 0); @@ -851,10 +991,10 @@ float AnnGDEpoch(struct Ann *net, float *input, float *desidered, int setlen) { /* This function, called after AnnSimulate(), will return 1 if there is * an error in the detected class (compared to the desired output), * othewise 0 is returned. */ -int AnnTestClassError(struct Ann *net, float *desired) { - int i, outputs = OUTPUT_UNITS(net); +int AnnTestClassError(AnnRprop *net, ann_float_t *desired) { + int i, outputs = ANN_OUTPUT_UNITS(net); int classid, outid; - float max = 0; + ann_float_t max = 0; /* Get the class ID from the test dataset output. */ classid = 0; @@ -863,10 +1003,10 @@ int AnnTestClassError(struct Ann *net, float *desired) { classid = i; /* Get the network classification. */ - max = OUTPUT_NODE(net,0); + max = ANN_OUTPUT_NODE(net,0); outid = 0; for (i = 1; i < outputs; i++) { - float o = OUTPUT_NODE(net,i); + ann_float_t o = ANN_OUTPUT_NODE(net,i); if (o > max) { outid = i; max = o; @@ -877,9 +1017,9 @@ int AnnTestClassError(struct Ann *net, float *desired) { /* Simulate the entire test dataset with the neural network and returns the * average error of all the entries tested. 
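 * If classerr is not NULL it also receives the percentage of test entries
 * whose arg-max output class differs from the arg-max of the desired
 * output, as computed by AnnTestClassError() above.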
*/ -void AnnTestError(struct Ann *net, float *input, float *desired, int setlen, float *avgerr, float *classerr) { - float error = 0; - int j, inputs = INPUT_UNITS(net), outputs = OUTPUT_UNITS(net); +void AnnTestError(AnnRprop *net, ann_float_t *input, ann_float_t *desired, int setlen, ann_float_t *avgerr, ann_float_t *classerr) { + ann_float_t error = 0; + int j, inputs = ANN_INPUT_UNITS(net), outputs = ANN_OUTPUT_UNITS(net); int class_errors = 0; for (j = 0; j < setlen; j++) { @@ -890,20 +1030,28 @@ void AnnTestError(struct Ann *net, float *input, float *desired, int setlen, flo desired += outputs; } if (avgerr) *avgerr = error/setlen; - if (classerr) *classerr = (float)class_errors*100/setlen; + if (classerr) *classerr = (ann_float_t)class_errors*100/setlen; } /* Train the net */ -float AnnTrain(struct Ann *net, float *input, float *desired, float maxerr, int maxepochs, int setlen, int algo) { +ann_float_t AnnTrainWithAlgoFunc(AnnRprop *net, ann_float_t *input, ann_float_t *desired, ann_float_t maxerr, + int maxepochs, int setlen, AnnTrainAlgoFunc algo_func) { int i = 0; - float e = maxerr+1; + ann_float_t e = maxerr+1; while (i++ < maxepochs && e >= maxerr) { - if (algo == NN_ALGO_BPROP) { - e = AnnResilientBPEpoch(net, input, desired, setlen); - } else if (algo == NN_ALGO_GD) { - e = AnnGDEpoch(net, input, desired, setlen); - } + e = (*algo_func)(net, input, desired, setlen); } return e; } + + +ann_float_t AnnTrain(AnnRprop *net, ann_float_t *input, ann_float_t *desired, ann_float_t maxerr, int maxepochs, + int setlen, int algo) { + AnnTrainAlgoFunc algo_func; + if(algo == ANN_ALGO_BPROP) algo_func = AnnResilientBPEpoch; + else if(algo == ANN_ALGO_GD) algo_func = AnnGDEpoch; + else return -1; + + return AnnTrainWithAlgoFunc(net, input, desired, maxerr, maxepochs, setlen, algo_func); +} diff --git a/nn.h b/nn.h index 8106daf..23348c9 100644 --- a/nn.h +++ b/nn.h @@ -32,106 +32,128 @@ #ifndef __NN_H #define __NN_H +//#include + +typedef float ann_float_t; +typedef ann_float_t (*AnnDerivativeFunc)(ann_float_t v); /* Data structures. * Nets are not so 'dynamic', but enough to support * an arbitrary number of layers, with arbitrary units for layer. * Only fully connected feed-forward networks are supported. 
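 * Layers are stored output-first: layer[0] is the output layer and
 * layer[layers-1] is the input layer, which is why the units array passed
 * to AnnCreateNet() lists the output size first.
 *
 * A minimal usage sketch (illustrative only, error handling omitted):
 *
 *   AnnRprop *net = AnnCreateNet3(2, 4, 1);  // 2 inputs, 4 hidden, 1 output
 *   ann_float_t in[2] = {0.3, 0.7};
 *   AnnSetInput(net, in);
 *   AnnSimulate(net);
 *   ann_float_t y = ANN_OUTPUT_NODE(net, 0); // forward-pass result
 *   AnnFree(net);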
*/ -struct AnnLayer { - int units; - float *output; /* output[i], output of i-th unit */ - float *error; /* error[i], output error of i-th unit*/ - float *weight; /* weight[(i*units)+j] */ +typedef struct { + ann_float_t *output; /* output[i], output of i-th unit */ + ann_float_t *error; /* error[i], output error of i-th unit*/ + ann_float_t *weight; /* weight[(i*units)+j] */ /* weight between unit i-th and next j-th */ - float *gradient; /* gradient[(i*units)+j] gradient */ - float *sgradient; /* gradient for the full training set */ + ann_float_t *gradient; /* gradient[(i*units)+j] gradient */ + ann_float_t *sgradient; /* gradient for the full training set */ /* only used for RPROP */ - float *pgradient; /* pastgradient[(i*units)+j] t-1 gradient */ + ann_float_t *pgradient; /* pastgradient[(i*units)+j] t-1 gradient */ /* (t-1 sgradient for resilient BP) */ - float *delta; /* delta[(i*units)+j] cumulative update */ + ann_float_t *delta; /* delta[(i*units)+j] cumulative update */ /* (per-weight delta for RPROP) */ -}; + int units; /*moved to last position for alignment purposes*/ + int units_aligned; /*units rounded up for alignment*/ +} AnnLayer; /* Feed forward network structure */ -struct Ann { +typedef struct { + AnnLayer *layer; int flags; int layers; - float rprop_nminus; - float rprop_nplus; - float rprop_maxupdate; - float rprop_minupdate; - float learn_rate; /* Used for GD training. */ - struct AnnLayer *layer; -}; + AnnDerivativeFunc node_transf_func; + AnnDerivativeFunc derivative_func; + ann_float_t rprop_nminus; + ann_float_t rprop_nplus; + ann_float_t rprop_maxupdate; + ann_float_t rprop_minupdate; + ann_float_t learn_rate; /* Used for GD training. */ +} AnnRprop; + +typedef ann_float_t (*AnnTrainAlgoFunc)(AnnRprop *net, ann_float_t *input, ann_float_t *desired, int setlen); /* Raw interface to data structures */ -#define OUTPUT(net,l,i) (net)->layer[l].output[i] -#define ERROR(net,l,i) (net)->layer[l].error[i] -#define WEIGHT(net,l,i,j) (net)->layer[l].weight[((j)*(net)->layer[l].units)+(i)] -#define GRADIENT(net,l,i,j) (net)->layer[l].gradient[((j)*(net)->layer[l].units)+(i)] -#define SGRADIENT(net,l,i,j) (net)->layer[l].sgradient[((j)*(net)->layer[l].units)+(i)] -#define PGRADIENT(net,l,i,j) (net)->layer[l].pgradient[((j)*(net)->layer[l].units)+(i)] -#define DELTA(net,l,i,j) (net)->layer[l].delta[((j)*(net)->layer[l].units)+(i)] -#define LAYERS(net) (net)->layers -#define UNITS(net,l) (net)->layer[l].units -#define WEIGHTS(net,l) (UNITS(net,l)*UNITS(net,l-1)) -#define OUTPUT_NODE(net,i) OUTPUT(net,0,i) -#define INPUT_NODE(net,i) OUTPUT(net,((net)->layers)-1,i) -#define OUTPUT_UNITS(net) UNITS(net,0) -#define INPUT_UNITS(net) (UNITS(net,((net)->layers)-1)-1) -#define RPROP_NMINUS(net) (net)->rprop_nminus -#define RPROP_NPLUS(net) (net)->rprop_nplus -#define RPROP_MAXUPDATE(net) (net)->rprop_maxupdate -#define RPROP_MINUPDATE(net) (net)->rprop_minupdate -#define LEARN_RATE(net) (net)->learn_rate +#define ANN_LAYERS(net) (net)->layers +#define ANN_LAYER(net, l) (net)->layer[/*assert(l >= 0),*/l] +#define ANN_OUTPUT(net,l,i) ANN_LAYER(net, l).output[i] +#define ANN_ERROR(net,l,i) ANN_LAYER(net, l).error[i] +#define ANN_LAYER_IDX(net,l,i,j) (((j)*ANN_LAYER(net, l).units_aligned)+(i)) +#define ANN_WEIGHT(net,l,i,j) ANN_LAYER(net, l).weight[ANN_LAYER_IDX(net,l,i,j)] +#define ANN_GRADIENT(net,l,i,j) ANN_LAYER(net, l).gradient[ANN_LAYER_IDX(net,l,i,j)] +#define ANN_SGRADIENT(net,l,i,j) ANN_LAYER(net, l).sgradient[ANN_LAYER_IDX(net,l,i,j)] +#define ANN_PGRADIENT(net,l,i,j) 
ANN_LAYER(net, l).pgradient[ANN_LAYER_IDX(net,l,i,j)] +#define ANN_DELTA(net,l,i,j) ANN_LAYER(net, l).delta[ANN_LAYER_IDX(net,l,i,j)] +#define ANN_UNITS(net,l) ANN_LAYER(net, l).units +#define ANN_UNITS_ALLOCATED(net,l) ANN_LAYER(net, l).units_aligned +#define ANN_WEIGHTS(net,l) (ANN_UNITS(net,l)*ANN_UNITS(net,l-1)) +#define ANN_OUTPUT_NODE(net,i) ANN_OUTPUT(net,0,i) +#define ANN_INPUT_NODE(net,i) ANN_OUTPUT(net,(ANN_LAYERS(net))-1,i) +#define ANN_OUTPUT_UNITS(net) ANN_UNITS(net,0) +#define ANN_INPUT_UNITS(net) (ANN_UNITS(net,(ANN_LAYERS(net))-1)-1) +#define ANN_RPROP_NMINUS(net) (net)->rprop_nminus +#define ANN_RPROP_NPLUS(net) (net)->rprop_nplus +#define ANN_RPROP_MAXUPDATE(net) (net)->rprop_maxupdate +#define ANN_RPROP_MINUPDATE(net) (net)->rprop_minupdate +#define ANN_LEARN_RATE(net) (net)->learn_rate /* Constants */ -#define DEFAULT_RPROP_NMINUS 0.5 -#define DEFAULT_RPROP_NPLUS 1.2 -#define DEFAULT_RPROP_MAXUPDATE 50 -#define DEFAULT_RPROP_MINUPDATE 0.000001 -#define RPROP_INITIAL_DELTA 0.1 -#define DEFAULT_LEARN_RATE 0.1 -#define NN_ALGO_BPROP 0 -#define NN_ALGO_GD 1 +#define ANN_DEFAULT_RPROP_NMINUS 0.5 +#define ANN_DEFAULT_RPROP_NPLUS 1.2 +#define ANN_DEFAULT_RPROP_MAXUPDATE 50 +#define ANN_DEFAULT_RPROP_MINUPDATE 0.000001 +#define ANN_RPROP_INITIAL_DELTA 0.1 +#define ANN_DEFAULT_LEARN_RATE 0.1 +#define ANN_ALGO_BPROP 0 +#define ANN_ALGO_GD 1 /* Misc */ -#define MAX(a,b) (((a)>(b))?(a):(b)) -#define MIN(a,b) (((a)<(b))?(a):(b)) +#define ANN_MAX(a,b) (((a)>(b))?(a):(b)) +#define ANN_MIN(a,b) (((a)<(b))?(a):(b)) /* Prototypes */ -void AnnResetLayer(struct AnnLayer *layer); -struct Ann *AnnAlloc(int layers); -void AnnFreeLayer(struct AnnLayer *layer); -void AnnFree(struct Ann *net); -int AnnInitLayer(struct Ann *net, int i, int units, int bias); -struct Ann *AnnCreateNet(int layers, int *units); -struct Ann *AnnCreateNet2(int iunits, int ounits); -struct Ann *AnnCreateNet3(int iunits, int hunits, int ounits); -struct Ann *AnnCreateNet4(int iunits, int hunits, int hunits2, int ounits); -struct Ann *AnnClone(struct Ann* net); -size_t AnnCountWeights(struct Ann *net); -void AnnSimulate(struct Ann *net); -void Ann2Tcl(struct Ann *net); -void AnnPrint(struct Ann *net); -float AnnGlobalError(struct Ann *net, float *desidered); -void AnnSetInput(struct Ann *net, float *input); -float AnnSimulateError(struct Ann *net, float *input, float *desidered); -void AnnCalculateGradientsTrivial(struct Ann *net, float *desidered); -void AnnCalculateGradients(struct Ann *net, float *desidered); -void AnnSetDeltas(struct Ann *net, float val); -void AnnResetDeltas(struct Ann *net); -void AnnResetSgradient(struct Ann *net); -void AnnSetRandomWeights(struct Ann *net); -void AnnScaleWeights(struct Ann *net, float factor); -void AnnUpdateDeltasGD(struct Ann *net); -void AnnUpdateDeltasGDM(struct Ann *net); -void AnnUpdateSgradient(struct Ann *net); -void AnnAdjustWeights(struct Ann *net, int setlen); -float AnnBatchGDEpoch(struct Ann *net, float *input, float *desidered, int setlen); -float AnnBatchGDMEpoch(struct Ann *net, float *input, float *desidered, int setlen); -void AnnAdjustWeightsResilientBP(struct Ann *net); -float AnnResilientBPEpoch(struct Ann *net, float *input, float *desidered, int setlen); -float AnnTrain(struct Ann *net, float *input, float *desidered, float maxerr, int maxepochs, int setlen, int algo); -void AnnTestError(struct Ann *net, float *input, float *desired, int setlen, float *avgerr, float *classerr); +ann_float_t AnnTransferFunctionSigmoid(ann_float_t x); +ann_float_t 
AnnTransferFunctionRelu(ann_float_t x); +ann_float_t AnnTransferFunctionTanh(ann_float_t x); +//ann_float_t AnnDerivativeIdentity(ann_float_t x); +ann_float_t AnnDerivativeSigmoid(ann_float_t x); +ann_float_t AnnDerivativeTanh(ann_float_t x); +ann_float_t AnnDerivativeRelu(ann_float_t x); + +void AnnResetLayer(AnnLayer *layer); +AnnRprop *AnnAlloc(int layers); +void AnnFreeLayer(AnnLayer *layer); +void AnnFree(AnnRprop *net); +int AnnInitLayer(AnnRprop *net, int i, int units, int bias); +AnnRprop *AnnCreateNet(int layers, int *units); +AnnRprop *AnnCreateNet2(int iunits, int ounits); +AnnRprop *AnnCreateNet3(int iunits, int hunits, int ounits); +AnnRprop *AnnCreateNet4(int iunits, int hunits, int hunits2, int ounits); +AnnRprop *AnnClone(const AnnRprop* net); +size_t AnnCountWeights(AnnRprop *net); +void AnnSimulate(AnnRprop *net); +void Ann2Tcl(const AnnRprop *net); +void Ann2Js(const AnnRprop *net); +void AnnPrint(const AnnRprop *net); +ann_float_t AnnGlobalError(AnnRprop *net, ann_float_t *desidered); +void AnnSetInput(AnnRprop *net, ann_float_t *input); +ann_float_t AnnSimulateError(AnnRprop *net, ann_float_t *input, ann_float_t *desidered); +void AnnCalculateGradientsTrivial(AnnRprop *net, ann_float_t *desidered); +void AnnCalculateGradients(AnnRprop *net, ann_float_t *desidered); +void AnnSetDeltas(AnnRprop *net, ann_float_t val); +void AnnResetDeltas(AnnRprop *net); +void AnnResetSgradient(AnnRprop *net); +void AnnSetRandomWeights(AnnRprop *net); +void AnnScaleWeights(AnnRprop *net, ann_float_t factor); +void AnnUpdateDeltasGD(AnnRprop *net); +void AnnUpdateDeltasGDM(AnnRprop *net); +void AnnUpdateSgradient(AnnRprop *net); +void AnnAdjustWeights(AnnRprop *net, int setlen); +ann_float_t AnnBatchGDEpoch(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, int setlen); +ann_float_t AnnBatchGDMEpoch(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, int setlen); +void AnnAdjustWeightsResilientBP(AnnRprop *net); +ann_float_t AnnResilientBPEpoch(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, int setlen); +ann_float_t AnnTrainWithAlgoFunc(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, ann_float_t maxerr, int maxepochs, int setlen, AnnTrainAlgoFunc algo_func); +ann_float_t AnnTrain(AnnRprop *net, ann_float_t *input, ann_float_t *desidered, ann_float_t maxerr, int maxepochs, int setlen, int algo); +void AnnTestError(AnnRprop *net, ann_float_t *input, ann_float_t *desired, int setlen, ann_float_t *avgerr, ann_float_t *classerr); #endif /* __NN_H */ diff --git a/tests/Makefile b/tests/Makefile index 574c80f..0584996 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,13 +1,13 @@ all: nn-test-1 nn-test-2 nn-benchmark nn-test-1: nn-test-1.c ../nn.c ../nn.h - $(CC) nn-test-1.c ../nn.c -Wall -W -O2 -o nn-test-1 + $(CC) nn-test-1.c ../nn.c -Wall -W -O2 -o nn-test-1 -lm nn-test-2: nn-test-2.c ../nn.c ../nn.h - $(CC) nn-test-2.c ../nn.c -Wall -W -O2 -o nn-test-2 + $(CC) nn-test-2.c ../nn.c -Wall -W -O2 -o nn-test-2 -lm nn-benchmark: nn-benchmark.c ../nn.c ../nn.h - $(CC) -DUSE_SSE nn-benchmark.c ../nn.c -Wall -W -O3 -o nn-benchmark + $(CC) -DUSE_SSE nn-benchmark.c ../nn.c -Wall -W -O3 -o nn-benchmark -lm clean: rm -f nn-test-1 nn-test-2 nn-benchmark diff --git a/tests/nn-benchmark.c b/tests/nn-benchmark.c index 80868ff..ed2c1c6 100644 --- a/tests/nn-benchmark.c +++ b/tests/nn-benchmark.c @@ -21,35 +21,45 @@ long long mstime(void) { return ust/1000; } -void gen_dataset(struct Ann *nn, float **inputs, float **outputs, int setsize) { - *inputs = 
malloc(sizeof(float)*setsize*NUM_INPUTS); - *outputs = malloc(sizeof(float)*setsize*NUM_INPUTS); - int ilen = INPUT_UNITS(nn); - int olen = OUTPUT_UNITS(nn); +int math_random(int low, int up) { + ann_float_t r = rand() * (1.0 / (RAND_MAX + 1.0)); + r *= (up - low) + 1.0; + return (int)r+low; +} + +void gen_dataset(AnnRprop *nn, ann_float_t **inputs, ann_float_t **outputs, int setsize) { + *inputs = calloc(1, sizeof(ann_float_t)*setsize*NUM_INPUTS); + *outputs = calloc(1, sizeof(ann_float_t)*setsize*NUM_OUTPUTS); + int ilen = ANN_INPUT_UNITS(nn); + int olen = ANN_OUTPUT_UNITS(nn); + int olen_1 = olen - 1; - float *in = *inputs; - float *out = *outputs; + ann_float_t *in = *inputs; + ann_float_t *out = *outputs; for (int j = 0; j < setsize; j++) { for (int k = 0; k < ilen; k++) in[k] = rand() & 1; - int r = rand() & olen; - for (int k = 0; k < olen; k++) { - out[k] = (k == r) ? 1 : 0; - } + //int r = rand() & olen_1; + int r = math_random(0, olen_1); + out[r] = 1; + //printf("%d : %d\n", j, r); + //for (int k = 0; k < olen; k++) { + // out[k] = (k == r) ? 1 : 0; + //} in+= ilen; out+= olen; } } int main(void) { - struct Ann *nn = AnnCreateNet3(NUM_INPUTS, NUM_INPUTS*2, NUM_OUTPUTS); - float *inputs, *outputs; + AnnRprop *nn = AnnCreateNet3(NUM_INPUTS, NUM_INPUTS*2, NUM_OUTPUTS); + ann_float_t *inputs, *outputs; int setsize = 1000; nn->learn_rate = 0.5; gen_dataset(nn, &inputs, &outputs, setsize); int j; - float classerr = 100; + ann_float_t classerr = 100; long long totaltime = 0; int benchmark_milestone = 0; for (j = 0; j < 1000000; j++) { @@ -60,11 +70,12 @@ int main(void) { benchmark_milestone = 1; } long long start = mstime(); - AnnTrain(nn,inputs,outputs,0,1,setsize,NN_ALGO_BPROP); + AnnTrain(nn,inputs,outputs,0,1,setsize,ANN_ALGO_BPROP); long long elapsed = mstime() - start; totaltime += elapsed; AnnTestError(nn,inputs,outputs,setsize,NULL,&classerr); } + AnnFree(nn); return 0; } diff --git a/tests/nn-test-1.c b/tests/nn-test-1.c index 1da9d29..58e5eae 100644 --- a/tests/nn-test-1.c +++ b/tests/nn-test-1.c @@ -10,34 +10,36 @@ #include "../nn.h" int main(void) { - struct Ann *nn = AnnCreateNet3(2, 2, 2); - float inputs[2] = {.05,.10}; - float desired[2] = {.01,.99}; + AnnRprop *nn = AnnCreateNet3(2, 2, 2); + ann_float_t inputs[2] = {.05,.10}; + ann_float_t desired[2] = {.01,.99}; nn->learn_rate = 0.5; /* Input layer. */ - WEIGHT(nn,2,0,0) = .15; - WEIGHT(nn,2,0,1) = .25; + ANN_WEIGHT(nn,2,0,0) = .15; + ANN_WEIGHT(nn,2,0,1) = .25; - WEIGHT(nn,2,1,0) = .20; - WEIGHT(nn,2,1,1) = .30; + ANN_WEIGHT(nn,2,1,0) = .20; + ANN_WEIGHT(nn,2,1,1) = .30; - WEIGHT(nn,2,2,0) = .35; - WEIGHT(nn,2,2,1) = .35; + ANN_WEIGHT(nn,2,2,0) = .35; + ANN_WEIGHT(nn,2,2,1) = .36; - INPUT_NODE(nn,0) = inputs[0]; - INPUT_NODE(nn,1) = inputs[1]; + ANN_INPUT_NODE(nn,0) = inputs[0]; + ANN_INPUT_NODE(nn,1) = inputs[1]; /* Hidden layer. 
*/ - WEIGHT(nn,1,0,0) = .40; - WEIGHT(nn,1,0,1) = .50; + ANN_WEIGHT(nn,1,0,0) = .40; + ANN_WEIGHT(nn,1,0,1) = .50; - WEIGHT(nn,1,1,0) = .45; - WEIGHT(nn,1,1,1) = .55; + ANN_WEIGHT(nn,1,1,0) = .45; + ANN_WEIGHT(nn,1,1,1) = .55; - WEIGHT(nn,1,2,0) = .60; - WEIGHT(nn,1,2,1) = .60; + ANN_WEIGHT(nn,1,2,0) = .60; + ANN_WEIGHT(nn,1,2,1) = .61; + + AnnPrint(nn); int j; for (j = 0; j < 10000; j++) { @@ -56,5 +58,10 @@ int main(void) { } printf("\nAfter training:\n\n"); AnnPrint(nn); + printf("\nTCL simulation:\n\n"); + Ann2Tcl(nn); + Ann2Js(nn); + + AnnFree(nn); return 0; } diff --git a/tests/nn-test-2.c b/tests/nn-test-2.c index 888f6b7..74e70f8 100644 --- a/tests/nn-test-2.c +++ b/tests/nn-test-2.c @@ -10,22 +10,26 @@ #include "../nn.h" int main(void) { - struct Ann *nn = AnnCreateNet3(2, 3, 1); - float inputs[8] = {0,0, 1,0, 0,1, 1,1}; - float desired[4] = {0, 1, 1, 0}; + AnnRprop *nn = AnnCreateNet3(2, 3, 1); + ann_float_t inputs[8] = {0,0, 1,0, 0,1, 1,1}; + ann_float_t desired[4] = {0, 1, 1, 0}; nn->learn_rate = 0.5; int j; for (j = 0; j < 100000; j++) { - float error = AnnTrain(nn, inputs, desired, 0, 1, 4, NN_ALGO_GD); + ann_float_t error = AnnTrain(nn, inputs, desired, 0, 1, 4, ANN_ALGO_GD); printf("Error: %f\n", error); } printf("\nAfter training:\n\n"); for (j = 0; j < 4; j++) { AnnSetInput(nn,inputs+j*2); AnnSimulate(nn); - printf("%f\n", OUTPUT_NODE(nn,0)); + printf("%f\n", ANN_OUTPUT_NODE(nn,0)); } + printf("\nTCL simulation:\n\n"); + Ann2Tcl(nn); + Ann2Js(nn); + AnnFree(nn); return 0; }
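
Note (not part of the patch): a minimal usage sketch of the renamed API, assuming only the identifiers visible in this diff (AnnRprop, ann_float_t, AnnCreateNet3, AnnTrain, ANN_ALGO_BPROP, AnnSetInput, AnnSimulate, ANN_OUTPUT_NODE, AnnFree). It mirrors the style of the test programs above and trains an AND gate with the resilient-backprop path.

/* Hedged usage sketch: train a tiny network on AND-gate data using the
 * renamed types, macros and constants introduced by this patch. */
#include <stdio.h>
#include "nn.h"

int main(void) {
    AnnRprop *nn = AnnCreateNet3(2, 3, 1);            /* 2 inputs, 3 hidden, 1 output. */
    ann_float_t inputs[8]  = {0,0, 1,0, 0,1, 1,1};
    ann_float_t desired[4] = {0, 0, 0, 1};            /* AND truth table. */

    /* ANN_ALGO_BPROP is the renamed NN_ALGO_BPROP (resilient backprop). */
    for (int j = 0; j < 1000; j++)
        AnnTrain(nn, inputs, desired, 0, 1, 4, ANN_ALGO_BPROP);

    for (int j = 0; j < 4; j++) {
        AnnSetInput(nn, inputs + j*2);
        AnnSimulate(nn);
        /* Layer 0 is the output layer, so ANN_OUTPUT_NODE(nn,0) reads it. */
        printf("%f\n", (double)ANN_OUTPUT_NODE(nn,0));
    }
    AnnFree(nn);
    return 0;
}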
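
The patch also exports AnnTrainWithAlgoFunc together with the AnnTrainAlgoFunc typedef, whose signature matches the per-epoch trainers already declared (AnnResilientBPEpoch, AnnBatchGDEpoch, AnnBatchGDMEpoch). Presumably AnnTrain now maps ANN_ALGO_* onto one of these and delegates to AnnTrainWithAlgoFunc, but that is not visible in this hunk, so the sketch below is an assumption about intended use rather than documented behaviour.

/* Hedged sketch: drive training through the new function-pointer entry point.
 * Assumption: algo_func is invoked once per epoch over the whole set, exactly
 * like AnnResilientBPEpoch / AnnBatchGDEpoch, whose signatures it shares. */
#include "nn.h"

static ann_float_t train_rprop(AnnRprop *nn, ann_float_t *in, ann_float_t *out,
                               int setlen, int epochs) {
    /* Pass the exported per-epoch trainer directly as the AnnTrainAlgoFunc. */
    return AnnTrainWithAlgoFunc(nn, in, out, 0 /* maxerr */, epochs, setlen,
                                AnnResilientBPEpoch);
}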
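
Finally, AnnRprop now carries node_transf_func and derivative_func fields (both typed AnnDerivativeFunc), and sigmoid/relu/tanh transfer and derivative functions are exported. Whether AnnCreateNet installs a default pair is not visible in this hunk; the sketch below simply shows the fields being set to the relu pair after creation, as an assumption about how they are meant to be used.

/* Hedged sketch: switch a freshly created network to the relu activation.
 * Assumption: the transfer function and its derivative must be swapped as a
 * pair, and may be changed after AnnCreateNet*(); the patch only shows the
 * struct fields and the function prototypes, not the intended protocol. */
#include "nn.h"

AnnRprop *make_relu_net(void) {
    AnnRprop *nn = AnnCreateNet3(2, 3, 1);
    nn->node_transf_func = AnnTransferFunctionRelu;   /* forward activation */
    nn->derivative_func  = AnnDerivativeRelu;         /* matching derivative */
    nn->learn_rate = ANN_DEFAULT_LEARN_RATE;          /* used only by GD training */
    return nn;
}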