#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <math.h>
#include <time.h>
#include <omp.h>

typedef struct Ann Ann;
typedef struct Layer Layer;
typedef struct Neuron Neuron;
typedef struct Weights Weights;

struct Ann {
    int n;             // number of layers
    double rate;       // learning rate
    Layer **layers;
    Weights **weights; // weights[i] connects layers[i] to layers[i+1]
    Weights **deltas;  // per-weight scratch space used during training
};

struct Layer {
    int n;             // neurons in this layer, excluding the bias neuron
    Neuron **neurons;
};

struct Neuron {
    double (*activation)(Neuron*);
    double (*gradient)(Neuron*);
    double steepness;
    double value;      // activation(sum)
    double sum;        // weighted sum of inputs, including the bias
};

struct Weights {
    int inputs;
    int outputs;
    double **values;   // (inputs+1) x outputs; the extra row holds the bias weights
};

double activation_sigmoid(Neuron*);
double gradient_sigmoid(Neuron*);
double activation_tanh(Neuron*);
double gradient_tanh(Neuron*);
double activation_leaky_relu(Neuron*);
double gradient_leaky_relu(Neuron*);
Ann *anncreate(int, ...);
Layer *layercreate(int, double(*)(Neuron*), double(*)(Neuron*));
Neuron *neuroninit(Neuron*, double (*)(Neuron*), double (*)(Neuron*), double);
Neuron *neuroncreate(double (*)(Neuron*), double (*)(Neuron*), double);
Weights *weightsinitrand(Weights*);
Weights *weightsinitrandscale(Weights*, double);
Weights *weightsinitdouble(Weights*, double);
Weights *weightsinitdoubles(Weights*, double*);
Weights *weightscreate(int, int, int);
double *annrun(Ann*, double*);
double anntrain(Ann*, double*, double*);

double activation_sigmoid(Neuron *in)
{
    return 1.0/(1.0+exp(-in->sum));
}

double gradient_sigmoid(Neuron *in)
{
    double y = in->value;
    return y * (1.0 - y);
}

double activation_tanh(Neuron *in)
{
    return tanh(in->sum);
}

double gradient_tanh(Neuron *in)
{
    return 1.0 - in->value*in->value;
}

double activation_leaky_relu(Neuron *in)
{
    if (in->sum > 0)
        return in->sum;
    return in->sum * 0.01;
}

double gradient_leaky_relu(Neuron *in)
{
    if (in->sum > 0)
        return 1.0;
    return 0.01;
}

// set every row of the matrix to the given per-output values
Weights* weightsinitdoubles(Weights *in, double *init)
{
    int i, o;

    for (i = 0; i <= in->inputs; i++)
        for (o = 0; o < in->outputs; o++)
            in->values[i][o] = init[o];
    return in;
}

Weights* weightsinitdouble(Weights *in, double init)
{
    int i, o;

    for (i = 0; i <= in->inputs; i++)
        for (o = 0; o < in->outputs; o++)
            in->values[i][o] = init;
    return in;
}

// initialize weights uniformly in [-scale/2, scale/2]; the caller is
// expected to have seeded the RNG once (see main)
Weights* weightsinitrandscale(Weights *in, double scale)
{
    int i, o;

    for (i = 0; i <= in->inputs; i++)
        for (o = 0; o < in->outputs; o++)
            in->values[i][o] = (((double)rand()/RAND_MAX) - 0.5) * scale;
    return in;
}

Weights* weightsinitrand(Weights *in)
{
    return weightsinitrandscale(in, 1.0);
}

Neuron* neuroninit(Neuron *in, double (*activation)(Neuron*), double (*gradient)(Neuron*), double steepness)
{
    in->activation = activation;
    in->gradient = gradient;
    in->steepness = steepness;
    in->value = 1.0;
    in->sum = 0;
    return in;
}

Neuron* neuroncreate(double (*activation)(Neuron*), double (*gradient)(Neuron*), double steepness)
{
    Neuron *ret = calloc(1, sizeof(Neuron));

    neuroninit(ret, activation, gradient, steepness);
    return ret;
}

Layer* layercreate(int num_neurons, double(*activation)(Neuron*), double(*gradient)(Neuron*))
{
    Layer *ret = calloc(1, sizeof(Layer));
    int i;

    ret->n = num_neurons;
    // one extra neuron serves as the bias; its value stays at 1.0
    ret->neurons = calloc(num_neurons+1, sizeof(Neuron*));
    for (i = 0; i <= ret->n; i++)
        ret->neurons[i] = neuroncreate(activation, gradient, 1.0);
    return ret;
}

Weights* weightscreate(int inputs, int outputs, int initialize)
{
    int i;
    Weights *ret = calloc(1, sizeof(Weights));

    ret->inputs = inputs;
    ret->outputs = outputs;
    ret->values = calloc(inputs+1, sizeof(double*));
    for (i = 0; i <= inputs; i++)
        ret->values[i] = calloc(outputs, sizeof(double));
    if (initialize)
        weightsinitrand(ret);
    else
        weightsinitdouble(ret, 1.0);
    return ret;
}
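// anncreate builds a fully connected network from a variadic list of
// layer sizes: anncreate(3, 2, 16, 1) in main() below creates a 2-16-1
// network (two inputs, one hidden layer of 16 neurons, one output).
// All layers use the leaky ReLU activation.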
Ann* anncreate(int num_layers, ...)
{
    Ann *ret = calloc(1, sizeof(Ann));
    va_list args;
    int arg;
    int i;

    va_start(args, num_layers);
    ret->n = num_layers;
    ret->rate = 0.25;
    ret->layers = calloc(num_layers, sizeof(Layer*));
    ret->weights = calloc(num_layers-1, sizeof(Weights*));
    ret->deltas = calloc(num_layers-1, sizeof(Weights*));
    for (i = 0; i < num_layers; i++) {
        arg = va_arg(args, int);
        if (arg < 0 || arg > 1000000)
            arg = 0;
        ret->layers[i] = layercreate(arg, activation_leaky_relu, gradient_leaky_relu);
        if (i > 0) {
            ret->weights[i-1] = weightscreate(ret->layers[i-1]->n, ret->layers[i]->n, 1);
            ret->deltas[i-1] = weightscreate(ret->layers[i-1]->n, ret->layers[i]->n, 0);
        }
    }
    va_end(args);
    return ret;
}

// forward pass; returns a malloc'd array of output-layer values that
// the caller must free
double* annrun(Ann *ann, double *input)
{
    int l, i, o;
    int outputs = ann->layers[ann->n - 1]->n;
    double *ret = calloc(outputs, sizeof(double));
    double sum;
    Neuron *O;

    #pragma omp parallel for shared(ann, input) private(i)
    for (i = 0; i < ann->layers[0]->n; i++)
        ann->layers[0]->neurons[i]->value = input[i];
    for (l = 1; l < ann->n; l++) {
        for (o = 0; o < ann->layers[l]->n; o++) {
            O = ann->layers[l]->neurons[o];
            sum = ann->weights[l-1]->values[ann->weights[l-1]->inputs][o]; // bias
            #pragma omp parallel for shared(ann) private(i) reduction(+:sum)
            for (i = 0; i < ann->layers[l-1]->n; i++)
                sum += ann->layers[l-1]->neurons[i]->value * ann->weights[l-1]->values[i][o];
            O->sum = sum;
            O->value = O->activation(O);
        }
    }
    #pragma omp parallel for shared(ret, ann) private(o)
    for (o = 0; o < outputs; o++)
        ret[o] = ann->layers[ann->n - 1]->neurons[o]->value;
    return ret;
}

// one training step on a single example; returns the summed squared error
double anntrain(Ann *ann, double *inputs, double *outputs)
{
    double *error = annrun(ann, inputs);
    double ret = 0.0;
    int noutputs = ann->layers[ann->n-1]->n;
    double acc, sum;
    int o, i, w, n;
    Neuron *O, *I;
    Weights *W, *D, *D2;

    #pragma omp parallel for shared(error, outputs) private(o) reduction(+:ret)
    for (o = 0; o < noutputs; o++) {
        error[o] = outputs[o] - error[o]; // target minus network output
        ret += pow(error[o], 2.0) * 0.5;
    }
    // seed the output-layer deltas with the error, the rest with 1.0
    weightsinitdoubles(ann->deltas[ann->n-2], error);
    #pragma omp parallel for shared(ann) private(i)
    for (i = 0; i < (ann->n-2); i++)
        weightsinitdouble(ann->deltas[i], 1.0);
    // backpropagate the MSE
    D2 = ann->deltas[ann->n-2];
    for (w = ann->n-2; w >= 0; w--) {
        D = ann->deltas[w];
        for (o = 0; o < ann->layers[w+1]->n; o++) {
            O = ann->layers[w+1]->neurons[o];
            acc = O->gradient(O) * O->steepness;
            sum = 1.0;
            if (D2 != D) {
                // hidden layer: fold in the deltas of the layer above
                W = ann->weights[w + 1];
                sum = 0.0;
                #pragma omp parallel for shared(D2, W) private(n) reduction(+:sum)
                for (n = 0; n < D2->outputs; n++)
                    sum += D2->values[o][n] * W->values[o][n];
            }
            #pragma omp parallel for shared(D) private(i)
            for (i = 0; i <= ann->layers[w]->n; i++)
                D->values[i][o] *= acc * sum;
        }
        D2 = D;
    }
    // update weights: apply the deltas, scaled by the learning rate
    // and each input neuron's value (1.0 for the bias neuron)
    #pragma omp parallel for shared(ann) private(w, W, D, i, o, I)
    for (w = 0; w < ann->n-1; w++) {
        W = ann->weights[w];
        D = ann->deltas[w];
        for (i = 0; i <= W->inputs; i++) {
            I = ann->layers[w]->neurons[i];
            for (o = 0; o < W->outputs; o++)
                W->values[i][o] += D->values[i][o] * ann->rate * I->value;
        }
    }
    free(error);
    return ret;
}

// train a 2-16-1 network on XOR until the epoch error drops below 0.001
int main(void)
{
    int i, counter = 0;
    Ann *test;
    double inputs[4][2] = { {1.0, 1.0}, {1.0, 0.0}, {0.0, 1.0}, {0.0, 0.0} };
    double outputs[4] = { 0.0, 1.0, 1.0, 0.0 };
    double error = 1000;

    srand(time(0)); // seed the RNG once for the random weight initialization
    test = anncreate(3, 2, 16, 1);
    while (error >= 0.001) {
        error = 0;
        for (i = 0; i < 4; i++)
            error += anntrain(test, inputs[i], &outputs[i]);
        counter++;
        if (counter % 100 == 1)
            printf("error: %f\n", error);
    }
    printf("error: %f, done after %d epochs\n", error, counter);
    return 0;
}
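// Build note (an assumption, not part of the original source): with GCC
// or Clang this should compile as something like
//
//   cc -O2 -fopenmp ann.c -lm
//
// -lm links the math library for exp/tanh/pow, and -fopenmp enables the
// OpenMP pragmas; without it the pragmas are ignored and the program
// runs serially, which for a network this small is likely faster anyway.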