#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <cudaul/cuda_error.h>
#include <cudaul/initCUDA.h>
#include <cudaul/timer.h>
#include "simple.cuh"

#include "openglmain.h"

/**
 * Runs the element-wise multiplication demo.
 *
 * Fills two host arrays with pseudo-random values in [0, 1), computes their
 * element-wise product on the CPU as a reference, then calls simple_mult()
 * ten times and prints the summed squared difference between its result and
 * the CPU reference after each call.
 *
 * @return 0 on success; 1 if CUDA initialization fails or simple_mult()
 *         returns a null result pointer.
 */
int simplecudaexample() {
	try {
		cudaul::initCUDA();
	} catch(const std::runtime_error& e) {
		std::cerr << "Failed to initialize CUDA!, Error was:\n\t" << e.what() << std::endl;
		return 1;
	}

	// problem size and number of repeated GPU runs
	const unsigned int num_elem = 1000000;
	const int num_runs = 10;

	// host buffers; std::vector releases them on every exit path
	std::vector<float> mem_a(num_elem);
	std::vector<float> mem_b(num_elem);
	std::vector<float> ref_c(num_elem);

	// initialize with random data in [0, 1)
	const float rand_scale = static_cast<float>(RAND_MAX) + 1.0f;
	for (unsigned int i = 0; i < num_elem; ++i) {
		mem_a[i] = rand() / rand_scale;
		mem_b[i] = rand() / rand_scale;
	}

	// run computation on CPU (this is the loop that simple_mult parallelizes)
	for (unsigned int i = 0; i < num_elem; ++i) {
		ref_c[i] = mem_a[i] * mem_b[i];
	}

	// run computation on CUDA
	for (int run = 0; run < num_runs; ++run) {

		// cuda based timer to measure GPU execution time (more useful for asynchronous kernel calls)
		cudaul::cudatimer time("simple_mult");
		time.start();
		// &v[0] yields the contiguous storage; num_elem > 0 so this is valid
		float *result = simple_mult(&mem_a[0], &mem_b[0], num_elem);
		time.stop();

		LOG_CUDA_ERRORS();

		// Guard against dereferencing a null pointer below.
		// NOTE(review): whether simple_mult signals failure via NULL is not
		// visible from this file -- confirm against simple.cuh.
		if (result == NULL) {
			std::cerr << "simple_mult returned no result buffer" << std::endl;
			return 1;
		}

		// compute summed squared error against the CPU reference
		float sum_error = 0;
		for (unsigned int k = 0; k < num_elem; ++k) {
			const float diff = ref_c[k] - result[k];
			sum_error += diff * diff;
		}

		// The result buffer is owned by this function and released with delete[]
		// (presumably simple_mult allocates it with new[] -- confirm).
		delete [] result;
		std::cout << "Total squared error: " << sum_error << std::endl;
	}

#ifdef _WIN32
	// "PAUSE" is a Windows command-shell built-in; keep the console open there only
	system("PAUSE");
#endif

	// done!
	return 0;
}

/**
 * Program entry point.
 *
 * Runs the simple CUDA multiplication example and uses its status as the
 * process exit code, so a failed run is visible to the calling shell.
 *
 * @param argc  number of command-line arguments (only needed by the
 *              currently disabled OpenGL example)
 * @param argv  command-line arguments (see argc)
 * @return the value returned by simplecudaexample()
 */
int main(int argc, const char* argv[]) {
	// only consumed by the disabled oglexample call below
	(void)argc;
	(void)argv;

/*----------2.4----------*/
	const int status = simplecudaexample();
	//oglexample(argc,argv);

	return status;
}
