#include "simple.cuh"
#include <cstdio>
#include <cuda_runtime_api.h>
#include <cudaul/timer.h>

/* Device Kernel (__global__ ...) */
/*----------1.1----------*/
/*
 * Element-wise product: result[i] = a[i] * b[i] for every i < num_elem.
 *
 * Launch layout: 1D grid of 1D blocks, any size. Each thread walks the data
 * with a stride of (gridDim.x * blockDim.x), so the kernel stays correct even
 * when the grid does not cover num_elem exactly. No shared memory is used.
 * All three pointers must be device pointers to at least num_elem floats and
 * must not overlap (they are declared __restrict__).
 */
static __global__ void simple_mult_elementwise_kernel(const float * __restrict__ a,
                                                      const float * __restrict__ b,
                                                      float * __restrict__ result,
                                                      unsigned int num_elem) {
	/* size_t indices: blockIdx.x * blockDim.x would wrap in 32 bits for num_elem close to UINT_MAX */
	const size_t stride = (size_t)gridDim.x * blockDim.x;
	for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < num_elem; i += stride) {
		result[i] = a[i] * b[i];
	}
}

/* Reports a failed CUDA call on stderr; returns true when err is cudaSuccess. */
static bool simple_mult_cuda_ok(cudaError_t err, const char *what) {
	if (err == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "simple_mult: %s failed: %s\n", what, cudaGetErrorString(err));
	return false;
}

/* Host Code */
/*
 * Multiplies two host arrays element by element on the GPU.
 *
 * a, b      host pointers to num_elem floats each (only read)
 * num_elem  number of elements in a and b
 *
 * Returns a host array of num_elem floats allocated with new[]; the caller
 * owns it and must release it with delete[]. For num_elem == 0 an empty
 * array is returned without touching the device. If any CUDA call fails, a
 * message is written to stderr, all device memory is released and NULL is
 * returned.
 */
float* simple_mult(const float *a, const float *b, unsigned int num_elem) {
	/* device memory pointers */
	float *d_a = NULL;
	float *d_b = NULL;
	float *d_result = NULL;

	/* nothing to compute; a launch with zero blocks would be an invalid configuration */
	if (num_elem == 0) {
		return new float[0];
	}

	const size_t num_bytes = (size_t)num_elem * sizeof(float);

	/* allocate memory on device (cudaMalloc) and copy input to device (cudaMemcpy)*/
/*----------1.2----------*/
	/* && short-circuits, so the first failing call stops the chain */
	bool ok = simple_mult_cuda_ok(cudaMalloc((void **)&d_a, num_bytes), "cudaMalloc(d_a)")
		&& simple_mult_cuda_ok(cudaMalloc((void **)&d_b, num_bytes), "cudaMalloc(d_b)")
		&& simple_mult_cuda_ok(cudaMalloc((void **)&d_result, num_bytes), "cudaMalloc(d_result)")
		&& simple_mult_cuda_ok(cudaMemcpy(d_a, a, num_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> device)")
		&& simple_mult_cuda_ok(cudaMemcpy(d_b, b, num_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> device)");

	/* calculate launch parameters (block and grid size) and launch kernel*/
	unsigned int num_threads = 128;
/*----------1.3----------*/
	if (ok) {
		/* ceil(num_elem / num_threads) without the overflow of (num_elem + num_threads - 1) */
		const unsigned int num_blocks = num_elem / num_threads + (num_elem % num_threads != 0 ? 1u : 0u);
		simple_mult_elementwise_kernel<<<num_blocks, num_threads>>>(d_a, d_b, d_result, num_elem);
		/* a launch does not return a status; bad configurations show up here */
		ok = simple_mult_cuda_ok(cudaGetLastError(), "kernel launch");
	}

	/* download results (cudaMemcpy)*/
	float *result = NULL;
/*----------1.4----------*/
	if (ok) {
		result = new float[num_elem];
		/* blocking copy: waits for the kernel and reports errors raised while it ran */
		if (!simple_mult_cuda_ok(cudaMemcpy(result, d_result, num_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(result -> host)")) {
			delete[] result;
			result = NULL;
		}
	}

	/* free memory on device */
	/* cudaFree(NULL) is a no-op, so this is safe after a partial allocation */
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_result);

	/* done */
	return result;
}
