#include "oglex.cuh"
#include <iostream>
#include <cuda_runtime_api.h>

/* Restricts f to the range [a, b]; callable from both host and device code.
   The upper bound is applied first, so if a > b the result is a. */
inline __device__ __host__ int clamp(int f, int a, int b)
{
    // cap the value at the upper bound ...
    const int capped = (f < b) ? f : b;
    // ... then lift it to the lower bound if it fell below
    return (capped > a) ? capped : a;
}

/* Device Code */
// Number of neighbouring pixels the filter reaches in each direction.
const int FILTER_RADIUS = 5;

// Thread block dimensions used for the kernel launch.
const int BLOCK_WIDTH  = 16;
const int BLOCK_HEIGHT = 16;

// Block dimensions extended by a FILTER_RADIUS wide border on every side.
const int EXT_BLOCK_WIDTH  = 2*FILTER_RADIUS + BLOCK_WIDTH;
const int EXT_BLOCK_HEIGHT = 2*FILTER_RADIUS + BLOCK_HEIGHT;

// Element counts: threads per block, pixels under the square filter window,
// and pixels in the extended block.
const int BLOCK_AREA     = BLOCK_HEIGHT*BLOCK_WIDTH;
const int FILTER_AREA    = (2*FILTER_RADIUS + 1)*(2*FILTER_RADIUS + 1);
const int EXT_BLOCK_AREA = EXT_BLOCK_HEIGHT*EXT_BLOCK_WIDTH;

/*
 * Simple kernel to convert to greyscale.
 *
 * Each thread handles the single pixel at the (x, y) position given by its
 * 2D block and thread index. Threads whose position lies outside the
 * width x height image do nothing, so the grid may overhang the image.
 *
 * dst    - output image, indexed row-major as y*width + x; all three
 *          components of a pixel receive the same grey value
 * src    - input image, indexed the same way as dst
 * width  - image width in pixels
 * height - image height in pixels
 */
__global__
void kernel_process_nosharedmem(uchar3* dst, const uchar3* src, unsigned int width, unsigned int height) {
	//global x and y coordinate in image (unsigned, matching the type of width/height)
	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

	//threads in the overhanging part of the grid have no pixel to process
	if(x >= width || y >= height)
		return;

	//index of the current pixel in global memory; computed in size_t so that
	//y*width cannot wrap around 32 bits on very large images
	const size_t index = static_cast<size_t>(y)*width + x;

	/*----------2.3----------*/

	//read the source pixel once
	const uchar3 pixel = src[index];

	//compute greyscale value from r,g,b of input image (the fractional part is truncated)
	const unsigned char grey = pixel.x * 0.3f + pixel.y * 0.59f + pixel.z * 0.11f;

	//assign value to output image
	dst[index].x = grey;
	dst[index].y = grey;
	dst[index].z = grey;
}

/*
 * Shared memory example.
 *
 * Every block first copies an EXT_BLOCK_WIDTH x EXT_BLOCK_HEIGHT tile of src
 * into shared memory: the block's own pixels plus a border of FILTER_RADIUS
 * pixels on every side. Tile positions that fall outside the image are
 * clamped to the nearest valid pixel coordinate. After the barrier each
 * in-image thread knows its global pixel index and its position (sx, sy)
 * inside the tile.
 *
 * Launch requirement: the tile is sized at compile time from BLOCK_WIDTH and
 * BLOCK_HEIGHT, while the tile origin and the in-tile coordinates come from
 * blockDim/threadIdx, so the kernel has to be launched with
 * blockDim == (BLOCK_WIDTH, BLOCK_HEIGHT).
 *
 * NOTE: the per-pixel work (section 3.1) is not filled in yet, so the kernel
 * currently never writes to dst.
 *
 * dst    - output image, indexed row-major as y*width + x
 * src    - input image, indexed the same way
 * width  - image width in pixels
 * height - image height in pixels
 */
__global__
void kernel_process_sharedmem(uchar3* dst, const uchar3* src, unsigned int width, unsigned int height) {
	//allocate shared memory
	__shared__ uchar3 sharedMem[EXT_BLOCK_AREA];

	//global x and y coordinate of the first pixel of the extended block
	//(negative for blocks on the left/top edge of the image)
	const int bx = blockIdx.x*blockDim.x - FILTER_RADIUS;
	const int by = blockIdx.y*blockDim.y - FILTER_RADIUS;

	//linear number of this thread inside the block, and the number of threads
	//that take part in the copy; both come from blockDim so that start and
	//stride of the copy loop always agree and no tile element is skipped
	const int threadNumber = threadIdx.y*blockDim.x + threadIdx.x;
	const int threadsPerBlock = blockDim.x*blockDim.y;

	//copy data: the tile has more elements than the block has threads,
	//so each thread copies several elements
	for(int copyIndex = threadNumber; copyIndex < EXT_BLOCK_AREA; copyIndex += threadsPerBlock)
	{
		//compute global x,y to copy based on current index in extended block
		const int cx = clamp(bx+copyIndex%EXT_BLOCK_WIDTH,0,width-1);
		const int cy = clamp(by+copyIndex/EXT_BLOCK_WIDTH,0,height-1);
		//copy memory (source index in size_t so cy*width cannot wrap around 32 bits)
		sharedMem[copyIndex] = src[static_cast<size_t>(cy)*width + cx];
	}

	//wait until all threads are finished with copying
	__syncthreads();

	//global x and y coordinate in image
	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

	if(x < width && y < height) {
		//index of the current pixel in global memory
		size_t index = static_cast<size_t>(y)*width + x;

		//coordinate of the current pixel in shared memory (extended block)
		int sx = threadIdx.x+FILTER_RADIUS;
		int sy = threadIdx.y+FILTER_RADIUS;

		/*----------3.1----------*/
	}
}

/* Host Code */

/*
 * Launches the image processing kernel over a width x height image.
 *
 * output - destination buffer, reinterpreted as width*height uchar3 pixels
 * input  - source buffer, reinterpreted the same way
 * width  - image width in pixels
 * height - image height in pixels
 *
 * Both buffers are handed to the kernel unchanged and dereferenced there, so
 * they have to be addressable from device code. The launch is not followed by
 * a synchronisation here. A failed launch is reported on std::cerr; the error
 * is left in place so the caller can still query it.
 */
void process_image(unsigned char* output, const unsigned char* input, unsigned int width, unsigned int height) {
	/* an empty image has nothing to process, and a grid with a zero dimension is not a valid launch configuration */
	if(width == 0 || height == 0)
		return;

	/* in cuda, we have uchar3 for 3-component (e.g.: r,g,b) unsigned char data */
	uchar3 *output_d = (uchar3*)(output);
	const uchar3 *input_d = (const uchar3*)(input);

	/* set up launch parameters */
	dim3 dim_block(BLOCK_WIDTH, BLOCK_HEIGHT); //!!!always check x*y <= 512 (higher for compute capability >=2)
	/* round up so that images whose size is not a multiple of the block size are fully covered */
	dim3 dim_grid((width+dim_block.x-1)/dim_block.x, (height+dim_block.y-1)/dim_block.y);

	/*----------3.2----------*/
	kernel_process_nosharedmem<<<dim_grid, dim_block>>>(output_d, input_d, width, height);

	/* a kernel launch returns no status; peek (rather than get) so the error state stays visible to the caller */
	const cudaError_t launch_status = cudaPeekAtLastError();
	if(launch_status != cudaSuccess)
		std::cerr << "process_image: kernel launch failed: " << cudaGetErrorString(launch_status) << std::endl;
}

