#include <iostream>
#include <stdexcept>
#include <memory>

#include "openglmain.h"

#include <cudaul/cuda_error.h>
#include <cudaul/initCUDA.h>
#include <cudaul/timer.h>

#include <GLee.h>
#include <GL/gl.h>		// OpenGL header
#include <GL/glu.h>		// OpenGL Utility header
#include <GL/glut.h>	// GLUT header

#include <cuda_runtime_api.h>
#include <cuda_gl_interop.h>

#include <oogl/gl_error.h>
#include <oogl/Image.h>

#include "oglex.cuh"

/* Number of frames drawn since idle() last printed the frame rate. */
unsigned int frameCounter = 0u;

/*
 * GL object names shared by init(), display() and cleanup().
 * GLuint is an unsigned type, so initialising with -1 yields the largest
 * representable GLuint value; the cast makes that wrap-around explicit.
 */
GLuint pbo_input = static_cast<GLuint>(-1);
GLuint pbo_output = static_cast<GLuint>(-1);
GLuint textureId = static_cast<GLuint>(-1);

/* Size of the display texture in pixels; assigned in init(). */
int textureWidth;
int textureHeight;

/* Defined further down; declared here so keyboard() can call it. */
void cleanup();

/**
 * Creates a GL buffer object, fills its data store and registers it with CUDA.
 *
 * @param size  size of the buffer's data store in bytes
 * @param data  initial contents handed to glBufferData, or NULL to leave the
 *              store uninitialised
 * @return the GL buffer name, or 0 if no buffer name could be generated or the
 *         CUDA registration failed (callers test the result against 0)
 */
GLuint create_pbo(unsigned size, void *data) {
	GLuint id = 0;
	glGenBuffers(1, &id);
	if(id == 0) {
		/* glGenBuffers did not hand out a name; nothing to fill or register */
		LOG_GL_ERRORS();
		return 0;
	}

	glBindBuffer(GL_ARRAY_BUFFER, id);
	glBufferData(GL_ARRAY_BUFFER, size, data, GL_DYNAMIC_DRAW);
	glBindBuffer(GL_ARRAY_BUFFER, 0);
	LOG_GL_ERRORS();

	//for use with CUDA, we need to register it as a CUDA buffer!
	const cudaError_t status = cudaGLRegisterBufferObject(id);
	LOG_CUDA_ERRORS();
	if(status != cudaSuccess) {
		/* an unregistered buffer is useless to the CUDA side: release it and report failure */
		glDeleteBuffers(1, &id);
		return 0;
	}
	return id;
}

bool init() {

	std::auto_ptr<oogl::Image> image(oogl::loadImage("models/lena_std.jpg"));
	if(image->getFormat()!=GL_RGB) {
		std::cerr << "can only work with rgb images" << std::endl;
		return false;
	}

	int image_size = image->getWidth()*image->getHeight()*image->getBytesPerPixel();

	pbo_input = create_pbo(image_size, image->getData());
	if(!pbo_input) {
		std::cerr << "could not create PBO!" << std::endl;
		return false;
	}
	pbo_output = create_pbo(image_size, NULL);
	if(!pbo_output) {
		std::cerr << "could not create PBO!" << std::endl;
		return false;
	}

	//create a texture to render the pbo
	glGenTextures(1, &textureId);
	glBindTexture(GL_TEXTURE_2D, textureId);
	textureWidth = image->getWidth();
	textureHeight = image->getHeight();
	glTexImage2D(GL_TEXTURE_2D, 0, 3, textureWidth, textureHeight, 0, GL_RGB, GL_UNSIGNED_BYTE, NULL);

	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);

	glBindTexture(GL_TEXTURE_2D, GL_NONE);

	LOG_GL_ERRORS();

	//always(!) need GL_TEXTURE_2D
	glEnable(GL_TEXTURE_2D);

	return true;
}

/**
 * Releases a pixel buffer object.
 *
 * The CUDA registration is removed before the buffer is deleted from the GL.
 *
 * @param pbo  GL name of the buffer to release
 */
void delete_pbo(GLuint pbo) {
	cudaGLUnregisterBufferObject(pbo);
	LOG_CUDA_ERRORS();

	const GLuint names[1] = { pbo };
	glDeleteBuffers(1, names);
	LOG_GL_ERRORS();
}

void cleanup() {
	if(textureId >= 0) //cleanup the texture
		glDeleteTextures(1, &textureId);

	if(pbo_input >= 0)
		delete_pbo(pbo_input);

	if(pbo_output >= 0)
		delete_pbo(pbo_output);
}

void render_textured_quad() {
	glPushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);

	glEnableClientState(GL_VERTEX_ARRAY);
	glEnableClientState(GL_TEXTURE_COORD_ARRAY);

	const GLfloat c_vertices[]  = {-1,-1, +1,-1, +1,+1, -1,+1};
	/* Not flipped? */
	const GLfloat c_texcoords[] = {+0,+0, +1,+0, +1,+1, +0,+1};
	/* Flipped? */
	//const GLfloat c_texcoords[] = {+0,+1, +1,+1, +1,+0, +0,+0};

	glVertexPointer(2, GL_FLOAT, 0, c_vertices);
	glTexCoordPointer(2, GL_FLOAT, 0, c_texcoords);
	glDrawArrays(GL_QUADS, 0, 4);

	glPopClientAttrib();
}


void display() {
	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

	/* drag PBOs into CUDA and process*/
	{
		unsigned char *input_d = NULL;
		unsigned char *output_d = NULL;
/*----------2.1----------*/
		LOG_CUDA_ERRORS();

		//Note: Adding this timer will slow down execution because of the additional synchronization to cuda!
		cudaul::cudatimer time("Processing Time");
		time.start();
		process_image(output_d, input_d, textureWidth, textureHeight);
		time.stop();

/*----------2.2----------*/
		LOG_CUDA_ERRORS();
	}
	/* fill display texture with contents of PBO */

	glBindTexture(GL_TEXTURE_2D,textureId);

	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo_output);
	//copy the pbo into the existing texture instead of creating a new one with glTexImage2D
	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, textureWidth, textureHeight, GL_RGB, GL_UNSIGNED_BYTE, NULL);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, GL_NONE);

	render_textured_quad();

	glBindTexture(GL_TEXTURE_2D,GL_NONE);

	glutSwapBuffers();

	frameCounter++;
	LOG_GL_ERRORS();
}


void reshape(int w, int h) {
	glViewport(0, 0, w, h);

	glMatrixMode(GL_PROJECTION);
	glLoadIdentity();
	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
}


/**
 * GLUT idle callback: prints the frame rate once more than a second has passed
 * since the previous report and then requests a redraw.
 */
void idle() {
	/* GLUT elapsed time (ms) at which the current measuring interval started */
	static int interval_start_ms = 0;

	const int now_ms = glutGet(GLUT_ELAPSED_TIME);
	const int elapsed_ms = now_ms - interval_start_ms;

	if(elapsed_ms > 1000) { /* every second */
		const float elapsed_s = static_cast<float>(elapsed_ms/1000.);
		const float fps = frameCounter / elapsed_s;
		printf("%.2f fps\n", fps);

		/* begin the next measuring interval */
		frameCounter = 0;
		interval_start_ms = now_ms;
	}

	glutPostRedisplay();
}

/**
 * GLUT keyboard callback: Escape releases the GL/CUDA resources and exits the
 * program; every other key is ignored.
 *
 * @param key  ASCII code of the pressed key
 * @param x    mouse x position at the time of the key press (unused)
 * @param y    mouse y position at the time of the key press (unused)
 */
void keyboard(unsigned char key, int x, int y) {
	const unsigned char escape_key = 27;

	if(key != escape_key)
		return;

	cleanup();
	exit(0);
}

/**
 * Initialises GLUT, opens an 800x600 double-buffered RGBA window with a depth
 * buffer and installs the display, reshape, idle and keyboard callbacks.
 *
 * @param argc  argument count, forwarded to glutInit
 * @param argv  argument vector, forwarded to glutInit
 * @return the window identifier returned by glutCreateWindow
 */
int setupGLUT(int argc, char** argv) {
	glutInit(&argc, argv);
	// glutInitContextVersion(3, 0);

	/* window properties have to be set before the window is created */
	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
	glutInitWindowPosition(100, 100);
	glutInitWindowSize(800, 600);

	const int window = glutCreateWindow("ex7cAdvancedShaderUsingCUDA");

	/* callbacks are attached to the window created above */
	glutKeyboardFunc(keyboard);
	glutIdleFunc(idle);
	glutReshapeFunc(reshape);
	glutDisplayFunc(display);

	return window;
}

/**
 * Entry point of the example: initialises CUDA, then GLUT/OpenGL, loads the
 * resources and enters the GLUT main loop.
 *
 * @param argc  argument count, forwarded to GLUT
 * @param argv  argument vector, forwarded to GLUT
 * @return 1 if CUDA initialisation threw, -1 if init() failed, 0 otherwise
 */
int oglexample(int argc, const char* argv[]) {
	try {
		cudaul::initCUDAWithOpenGL();
	} catch(const std::runtime_error& e) {
		std::cerr << "Failed to initialize CUDA!, Error was:\n\t" << e.what() << std::endl;
		return 1;
	}

	/* Init OpenGL + GLUT (AFTER!) CUDA */
	setupGLUT(argc, const_cast<char**>(argv));

	const bool resources_ready = init();
	if(resources_ready) {
		glutMainLoop();
		return 0;
	}

	return -1;
}
