
#define SCR_W	(80*3)	//output image width in pixels
#define SCR_H	(50*3)	//output image height in pixels
#define SCR_C	4	//bytes per pixel in the output buffer (the draw kernel only writes the first 3)

//slots of bObj_t.tex[], i.e. which role a texture plays for an object
#define TEX_COL	0    //color, scaled by the facing term in traceRay (should there be modes? like add/sub/alpha?)
#define TEX_REF	1    //reflection: RGB masks the reflected light, the 4th channel drives the reflection sharpness
#define TEX_LUM	2    //self-illumination, added to the pixel as-is
#define TEX_TRS 3	//transparency, not sampled yet (should there be modes? like add/sub/alpha?)
#define MAX_TEX 4	//max textures per object

#define TEXMAX	5	//max textures for the whole scene/system/program (matches tex0..tex4 below)
					//parameter list to append to the definition of any function that samples textures
#define TEX_ARGS	read_only image2d_t tex0,\
					read_only image2d_t tex1,\
					read_only image2d_t tex2,\
					read_only image2d_t tex3,\
					read_only image2d_t tex4
					//matching argument list to append to calls of such functions
#define TEX_PARM	tex0,\
					tex1,\
					tex2,\
					tex3,\
					tex4

#define RAY_MAX 32				//capacity of the per-pixel ray list; rays add overhead even when unused, more means more depth but less speed
#define STOC_SAMPLE	128			//maximum ray splitting factor, less is noisier but gives more depth
//minimal contribution: below this value a reflection is not traced (a flat fallback color is used instead)
#define MIN_CONTRIB (1.f/8)		//lower = less depth but less noise

//using a baked random doesn't seem to make much of a difference
#define BAKED_RANDOM

//map the C math names used in this file onto OpenCL built-ins (sqrtf uses the native_ variant)
#define sqrtf	native_sqrt
#define sinf	sin
#define cosf	cos

//sampler used for every texture fetch in getTexCol: normalized coordinates, repeat addressing, linear filtering
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_REPEAT | CLK_FILTER_LINEAR;

//one mesh vertex as stored in scene_t.v
typedef struct{
	float3	v;		//vertex position
	float3	n;		//vertex normal (blended across the triangle in traceRay)
	float2	t;		//texture coordinates
	float2	pad;	//only for alignment
}vert_t;

//one triangle as stored in scene_t.p
typedef struct{
	uint3	v;		//indices of the 3 vertices; the owning object's vOffs is added before indexing scene_t.v
	float3	n;		//face normal
}poly_t;
/*
typedef struct{
	int		vnum;			//number of vertices
	int		pnum;			//number of polys
	int		fnum;			//number of frames
	vert_t*	vert;			//the vertices
	poly_t*	poly;			//the polys
	float	x, y, z;		//position
	float	h, p, b;		//rotation
	int		tex[MAX_TEX];	//texture index	really useful ? per poly or per object ?
	int		frame;			//current frame
	float	ff;				//float frame for slow animations
	vert_t	bbox[8];		//8 points of its bounding box
	float	bx, by, bz, br;	//bounding sphere position + radius
}obj_t;
*/
//camera description
//NOTE(review): not referenced by the code visible in this file, which reads scene_t.cam instead
typedef struct{
	float	x, y, z;		//position
	float	h, p, b;		//rotation
	float	dst;			//distance from cam to plane (fov)
}cam_t;

//baked stuff
//one object of the baked scene: a range of vertices and polys inside the shared scene_t pools
typedef struct{
	int vOffs;			//index of the object's first vertex in scene_t.v
	int pOffs;			//index of the object's first poly in scene_t.p
	int vnum;			//number of vertices
	int pnum;			//number of polys
	int tex[MAX_TEX];	//texture ID per slot (TEX_COL, TEX_REF, ...), resolved by getTexCol
	//add bbox here
	//add bsphere here
}bObj_t;

//the whole baked scene, handed to the kernel as a single __global buffer
typedef struct{
	vert_t v[2048];		//vertex pool shared by all objects; original size note: 12*32 (3+3+2) — unverified
	poly_t p[2048];		//poly pool shared by all objects; original size note: 8*32  (3+3) — unverified
	bObj_t o[64];		//objects; original size note: 8x32 — unverified
	float sky[4][3];	//sky gradient colors, RGB: [0] straight down, [1] just below the horizon, [2] just above it, [3] straight up
	float3 cam[4];		//pos, left, up, forward (x,y,z)
	int objNum;			//number of entries of o[] in use
	int vnum;			//vertex count — presumably the number of entries of v[] in use; not read by the code in this file
	int pnum;			//poly count — presumably the number of entries of p[] in use; not read by the code in this file
	//	int lightNum;
	//	light_t* l;
}scene_t;

//one ray waiting to be traced
typedef struct{
	float3 org;				//origin
	float3 dir;				//direction (normalized by traceRay before use)
	float3 contrib;			//per-channel weight: the traced color is multiplied by it before being added to the current pixel
}ray_t;

//per-pixel work list: traceRay appends reflection rays at the end, the draw kernel consumes them in order
typedef struct{
	ray_t ray[RAY_MAX];				//the rays to process
	int ray_cnt;					//total number of rays queued so far (never exceeds RAY_MAX)
	int ray_cur;					//index of the ray currently being processed
	__global scene_t* s;	//ref to the scene to be rendered
}rayList_t;

//----------------------------------------------------------------------------//
//mirrors v about the plane of normal n and returns the mirrored vector
//(the result is an exact mirror image only when n has unit length)
float3 reflect(float3 n, float3 v)
{
	const float twiceProj = 2 * dot(v, n);
	return v - twiceProj * n;
}
//----------------------------------------------------------------------------//
#ifndef BAKED_RANDOM
	//computed random: the generator state is one unsigned integer advanced by a linear congruential step
	typedef unsigned int rnd_t;
	//declares the state `_seed` and the `seed` pointer used by the rand*() macros, in the current scope
	#define srand(A) rnd_t _seed=(A); rnd_t* seed=&_seed;
	//advances the state and returns its upper 16 bits (0..0xFFFF)
	unsigned short int _rand(rnd_t *seed)
	{
	//  *seed = *seed * 0x343fd + 0x269ec3;
		//the product is computed in 64 bits, then truncated back to the 32-bit state on assignment
		*seed = (*seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1);
		return *seed >> 16;
	}
	//signed float, roughly between -1 and 1
	float _randf(rnd_t* seed)
	{
		return ((_rand(seed)&0x1FFF)/(float)0xFFF)-1.f;
	}
	//unsigned float between 0 and 1
	float _randuf(rnd_t* seed)
	{
		return (_rand(seed) & 0xFFF) / (float)(0xFFF);
	}
	//three independent signed floats
	float3 _randf3(rnd_t* seed)
	{
		return (float3)(_randf(seed), _randf(seed), _randf(seed));
	}
#else
	//baked random: values are read from a table of precomputed float4, cycling over RAND_MAX entries
	typedef struct{
		__global float4* rdata;	//the table; xyz feeds _randf3, w feeds _randuf
		int seed;				//index of the last entry handed out
	}rnd_t;
	#define RAND_MAX (256)	//entries 0..RAND_MAX-1 of the table are read; must be a power of two for the masks below
	//declares `_seed`/`seed` in the current scope; needs a `randomData` table visible at the point of use
	#define srand(A) rnd_t _seed={randomData,A}; rnd_t* seed = &_seed;
	//next table entry, xyz part
	float3 _randf3(rnd_t* r)
	{
		r->seed++;
		r->seed &= RAND_MAX - 1;	//wrap around, also brings any initial seed into range
		return r->rdata[r->seed].xyz;
	}
	//next table entry, w part
	//assumes the table's w components lie in 0..1 — TODO confirm against the host code filling it
	float _randuf(rnd_t* r)
	{
		r->seed++;
		r->seed &= RAND_MAX - 1;
		return r->rdata[r->seed].w;
	}
	//_randuf remapped from 0..1 to -1..1
	float _randf(rnd_t* r)
	{
		return _randuf(r) * 2.f - 1.f;
	}
	//integer between 0 and 0xFFFF; built on the unsigned float so it never goes negative,
	//which matches the non-baked variant and the rand() contract below
	int _rand(rnd_t* r)
	{
		return (int)(_randuf(r) * 0xFFFF);
	}
#endif
	//shorthands; they expect a `seed` pointer in scope (see srand)
	#define rand()		_rand(seed)		//fetch an unsigned int between 0 and 0xFFFF
	#define randf()		_randf(seed)	//fetch a signed float between -1 and 1
	#define randuf()	_randuf(seed)	//fetch an unsigned float between 0 and 1
	#define randf3()	_randf3(seed)	//fetch a signed float3 between -1 and 1
//----------------------------------------------------------------------------//
//Ray/triangle intersection test (Moller-Trumbore), without back-face culling.
//Returns true and stores the ray parameter of the hit in *out when the ray hits the triangle
//in front of its origin; returns false and leaves *out untouched otherwise.
//The stored parameter is a distance only when D has unit length.
int triangle_intersection(
	const float3	V1,  // Triangle vertices
	const float3	V2,
	const float3	V3,
	const float3	O,	//Ray origin
	const float3	D,	//Ray direction
	float* out)			//returns distance from ray's origin
{
	//single-precision literal: keeps the kernel free of double constants
	const float EPSILON = 0.000001f;

	//Find vectors for two edges sharing V1
	const float3 e1 = V2 - V1;
	const float3 e2 = V3 - V1;

	//Begin calculating determinant - also used to calculate u parameter
	const float3 P = cross(D, e2);
	const float det = dot(e1, P);
	//if determinant is near zero, ray lies in plane of triangle or ray is parallel to plane of triangle
	//NOT CULLING
	if (det > -EPSILON && det < EPSILON) return false;
	const float inv_det = 1.f / det;

	//calculate distance from V1 to ray origin
	const float3 T = O - V1;

	//Calculate u parameter and test bound
	const float u = dot(T, P) * inv_det;
	//The intersection lies outside of the triangle
	if (u < 0.f || u > 1.f) return false;

	//Prepare to test v parameter
	const float3 Q = cross(T, e1);

	//Calculate V parameter and test bound
	const float v = dot(D, Q) * inv_det;
	//The intersection lies outside of the triangle
	if (v < 0.f || u + v  > 1.f) return false;

	const float t = dot(e2, Q) * inv_det;

	if (t > EPSILON) { //ray intersection
		*out = t;
		return true;
	}

	// No hit, no win (the triangle's plane is crossed behind the origin)
	return false;
}
//----------------------------------------------------------------------------//

//Resolves a texture ID into a color for the given uv.
//IDs 0..4 sample the matching image (tex0..tex4) through the file-level sampler;
//any other ID is not a texture: its 4 bytes are returned as a constant color scaled to 0..1.
inline float4 getTexCol(int texID, float2 uv, TEX_ARGS)
{
	if (texID == 0) return read_imagef(tex0, sampler, uv);
	if (texID == 1) return read_imagef(tex1, sampler, uv);
	if (texID == 2) return read_imagef(tex2, sampler, uv);
	if (texID == 3) return read_imagef(tex3, sampler, uv);
	if (texID == 4) return read_imagef(tex4, sampler, uv);

	//treat tex ID (int) as [A]RGB (byte)
	const uchar4 packed = *((uchar4*)&texID);
	return convert_float4(packed) / 255.f;
}
//----------------------------------------------------------------------------//
//distance reported by collide() when nothing was hit
#define MAX_DIST (999999.f)
//Finds the closest polygon hit by a ray, testing every poly of every object of the scene.
//	org, dir	ray origin and direction (dir is normalized here)
//	s			the scene
//	out_oID		receives the index of the hit object, -1 on a miss
//	out_poly	receives the index of the hit poly in s->p (object offset included), -1 on a miss
//Returns the distance to the hit, or MAX_DIST on a miss.
float collide(float3 org, float3 dir, const __global scene_t* s, int* out_oID, int *out_poly)
{
	float min_dst = MAX_DIST;
	int min_o = -1;
	int min_p = -1;		//defined even on a miss, so the caller never receives an indeterminate index

	dir = normalize(dir);

	for (int o = 0; o < s->objNum; o++)
	{
		int poff = s->o[o].pOffs;
		int voff = s->o[o].vOffs;
		for (int p = 0; p < s->o[o].pnum; p++)
		{
			//check for normal direction: polys whose face normal has a negative dot product
			//with the ray direction are skipped (would need to flip for transparency)
			float3 normal = s->p[poff + p].n;
			if (dot(dir, normal) < 0)continue;

			//setup a triangle (poly indices are relative to the object's first vertex)
			uint3 vi = s->p[poff + p].v + voff;
			float3 v[3] = {
				s->v[vi.s0].v,
				s->v[vi.s1].v,
				s->v[vi.s2].v,
			};

			//check for collision; hits closer than 0.001 are ignored
			//(presumably so a ray does not hit the surface it starts from)
			float dst;
			if (triangle_intersection(v[0], v[1], v[2], org, dir, &dst))
				if (dst>0.001f && dst<min_dst){
					min_dst = dst;
					min_o = o;
					min_p = poff + p;
				}
		}
	}
	*out_oID = min_o;
	*out_poly = min_p;
	return min_dst;
}
//----------------------------------------------------------------------------//
//Traces the current ray of the list (rl->ray[rl->ray_cur]) and returns the color it contributes
//to the pixel, already multiplied by the ray's contrib.
//On a hit, reflection rays may be appended to rl (rl->ray_cnt grows, never past RAY_MAX);
//their colors are collected when the caller traces them in turn.
//On a miss, the sky gradient is returned.
//	rl		ray list and scene reference
//	seed	random generator state, used through the rand*() macros
float3 traceRay(rayList_t* rl, rnd_t *seed, TEX_ARGS)
{
			float3	color		= (float3)(0.f);
	const	ray_t*	ray			= &rl->ray[rl->ray_cur];
	const	float3	contrib		= ray->contrib;
	const	float3	dir			= normalize(ray->dir);
	const __global scene_t* s	= rl->s;

	//check for collision
	int min_o, min_p;
	float min_dst = collide(ray->org, dir, s, &min_o, &min_p);

	//return the color
	if (min_dst < MAX_DIST)
	{
		//according to bObj o and poly p and collision point c;
		//collision point = origin + distance * direction

		float3 hit;
		float3 n;		//local normal
		float2 uv;		//texture/uv coords
		float4 tc;		//texture color

		hit = ray->org + dir * min_dst;

		//find the UV coords and compute the new normal
		{
			float3 f1, f2, f3;
			float3 ab;
			float3 ac;
			float a, a1, a2, a3;

			//just shorthands to the 3 vertices making up the polygon
			//(plain private pointers into the __global scene: each work-item needs its own)
			const __global vert_t* v0;
			const __global vert_t* v1;
			const __global vert_t* v2;

			v0 = &s->v[s->o[min_o].vOffs + s->p[min_p].v.s0];
			v1 = &s->v[s->o[min_o].vOffs + s->p[min_p].v.s1];
			v2 = &s->v[s->o[min_o].vOffs + s->p[min_p].v.s2];

			// calculate vectors from hit point to vertices
			f1 = v0->v - hit;
			f2 = v1->v - hit;
			f3 = v2->v - hit;

			// calculate the areas and factors (order of parameters doesn't matter):
			ab = v0->v - v1->v;
			ac = v0->v - v2->v;

			//each weight is the area of the sub-triangle opposite to a vertex, relative to the whole triangle
			a  = length(cross(ab, ac));          	// main triangle area a
			a1 = length(cross(f2, f3)) / a;       	// weight of vertex 0
			a2 = length(cross(f3, f1)) / a;       	// weight of vertex 1
			a3 = length(cross(f1, f2)) / a;       	// weight of vertex 2

			// blend the vertex normals with those weights
			n = v0->n * a1 +
				v1->n * a2 +
				v2->n * a3;

			//same for UV
			uv =	v0->t * a1 +
					v1->t * a2 +
					v2->t * a3;
		}//	</ UV & normal >

		//patching invalid reflection normals with valid ones
		//it's ugly but it works better than nothing
		//no simpler heuristic has been found to work
		{
			//reflection as the blended normal would give it
			float3 realN = s->p[min_p].n;
			float3 virtRef = reflect(n, dir);
//			virtRef = dir - 2 * dot(n, dir)*n;

			//if that reflection has a positive dot product with the face normal,
			//the blended normal is rejected and the face normal is used instead
			if (dot(virtRef, realN) > 0)
			{
				n = realN;
			}
		}
		float3 ref = reflect(n, dir);


		//texture mapping REFLECTION (acts as an RGB mask)
		float4 refColor = getTexCol(s->o[min_o].tex[TEX_REF], uv, TEX_PARM);

		//roughly the average over the 3 channels of what the reflection would add to the pixel
		float totalContrib = dot(refColor.xyz*contrib, (float3)(.33f));

		if (totalContrib > MIN_CONTRIB &&		//is it worth computing reflection?
			rl->ray_cnt < RAY_MAX)				//can we?
		{
			//check for sharpness
			float sharpness = sqrtf(refColor.w);

			//amount of rays is relative to diffuseness AND total contrib
			int samples = totalContrib * STOC_SAMPLE * (1 - sharpness) + 1;
			//don't split uselessly
			if (totalContrib / samples < MIN_CONTRIB) samples = (totalContrib / MIN_CONTRIB);
			//make sure it's valid and fits in what is left of the ray list
			samples = clamp(samples, 1, STOC_SAMPLE);
			samples = min(samples, RAY_MAX - rl->ray_cnt);

			sharpness *= .5f;	//never 100% diffuse, max 50%
			sharpness += .5f;

			for (int i = 0; i < samples; i++)
			{
				float3 difRef;
				if (sharpness < .99f){
					float3 rn = randf3();

					if (dot(n, rn) < 0)rn *= -1;	//flip the random vector to the hemisphere n points to

					rn = normalize(rn);

					//blend between the random direction and the mirror direction
					difRef = mix(rn, ref, sharpness);
				}
				else
					difRef = ref;

				rl->ray[rl->ray_cnt].org = hit;
				rl->ray[rl->ray_cnt].dir = difRef;
				rl->ray[rl->ray_cnt].contrib = refColor.xyz*contrib / samples;	//divide the contribution
				rl->ray_cnt++;
			}
		}
		else
		{	//fake ref with some grey as default color
			color += refColor.xyz * .5f;
		}

		//texture mapping SELF_ILLUM
		tc = getTexCol(s->o[min_o].tex[TEX_LUM], uv, TEX_PARM);
		color += tc.xyz;

		//texture mapping COLOR
		float3 light = clamp(dot(n,dir),0.f,1.f);		//just normal facing ray or not. More like black fresnel effect actually
		tc = getTexCol(s->o[min_o].tex[TEX_COL], uv, TEX_PARM);
		color += tc.xyz*light * (1.f - refColor.xyz);	//remove the reflection part of the light, for physical accuracy

		//texture mapping TRANSPARENCY
		//TODO: launch a ray, or bunch of, and tint and mix accordingly
		// or we could instead say 50% transp, 50% chance the ray continues ... but then it's harder to tint and stuff
		//getTexCol(s->o[min_o].tex[TEX_TRS],uv,tc);

		return color*contrib;

	}else
	{
		//if no collision return sky gradient :)
		if ((dir).y>0){
			//looking up: blend from sky[2] (horizon) to sky[3] (straight up)
			float a = dir.y, b = 1 - dir.y;
			color = (float3)(	s->sky[3][0] * a + s->sky[2][0] * b,
								s->sky[3][1] * a + s->sky[2][1] * b,
								s->sky[3][2] * a + s->sky[2][2] * b);
		}
		else{
			//looking down (or exactly level): blend from sky[1] (horizon) to sky[0] (straight down)
			float a = -dir.y, b = 1 + dir.y;
			color = (float3)(	s->sky[0][0] * a + s->sky[1][0] * b,
								s->sky[0][1] * a + s->sky[1][1] * b,
								s->sky[0][2] * a + s->sky[1][2] * b);
			if (s->sky[2][1] != s->sky[1][1])	//ugly hack: if there's a floor, then we checkerboard it!
			{
				//NOTE(review): dir.y can be exactly 0 in this branch, which makes the divisions below non-finite
				float mask = clamp(sinf(dir.x / dir.y)*sinf(dir.z / dir.y)*32.f,-.25f,.5f) + .5f;
				color *= mask;
			}
		}
		return color*contrib;
	}
}

//Renders one pixel per work-item: global id 0 is the pixel column, global id 1 the pixel row.
//	scr			output image, SCR_W x SCR_H pixels of SCR_C bytes; only the first 3 bytes of each pixel are written
//	randomData	table of baked random values (picked up by srand when BAKED_RANDOM is defined)
//	randSeed	seed supplied by the caller, further scrambled per pixel
//	scene		the scene to render
__kernel void draw(	__global unsigned char* scr,
					__global float4* randomData,
					unsigned int randSeed,
					__global scene_t *scene,
					TEX_ARGS
					)
{
	const int	x = get_global_id(0),
				y = get_global_id(1);

	//work-items outside the image (e.g. a global size rounded up by the host) have no pixel to write
	if (x >= SCR_W || y >= SCR_H) return;

	float3 org = scene->cam[0];
	//pixel position relative to the image center, at a fixed depth of 100
	float3 dir = (float3)(x - (SCR_W / 2), -y + (SCR_H / 2), 100);
	//adapt to cam
	dir =	dir.x*scene->cam[1] +
			dir.y*scene->cam[2] +
			dir.z*scene->cam[3];
	float3 col = (float3)(0);

	//randomize with what we've got
	randSeed += (int)(sinf(sinf(y*1.1f)*15.f + sinf((float)x*.06f) + x) * 1024);
	srand(randSeed);

	//LOOP? could it be worth splitting at a higher level to benefit from more CUDA cores?
	//		actually no. right now we have 4000 * 9 pixels to render, giving 93 pixels per
	//		CUDA core on my GT 740m.
	//		Best hardware ATM is a GF Titan Xp and has 3840 CUDA cores. Which still gives
	//		9 pixels per core. It may seem low, but at 8x sampling, it gives 72 pixels.
	{
		//the list starts with the primary ray at full contribution
		rayList_t rl = {
			.ray[0] = { org, dir, (float3)(1.f) },
			.ray_cnt = 1,
			.ray_cur = 0,
			.s = scene
		};

		//traceRay may append rays to the list while it is being consumed
		while (rl.ray_cur < rl.ray_cnt)
		{
			col += traceRay(&rl, seed, TEX_PARM);
			rl.ray_cur++;
		}
	}
	col = clamp(col, 0.f, 1.f);
	col = sqrt(col);	//brightens the mid-tones before quantizing (acts like a gamma of 2)
	col*= 255;

	const int px = (y*SCR_W + x)*SCR_C;	//offset of this pixel's first byte
	scr[px + 0] = col.x;
	scr[px + 1] = col.y;
	scr[px + 2] = col.z;
}

