/+ Alpha blended blitting routine. +/

import std.stdio;

version = SDL;
version( SDL )
{
	import derelict.sdl.sdl;
}

// TODO: RGB32?
/// Pixel format identifiers.  They are passed as compile-time template 
///   arguments to blit and to the private mixin templates in this file, 
///   which use them in static if chains to select format-specific code.
enum : uint
{
	/// Zero value; Surface.RGBAformat defaults to this.
	INVALID = 0,
	/// blit treats this format as 32 bits per pixel.
	RGBA32,
	/// blit treats this format as 24 bits per pixel.
	RGB24,
	/// blit treats this format as 16 bits per pixel.
	RGB16_555,
	/// blit treats this format as 16 bits per pixel.
	RGB16_565,
	/// blit treats this format as 8 bits per pixel; each value is looked up
	///   in a table when read.
	RGBA8_I32, // indexed to 32 bit values
	/// blit treats this format as 8 bits per pixel; readSource loads each 
	///   value into alpha rather than into the colour.
	A8,
}

/// Mixin template that loads one source pixel for the source format RGBA.
///
/// Every "statement" here is written as the initializer of a throwaway
///   readS_dummyN variable, so that the assignments can live inside a 
///   template body and still execute where the template is mixed in.
///
/// The following names are not declared here and must exist in the scope 
///   the template is mixed into: source, si, srgb, alpha, sourceAMask and 
///   (for RGBA8_I32 only) rgbaTable.  
/// NOTE(review): rgbaTable is not declared anywhere in the visible file - 
///   confirm where it is expected to come from.
private template readSource( uint RGBA )
{
	static if ( RGBA == RGBA32 )
	{
		// Load the pixel, then split it: alpha keeps only the bits selected
		//   by sourceAMask (it is NOT shifted down), srgb keeps the rest.
		uint readS_dummy1 = srgb = source[si];
		uint readS_dummy2 = alpha = srgb & sourceAMask;
		uint readS_dummy3 = srgb = srgb & ~sourceAMask;
	}
	else static if ( RGBA == RGB24 )
	{
		// There is no such thing as an array with 24-bit elements, so we have
		//   to use pointers.
		// This reads a full uint starting at source + si; alpha is left
		//   unchanged.
		uint readS_dummy1 = srgb = *(cast(uint*)(source + si));
	}
	else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
	{
		// cast(uint) is not necessary in all cases, only if dest is 32 bpp
		// alpha is left unchanged.
		uint readS_dummy1 = srgb = cast(uint)source[si];
	}
	else static if ( RGBA == RGBA8_I32 )
	{
		// The source value is an index; the table entry is then split into
		//   alpha and colour exactly like the RGBA32 case.
		uint readS_dummy1 = srgb = rgbaTable[source[si]];
		uint readS_dummy2 = alpha = srgb & sourceAMask;
		uint readS_dummy3 = srgb = srgb & ~sourceAMask;
	}
	else static if ( RGBA == A8 )
	{
		// Only alpha is read from the source; srgb is left unchanged.
		uint readS_dummy1 = alpha = cast(uint)source[si];
	}
	else
	{
		// Any other format is rejected at compile time.
		pragma(msg,"Invalid source RGBA format for reading.");
		static assert(0);
	}
}

/// Mixin template that loads one destination pixel into drgb for the 
///   destination format RGBA.
///
/// Every "statement" here is written as the initializer of a throwaway
///   readD_dummyN variable, so that the assignments can live inside a 
///   template body and still execute where the template is mixed in.
///
/// For RGB24, and for 16 bpp formats when half16bpp selects one half of a
///   word, the value that was read is also kept in drgbOriginal so that the
///   write template can restore the bits that must not change.
///
/// The following names are not declared here and must exist in the scope 
///   the template is mixed into: dest, di, drgb and (for RGBA8_I32 only) 
///   rgbaTable.  
/// NOTE(review): rgbaTable is not declared anywhere in the visible file - 
///   confirm where it is expected to come from.
private template readDestination( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
{
	static if ( RGBA == RGBA32 )
	{
		uint readD_dummy1 = drgb = dest[di];
	}
	else static if ( RGBA == RGB24 )
	{
		// There is no such thing as an array with 24-bit elements, so we have
		//   to use pointers.  
		uint readD_dummy1 = drgb = *(cast(uint*)(dest + di));
		
		// Since we can't write 24 bits, we can either write 3 bytes (slow),
		//   or we can overwrite 8 bits of the next pixel.  The latter is 
		//   faster and can be done safely if we overwrite those 8 bits with 
		//   their previous contents.  
		uint drgbOriginal = drgb;
	}
	else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
	{
		uint readD_dummy1 = drgb = dest[di];
		
		static if ( half16bpp == LOW_ADDRESS_HALF || 
				    half16bpp == HIGH_ADDRESS_HALF  )
		{
			// Store the original values of both pixels being read.
			// When reading and writing 2 pixels at a time, it is impossible
			//   to prevent overwriting a pixel that we don't want to.  At 
			//   least not without some rather complicated code.  So instead, 
			//   we just make sure that the pixel we don't want to overwrite 
			//   is overwritten with its original value.  The original value 
			//   is stored here.  
			// This used to read from "destReadResult", a name that is not
			//   declared anywhere; the value just read lives in drgb, which
			//   is also what the RGB24 branch above saves.
			uint drgbOriginal = drgb;
		}
	}
	else static if ( RGBA == RGBA8_I32 )
	{
		// The destination value is an index into a table of 32 bit values.
		uint readD_dummy1 = drgb = rgbaTable[dest[di]];
	}
	else
	{
		// Any other format (including A8) is rejected at compile time.
		pragma(msg,"Invalid destination RGBA format for reading.");
		static assert(0);
	}
}

/// Mixin template that performs the read step for one pixel: first the 
///   source pixel (readSource), then the destination pixel 
///   (readDestination).  half16bpp is only forwarded to the destination 
///   read.
private template read( uint sourceRGBA, uint destRGBA, 
                       ubyte half16bpp = NOT_APPLICABLE )
{
	mixin readSource!( sourceRGBA );
	mixin readDestination!( destRGBA, half16bpp );
}

/// Mixin template that converts the source colour held in srgb from the
///   layout of sourceRGBA to the layout of destRGBA, in place.
///
/// Supported combinations:
///   RGBA32 / RGB24 / RGBA8_I32  ->  RGBA32 / RGB24   (nothing to do)
///   RGBA32 / RGB24 / RGBA8_I32  ->  RGB16_565        (shrink)
///   RGB16_565                   ->  RGBA32 / RGB24   (expand)
///   RGB16_565                   ->  RGB16_565        (nothing to do)
///   A8                          ->  anything         (nothing to do)
/// Every other combination fails a static assert.
///
/// srgb is not declared here; it must exist in the scope the template is
///   mixed into and must already hold the pixel loaded by readSource.
///
/// Changes from the previous revision:
///   - the RGB16_565 destination test is now chained with "else", so a 
///     32 or 24 bit destination no longer falls through to static assert(0);
///   - the conversions read from srgb instead of "sourceReadResult", a name
///     that is not declared anywhere.
private template convert( uint sourceRGBA, uint destRGBA )
{
	static if ( sourceRGBA == RGBA32 || sourceRGBA == RGB24 || sourceRGBA == RGBA8_I32 )
	{
		static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
		{
			// do nothing: srgb is already in the destination's layout
		}
		else static if ( destRGBA == RGB16_565 )
		{
			// Here we must shrink a 32 bit pixel from the source into a
			//   16 bit pixel.
			// In this situation we write the 16 bit resultant pixels one at
			//   a time so the extra 16 bits will be safely discarded.
			uint convert_dummy1 = 
			srgb = ((0xf800 & (srgb >> 8 )) +
			        (0x07e0 & (srgb >> 5 )) +
			        (0x001f & (srgb >> 3 )));
		}
		else static assert(0);
	}
	else static if ( sourceRGBA == RGB16_565 )
	{
		static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
		{
			// Here we must expand a 16 bit pixel from the source into a
			//   32 bit pixel.
			// In this situation we read the 16 bit pixels one at a time
			//   so the extra 16 bits can be safely discarded.
			uint convert_dummy1 = 
			srgb = (((srgb & 0xf800) << 8 ) +
			        ((srgb & 0x07e0) << 5 ) +
			        ((srgb & 0x001f) << 3 ));
		}
		else static if ( destRGBA == RGB16_565 )
		{
			// do nothing: source and destination share the same layout
		}
		else static assert(0);
	}
	else static if ( sourceRGBA == A8 )
	{
		// do nothing: an A8 source only supplies alpha, the colour in srgb
		//   is set up by the caller
	}
	else static assert(0);
}

/// Mixin template that alpha blends srgb onto drgb, leaving the result in
///   drgb.  RGBA is the destination format.
///
/// Each channel is computed as  dest + ((source - dest) * alpha >> shift),
///   with several channels processed at once inside one 32 bit word by 
///   splitting them into an "even" and an "odd" set with bit masks.
///
/// Every run-time "statement" is written as the initializer of a throwaway
///   blend_dummyN variable so that it can live inside a template body.
///
/// The following names are not declared here and must exist in the scope 
///   the template is mixed into: srgb, drgb, alpha, sourceAMask, destbpp 
///   and sourceRGBA.
private template blend( uint RGBA )
{
	// Note that this will get it right regardless of which color is in which 
	//   channel.  Of course, the channels' placements must be correct.  
	// It also preserves the destination's alpha channel, if present.  
	static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 || 
	                 RGBA == RGB16_565 || RGBA == RGB16_555 )
	{
		static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 )
		{
			// 8 bit channels: alpha is divided out with a shift of 8, and
			//   the even set is every other byte of the word.
			const shift = 8;
			const evenMask = 0x00ff00ff;
		}
		else
		{
			// For 16bpp formats:
			// alpha must be a 5 bit value (the 3 hi bits MUST be clear)
			// this does 2 16bit pixels at a time in one 32 bit word.  
			// endianness doesn't matter on 565 formats due to symmetry
			// TODO: take into account endianness on 555 formats
			//        (probably only noticeable on big endian machines)
			const shift = 5;
			const evenMask = 0x07e0f81f;
		}
		const oddMask = ~evenMask;
		
		// Remember the destination's alpha bits so they can be restored
		//   after blending.  Note that the mask used is sourceAMask.
		static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 )
			uint originalDestAlpha = drgb & sourceAMask;
		
		// Reduce alpha from 8 bits to 5 bits for 16 bpp destinations.
		// NOTE(review): blit also executes "alpha >>= 3" once for 16 bpp 
		//   destinations, and this line runs every time the template is
		//   mixed in - confirm alpha is not being shifted more than once
		//   for source formats whose read step does not reload alpha.
		static if ( destbpp == 16 )
			uint blend_dummy1 = alpha = alpha >> 3;
		
		
		static if ( destbpp == 16 && sourceRGBA == A8 )
		{
			// Single 16 bit pixel case.
			// Duplicate the pixel into both halves of a 32 bit word and mask 
			//   with evenMask: the low half keeps the bits under 0xf81f and 
			//   the high half keeps the bits under 0x07e0 (the middle 
			//   channel).  This leaves at least 5 clear bits above every 
			//   channel to hold the multiplication overflow.
			uint sourceChannels = ((srgb << 16) | srgb) & evenMask;
			uint destChannels =   ((drgb << 16) | drgb) & evenMask;
			
			// do the blending
			uint blend_temp =
				(((sourceChannels - destChannels) * alpha) >> shift) + destChannels;
			
			// Now we move the middle channel from the high 16 bits, back into its 
			//   rightful place in the middle.  
			uint blend_dummy2 = 
			drgb = (blend_temp & (evenMask & 0x0000ffff)) | 
				  ((blend_temp & (evenMask & 0xffff0000)) >> 16 );
		}
		else
		{
			// Whole-word case.  The even channels are multiplied first and 
			//   shifted down afterwards; the odd channels are shifted down 
			//   first and multiplied afterwards, so that in both cases the 
			//   intermediate products fall into the gaps left by the mask.
			uint blend_dummy2 =
			drgb = 
				((((((srgb & evenMask)-(drgb & evenMask))  * alpha) >> shift) + drgb) & evenMask) |
				((((((srgb & oddMask )-(drgb & oddMask )) >> shift)  * alpha) + drgb) & oddMask);
		}
		
		static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) // preserve alpha
			uint blend_dummy3 = drgb = (drgb & ~sourceAMask) | originalDestAlpha;
	}
	else
	{
		// Any other format (including A8) is rejected at compile time.
		pragma(msg,"Invalid RGBA format for alpha blending.");
		static assert(0);
	}
}


/// Mixin template that stores the blended pixel drgb into the destination
///   and then advances both indices to the next pixel.  RGBA is the 
///   destination format.
///
/// Every run-time "statement" is written as the initializer of a throwaway
///   write_dummyN variable so that it can live inside a template body.
///
/// The following names are not declared here and must exist in the scope 
///   the template is mixed into: dest, di, si, drgb, sourceIncrement, 
///   destIncrement and, where a partial write is done, drgbOriginal.
///
/// Change from the previous revision: the compile-time message for an 
///   unsupported format used to say "for alpha blending" (copied from the
///   blend template); it now names the write step.
private template write( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
{
	
	static if ( RGBA == RGBA32 )
	{
		uint write_dummy1 = dest[di] = drgb;
	}
	else static if ( RGBA == RGB24 )
	{
		// A whole uint is written, so one byte belonging to the neighbouring
		//   pixel is put back from drgbOriginal.  Which byte that is depends
		//   on the byte order.
		uint* address = cast(uint*)(dest + di);
		
		version ( BigEndian )
			uint write_dummy1 = *address = (drgb & 0xffffff00) | (drgbOriginal & 0x000000ff);
		else
			uint write_dummy1 = *address = (drgb & 0x00ffffff) | (drgbOriginal & 0xff000000);
	}
	else static if ( RGBA == RGB16_565 || RGBA == RGB16_555 )
	{
		// for selecting the lowest or highest pixel in terms of 
		//   address in memory rather than place in the word/register
		version ( BigEndian )
			const writeMask = 0x0000ffff;
		else
			const writeMask = 0xffff0000;
		
		// When only one half of the word is to be changed, the other half 
		//   is restored from drgbOriginal.
		static if ( half16bpp == HIGH_ADDRESS_HALF )
			uint write_dummy1 = dest[di] = (drgb & writeMask) | (drgbOriginal & ~writeMask);
		else static if ( half16bpp == LOW_ADDRESS_HALF )
			uint write_dummy1 = dest[di] = (drgb & ~writeMask) | (drgbOriginal & writeMask);
		else
			uint write_dummy1 = dest[di] = drgb;
	}
	// TODO:  writing RGBA8_I32.  needs an algo to reverse a 32 bpp value into
	//          an 8 bit indexed value.  
	else
	{
		pragma(msg,"Invalid destination RGBA format for writing.");
		static assert(0);
	}
	
	// Step to the next pixel in both surfaces.
	uint write_dummy2 = si = si + sourceIncrement;
	uint write_dummy3 = di = di + destIncrement;
}

/// Values for the half16bpp template parameter of the read/write templates.
/// It selects which 16 bit half of a 32 bit destination word is to be 
///   modified, in terms of address in memory.
private enum : ubyte
{
	/// Default: no half is singled out.
	NOT_APPLICABLE = 0,
	/// Only the pixel at the lower address is modified.
	LOW_ADDRESS_HALF,
	/// Only the pixel at the higher address is modified.
	HIGH_ADDRESS_HALF,
}

/// Mixin template holding the complete per-pixel pipeline: 
///   read -> convert -> blend -> write.
/// half16bpp may only be something other than NOT_APPLICABLE when the 
///   destination is one of the 16 bits per pixel formats; anything else is
///   rejected at compile time.
private template innerLoop( uint sourceRGBA, uint destRGBA, ubyte half16bpp = NOT_APPLICABLE )
{
	// A half was requested, but the destination is neither 16 bpp format.
	static if ( half16bpp != NOT_APPLICABLE && 
	            destRGBA != RGB16_565 && destRGBA != RGB16_555 )
	{
		pragma(msg,"The half16bpp argument is only to be used when the destination format is 16 bits per pixel.");
		static assert(0);
	}
	
	// The order of these four steps matters: each one consumes the values
	//   left behind by the previous one.
	mixin read!( sourceRGBA, destRGBA, half16bpp );
	mixin convert!( sourceRGBA, destRGBA );
	mixin blend!( destRGBA );
	mixin write!( destRGBA, half16bpp );
}

/// Mixin template that, for either the source surface (isSource == true) or
///   the destination surface (isSource == false), declares
///     - the scanline padding, exposed as spadding or dpadding, and
///     - a suitably typed view of the pixel data, exposed as source or dest.
///
/// The following names are not declared here and must exist in the scope 
///   the template is mixed into: srcSurface, dstSurface, sourcebpp and 
///   destbpp.
private template calculatePaddingAndArrays( bool isSource )
{

	// Pick the surface this instantiation describes; "otherbpp" is the bit
	//   depth of the opposite surface.
	static if ( isSource )
	{
		alias srcSurface surface;
		alias sourcebpp bpp;
		alias destbpp otherbpp;
	}
	else
	{
		alias dstSurface surface;
		alias destbpp bpp;
		alias sourcebpp otherbpp;
	}
	
	// Padding is the amount of extra data at the end of a scanline used to
	//   ensure that the end of the scanline lines up on a 32 bit boundary.
	// spadding = source padding
	// dpadding = dest padding
	// The unit that padding is measured in changes depending on the source 
	//   and destination formats.  
	// The amount of data that is handled in each iteration also changes,
	//   and is reflected by the different types of arrays.  
	
	static if ( bpp == 32 )
	{
		// one uint per pixel; no padding is accounted for
		auto padding = 0;
		uint[] pixelData = cast(uint[])surface.pixels;
	}
	else static if ( bpp == 24 )
	{
		// padding measured in bytes
		// a raw byte pointer is used because pixels are 3 bytes wide
		auto padding = surface.pitch - (surface.width * 3);
		ubyte* pixelData = surface.pixels.ptr;
	}
	else static if ( bpp == 16 )
	{
		static if ( otherbpp != 16 /+otherbpp == 32 || otherbpp == 24 || otherbpp == 8+/ )
		{
			// padding measured in shorts
			// one ushort per pixel
			auto padding = (surface.pitch >> 1) - surface.width;
			ushort[] pixelData = cast(ushort[])surface.pixels;
		}
		else
		{
			// both surfaces are 16 bpp: the data is viewed as uints, i.e. 
			//   two pixels per element, and no padding is accounted for
			auto padding = 0;
			uint[] pixelData = cast(uint[])surface.pixels;
		}
	}
	else static if ( bpp == 8 )
	{
		auto padding = surface.pitch - surface.width; // padding measured in bytes
		ubyte[] pixelData = surface.pixels;
	}
	else
		static assert(0);
	
	// Publish the results under the names the rest of the code uses.
	static if ( isSource )
	{
		alias padding spadding;
		alias pixelData source;
	}
	else
	{
		alias padding dpadding;
		alias pixelData dest;
	}
}

/**
 * Alpha blends a width x height rectangle of srcSurface, starting at 
 *   (sourceX, sourceY), onto dstSurface at (destX, destY).
 *
 * This function shall do no clipping: the caller must make sure that the 
 *   rectangle lies inside both surfaces.  
 *
 * Template parameters:
 *   sourceRGBA = pixel format of srcSurface (one of the format constants)
 *   destRGBA   = pixel format of dstSurface (one of the format constants)
 *
 * Parameters:
 *   sourceX, sourceY = top left corner of the rectangle in the source
 *   destX, destY     = top left corner of the rectangle in the destination
 *   width, height    = size of the rectangle, in pixels
 *   srcSurface       = surface to read from
 *   dstSurface       = surface to blend onto
 *   srcColor         = colour used as the source colour when sourceRGBA is 
 *                      A8 (the source then only supplies alpha)
 *   alpha            = blend factor for source formats whose read step does
 *                      not produce an alpha value of its own; it is shifted
 *                      right by 3 for 16 bpp destinations
 *                      (presumably 0..255 - TODO confirm)
 *
 * With version SDL, surfaces that wrap an SDL_Surface are locked for the 
 *   duration of the call; a failed lock throws an Exception.
 *
 * Changes from the previous revision:
 *   - the destination lock now uses dstSurface.sdl_surface (it used to look
 *     at srcSurface a second time, so the destination was never locked);
 *   - the unlocking scope(exit) blocks are registered at function level.  
 *     They used to sit inside the "if" blocks, where they fired as soon as
 *     the "if" ended, i.e. before any pixel was touched;
 *   - the starting indices and the end index take the scanline padding into
 *     account, matching the per-line advance done in the loop.  Surfaces 
 *     without padding behave exactly as before;
 *   - the unused debugging helper writeHex has been removed.
 */
void blit( uint sourceRGBA, uint destRGBA )
		( short sourceX, short sourceY, 
		short destX, short destY, short width, short height, 
		inout Surface srcSurface, inout Surface dstSurface, 
		uint srcColor, uint alpha )
{
	// this stuff just determines the bits per pixel of the source and 
	//   destination surfaces
	static if ( sourceRGBA == RGBA32 )
		const sourcebpp = 32;
	else static if ( sourceRGBA == RGB24 )
		const sourcebpp = 24;
	else static if ( sourceRGBA == RGB16_565 || sourceRGBA == RGB16_555 )
		const sourcebpp = 16;
	else
		const sourcebpp = 8;
	
	static if ( destRGBA == RGBA32 )
		const destbpp = 32;
	else static if ( destRGBA == RGB24 )
		const destbpp = 24;
	else static if ( destRGBA == RGB16_565 || destRGBA == RGB16_555 )
		const destbpp = 16;
	else
		const destbpp = 8;
	//
	
	static if ( (sourcebpp == 32 || sourcebpp == 24) && destbpp == 16 )
		const convert32to16 = true;
	else
		const convert32to16 = false;
	
	static if ( sourcebpp == 16 && (destbpp == 32 || destbpp == 24) )
		const convert16to32 = true;
	else
		const convert16to32 = false;
	
	// duplicate the colour into both 16 bit halves of the word
	static if ( (destRGBA == RGB16_565 || destRGBA == RGB16_555) && sourceRGBA != A8 )
		srcColor |= (srcColor << 16);
	
	// NOTE(review): the blend template also shifts alpha right by 3 for 
	//   16 bpp destinations - confirm the shift is not meant to happen in 
	//   one place only.
	static if ( destbpp == 16 )
		alpha >>= 3;
	
	// note that the padding quantities are necessarily zero if
	//   unitWidth = width / 2;
	//   that's important because they have different units of measurement!
	
	mixin calculatePaddingAndArrays!( true );
	mixin calculatePaddingAndArrays!( false );
	
	static if ( destbpp == 24 )
	{
		uint lineWidth = width * 3;
		
		static if ( sourcebpp == 24 )
		{
			// same as: unitSrcSurfaceWidth = srcSurface.width * 3;
			uint unitSrcSurfaceWidth = srcSurface.pitch - spadding;
			uint unitSrcWidth = lineWidth;
		}
		else
		{
			uint unitSrcSurfaceWidth = srcSurface.width;
			uint unitSrcWidth = width;
		}
		
		uint unitDstSurfaceWidth = dstSurface.pitch - dpadding;
		
		uint unitDstWidth = lineWidth;
	}
	else static if ( sourcebpp == 16 && destbpp == 16 )
	{
		uint lineWidth = width / 2; // because we do 2 pixels at a time
		
		// The +(width & 1) part is used to make the division round up.  
		uint unitSrcSurfaceWidth = (srcSurface.width / 2) + (srcSurface.width & 1);
		uint unitDstSurfaceWidth = (dstSurface.width / 2) + (dstSurface.width & 1);
		
		// The lineWidth variable rounds down on division, so it may be
		//   missing a pixel.  That is desirable since we don't want alphablend
		//   onto the pixel next to the missing pixel.  Of course, we will 
		//   handle the missing pixel individually, but it is still useful to 
		//   have access to a rounded-up version of the blit's width.  
		uint unitSrcWidth = lineWidth + (width & 1);
		uint unitDstWidth = unitSrcWidth;
	}
	else
	{
		uint lineWidth = width;
		uint unitSrcWidth = width;
		uint unitDstWidth = width;
		uint unitSrcSurfaceWidth = srcSurface.width;
		uint unitDstSurfaceWidth = dstSurface.width;
	}
	
	uint sourceAMask = srcSurface.alphaMask;
	
	version( SDL )
	{
		// The scope(exit) blocks must be registered here, at function level.
		//   Placed inside the "if" they would run when the "if" block ends
		//   and the surfaces would be unlocked before the blit starts.
		auto sourceSdlSurface = srcSurface.sdl_surface;
		bool srcLocked = false;
		if ( sourceSdlSurface !is null )
			srcLocked = lock( sourceSdlSurface );
		scope(exit)
		{
			if ( srcLocked )
				SDL_UnlockSurface( sourceSdlSurface );
		}
		
		// lock() returns false for a surface that is already locked, so 
		//   blitting a surface onto itself locks and unlocks it only once.
		auto destSdlSurface = dstSurface.sdl_surface;
		bool dstLocked = false;
		if ( destSdlSurface !is null )
			dstLocked = lock( destSdlSurface );
		scope(exit)
		{
			if ( dstLocked )
				SDL_UnlockSurface( destSdlSurface );
		}
	}
	
	static if ( sourcebpp == 24 )
		uint sourceIncrement = 3;
	else
		uint sourceIncrement = 1;
	
	static if ( destbpp == 24 )
		uint destIncrement = 3;
	else
		uint destIncrement = 1;
	
	// The distance from the start of one scanline to the start of the next,
	//   in index units: the visible part of the line plus its padding.  
	int sStride = unitSrcSurfaceWidth + spadding;
	int dStride = unitDstSurfaceWidth + dpadding;
	
	// Since we are not necessarily blitting across the entire width of the 
	//   destination surface or source surface, we have to skip some of the
	//   pixels on the end of the current scanline and on the beginning of
	//   the next scanline.  
	// Add that to the padding (which is explained above), and the result
	//   is these source/dest LineExtra variables.  
	int sLineExtra = sStride - unitSrcWidth;
	int dLineExtra = dStride - unitDstWidth;
	
	// initialize the index variables
	// si = source index
	// di = destination index
	// Whole scanlines are skipped using the stride, so that the padding of
	//   the lines above the rectangle is skipped as well.
	int si = (sourceX * sourceIncrement) + (sStride * sourceY);
	int di = (destX   * destIncrement)   + (dStride * destY);
	
	// nextLine is always ahead of di by the amount of pixels left in one line
	//   of the blit.  
	int nextLine;
	
	// endi is the index to stop at: the start index plus one stride for 
	//   every line of the blit.  
	int endi = di + (dStride * height);
	
	assert( lineWidth + dLineExtra == unitDstSurfaceWidth + dpadding );
	
	uint srgb;
	uint drgb;
	
	// An A8 source only supplies alpha; the colour is fixed for the whole
	//   blit.
	static if ( sourceRGBA == A8 )
		srgb = srcColor;
	
	while( di < endi )
	{
		nextLine = di + lineWidth;
		
		static if ( convert32to16 || convert16to32 )
			mixin innerLoop!(sourceRGBA,destRGBA,HIGH_ADDRESS_HALF);
		
		while( di < nextLine )
		{
			mixin innerLoop!(sourceRGBA,destRGBA);
		}
		
		static if ( convert32to16 || convert16to32 )
			mixin innerLoop!(sourceRGBA,destRGBA,LOW_ADDRESS_HALF);
		
		// jump to the start of the rectangle on the next scanline
		si += sLineExtra;
		di += dLineExtra;
	}
}

version( SDL )
{
	
	/// Locks surface if SDL requires it to be locked and it is not locked
	///   already.
	/// Returns: true if this call locked the surface (the caller is then 
	///   responsible for unlocking it), false if nothing was done.
	/// Throws: Exception if SDL fails to lock the surface.
	private bool lock( SDL_Surface* surface )
	{
		// Nothing to do for surfaces that need no lock or already hold one.
		if ( !SDL_MUSTLOCK( surface ) || surface.locked )
			return false;
		
		safe_SDL_LockSurface( surface );
		return true;
	}
	
	/// Wrapper around SDL_LockSurface that turns a failure (non-zero return
	///   value) into an Exception carrying the text from SDL_GetError.
	private void safe_SDL_LockSurface( SDL_Surface* surface )
	{
		if ( SDL_LockSurface( surface ) == 0 )
			return; // locked successfully
		
		// SDL hands back a zero terminated C string; slice it up to the 
		//   terminator so it can be appended to the message.
		char* sdlMessage = SDL_GetError();
		auto messageLength = std.c.string.strlen(sdlMessage);
		throw new Exception( "SDL_LockSurface failed to lock a surface: "
		                     ~ sdlMessage[0..messageLength] );
	}
}

/// Describes a block of pixel memory for blit to read from or write to.
/// The fields default to recognisably invalid values (0xffff dimensions, 
///   INVALID format) until one of the opCall factories fills them in.
struct Surface
{
	ubyte[] pixels;              /// the raw pixel memory
	uint alphaMask = 0;          /// mask selecting the alpha bits of a pixel
	ushort width = 0xffff;       /// width in pixels
	ushort height = 0xffff;      /// height in pixels
	ushort pitch = 0xffff;       /// width of a scanline in bytes.  
	ushort RGBAformat = INVALID; /// one of the pixel format constants
	
	/// Builds a Surface over caller supplied pixel memory, which must not
	///   be null.  width and height are in pixels, pitch is in bytes.
	static Surface opCall( ubyte[] pixels, uint alphaMask,
	                       ushort width, ushort height, ushort pitch,
	                       ushort RGBAformat )
	{
		assert( pixels !is null );
		
		Surface surf;
		surf.pixels     = pixels;
		surf.alphaMask  = alphaMask;
		surf.width      = width;
		surf.height     = height;
		surf.pitch      = pitch;
		surf.RGBAformat = RGBAformat;
		return surf;
	}
	
	version ( SDL )
	{
		/// The SDL surface this Surface was built from, or null.
		SDL_Surface* sdl_surface = null;
		
		/// Builds a Surface that refers to the pixel memory of an SDL 
		///   surface.  The dimensions, pitch and alpha mask are taken from
		///   the SDL surface; RGBAformat must not be INVALID.
		static Surface opCall( SDL_Surface* surface, ushort RGBAformat )
		{
			Surface surf;
			
			// The pixel memory spans pitch bytes for each of the h lines.
			auto byteCount = surface.pitch * surface.h;
			surf.pixels = cast(ubyte[])surface.pixels[0.. byteCount];
			surf.pitch  = surface.pitch;
			surf.width  = surface.w;
			surf.height = surface.h;
			
			assert ( RGBAformat != INVALID );
			surf.RGBAformat  = RGBAformat;
			surf.alphaMask   = surface.format.Amask;
			surf.sdl_surface = surface;
			return surf;
		}
	}
}