/** 
 * Identify the characteristics of the host CPU.
* 
  This code relies on information found in:
	
  - "AMD CPUID Specification", Advanced Micro Devices, revision 2.28 (April 2008)
  - "Intel(R) 64 and IA-32 Architectures Software Developers Manual,
	  Volume 2A: Instruction Set Reference, A-M" (Nov 2007)
  - http://www.sandpile.org/ia32/cpuid.htm
  - http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html,
  - "What every programmer should know about memory", Ulrich Drepper, Red Hat, Inc. 
     (Nov 21, 2007). 

Example:
---
import std.cpuid;
import std.stdio;

void main()
{
    writefln(std.cpuid.toString());
}
---     
AUTHORS:  Don Clugston,
          Tomas Lindquist Olsen &lt;tomas@famolsen.dk&gt;
		(slightly altered by Walter Bright)
COPYRIGHT:	Public Domain

BUGS:	Currently only works on x86 CPUs
Macros:
	WIKI = Phobos/StdCpuid
	COPYRIGHT = Public Domain
*/

public:

/// Cache size and behaviour
struct CacheInfo
{
    /// Size of the cache, in kilobytes, per CPU.
    /// For L1 unified (data + code) caches, this size is half the physical size.
    /// (We don't halve it for larger sizes, since normally
    /// data size >> code size for critical loops).
	uint size;
    /// Number of ways of associativity, eg:
    /// 1 = direct mapped
    /// 2 = 2-way set associative
    /// 3 = 3-way set associative
    /// ubyte.max = fully associative
	ubyte associativity;
    /// Number of bytes read into the cache when a cache miss occurs.
	uint lineSize;
}


public:
	/// The CPU vendor identification string. Intended for display only,
	/// because some CPUs allow the vendor ID to be reprogrammed.
	char[] vendor()
	{
		return vendorID[];
	}

	/// The processor name string. Intended for display only.
	char[] processor()
	{
		return processorName;
	}

	/// Data cache descriptions, one entry per cache level (index 0 = L1).
	/// When the CPU has fewer than 5 physical cache levels, each remaining
	/// entry is given a size covering the entire memory space.
	CacheInfo[5] datacache;

	/// Processor type (vendor-dependent)
	uint stepping;
	/// ditto
	uint model;
	/// ditto
	uint family;


    /// Does the CPU report MMX support?
    bool mmx()
    {
        return (features & MMX_BIT) == MMX_BIT;
    }

    /// Does the CPU report SSE support?
    bool sse()
    {
        return (features & SSE_BIT) == SSE_BIT;
    }

    /// Does the CPU report SSE2 support?
    bool sse2()
    {
        return (features & SSE2_BIT) == SSE2_BIT;
    }

    /// Does the CPU report SSE3 support?
    bool sse3()
    {
        return (miscfeatures & SSE3_BIT) == SSE3_BIT;
    }

    /// Does the CPU report SSSE3 support?
    bool ssse3()
    {
        return (miscfeatures & SSSE3_BIT) == SSSE3_BIT;
    }

    /// Does the CPU report AMD 3DNow! support?
    bool amd3dnow()
    {
        return (amdfeatures & AMD_3DNOW_BIT) == AMD_3DNOW_BIT;
    }

    /// Does the CPU report AMD 3DNow! Ext support?
    bool amd3dnowExt()
    {
        return (amdfeatures & AMD_3DNOW_EXT_BIT) == AMD_3DNOW_EXT_BIT;
    }

    /// Does the CPU report the AMD extensions to MMX?
    bool amdMmx()
    {
        return (amdfeatures & AMD_MMX_BIT) == AMD_MMX_BIT;
    }

    /// Does the CPU report fxsave/fxrstor support?
    bool fxsr()
    {
        return (features & FXSR_BIT) == FXSR_BIT;
    }

    /// Is this an Intel64 or AMD 64?
    ///
    /// Only AMD64_BIT of the extended feature word (amdfeatures) is tested;
    /// that word is read for every vendor, not just AMD. IA64_BIT of the
    /// standard feature word is deliberately not consulted: it denotes an
    /// IA-64 (Itanium) processor emulating x86, which is not an x86-64 CPU.
    bool isX86_64()			{return (amdfeatures&AMD64_BIT)!=0;}

    /// Is hyperthreading supported? True when the CPU reports more
    /// threads than cores.
    bool hyperThreading()
    {
        return maxCores < maxThreads;
    }

    /// Number of threads per CPU.
    uint threadsPerCPU()
    {
        return maxThreads;
    }

    /// Number of cores in the CPU.
    uint coresPerCPU()
    {
        return maxCores;
    }
    
    /// Optimisation hints for assembly code.
    ///
    /// To stay forward compatible, the CPU is classified relative to a
    /// baseline microarchitecture rather than identified exactly. On 32-bit
    /// X86 the baseline is the Intel PPro/PII/PIII/PM family.
    ///
    /// The major 32-bit x86 microarchitecture 'dynasties' have been:
    /// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2).
    /// (2) AMD Athlon (K7, K8, K10).
    /// (3) Intel NetBurst (Pentium 4, PentiumD).
    /// (4) Intel Pentium1, PMMX.
    /// (5) Other (Nx586, AMD K5 & K6, Centaur, Cyrix, Transmeta, etc)
    ///
    /// Optimisation techniques are largely identical within a dynasty
    /// (eg, use instruction pairing for group 4), while major instruction
    /// set improvements occur within each group.

    /// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code?
    bool preferAthlon()
    {
        if (probablyIntel)
            return false;
        return true;
    }

    /// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code?
    bool preferPentium4()
    {
        if (!probablyIntel)
            return false;
        return family == 0xF;
    }

    /// Does this CPU perform better on Pentium I code than Pentium Pro code?
    bool preferPentium1()
    {
        return 6 > family;
    }

private:
	
	// Module-level state describing the host CPU. The x86 values are
	// filled in by cpuidX86(); the initialisers below are the defaults
	// used when nothing was detected.
	bool probablyIntel; // true = _probably_ an Intel processor
	// 12-character vendor string as returned by CPUID leaf 0.
	char [12] vendorID;
	// Processor name: either a slice of processorNameBuffer or a literal.
	char [] processorName;
	// Raw 48-byte brand string read from leaves 0x8000_0002 .. 0x8000_0004.
	char [48] processorNameBuffer;
	// Number of leading datacache[] entries that describe real caches.
	uint numCacheLevels = 1;
	uint features = 0;     // mmx, sse, sse2, hyperthreading, etc
	uint miscfeatures = 0; // popcnt, sse3, etc.
	uint amdfeatures = 0;  // 3dnow!, mmxext, etc
	uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc
	// Values reported by coresPerCPU() and threadsPerCPU().
	uint maxCores = 1;
	uint maxThreads = 1;
	// Raw state of the HTT feature flag. A set flag may indicate a
	// multi-core CPU rather than hyperthreading.
	bool hyperThreadingBit()
	{
		return (features & HTT_BIT) == HTT_BIT;
	}
    
    // feature flags
    // Bit masks for `features` (EDX returned by CPUID leaf 1).
    enum : uint
    {
	    MMX_BIT = 1<<23,
	    FXSR_BIT = 1<<24,
	    SSE_BIT = 1<<25,
	    SSE2_BIT = 1<<26,
	    HTT_BIT = 1<<28,
	    IA64_BIT = 1<<30
    }
    // feature flags misc
    // Bit masks for `miscfeatures` (ECX returned by CPUID leaf 1).
    enum : uint
    {
	    SSE3_BIT = 1,
	    SSSE3_BIT = 1<<9,
	    SSE41_BIT = 1<<19,
	    SSE42_BIT = 1<<20
    }
    // AMD feature flags
    // Bit masks for `amdfeatures` (EDX returned by CPUID leaf 0x8000_0001).
    enum : uint
    {
	    AMD_MMX_BIT = 1<<22,
	    AMD64_BIT = 1<<29,
	    AMD_3DNOW_EXT_BIT = 1<<30,
	    AMD_3DNOW_BIT = 1<<31
    }
    // AMD misc feature flags
    // Bit masks for `amdmiscfeatures` (ECX returned by CPUID leaf 0x8000_0001).
    enum : uint
    {
    	SSE4A_BIT = 1<<6,
    	SSE5_BIT = 1<<11
    }

version(D_InlineAsm_X86) {
// Note that this code will also work for Itanium, after changing the
// register names in the asm code.

// EAX as returned by CPUID leaf 0 and by CPUID leaf 0x8000_0000 respectively.
// Both are set by cpuidX86() and are compared against a leaf number before
// that leaf is queried.
uint max_cpuid, max_extended_cpuid;

// CPUID2: "cache and tlb information"
//
// Fills in datacache[] from the one-byte descriptors returned by CPUID leaf 2.
// Only the data/unified cache descriptors listed in the tables below are
// recognised; every other descriptor byte is ignored.
void getcacheinfoCPUID2()
{
	// CPUID2 is a dog's breakfast. What was Intel thinking???
	// We are only interested in the data caches
	// We only use this for old Intel CPUs, so we can assume a single-core system.
	void decipherCpuid2(ubyte x) {
		if (x==0) return;
		// Values from http://www.sandpile.org/ia32/cpuid.htm
		// The three tables are parallel: descriptor ids[i] has size sizes[i]
		// (in KB) and associativity ways[i].
		ubyte [] ids = [
			0x0A, 0x0C, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68,
			// level 2 cache
			0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
		    0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
		    0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81,
		    // level 3 cache
			0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D
		];
		uint [] sizes = [
			8, 16, 32, 16, 24, 8, 16, 32,
		    128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
		    256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
		    128, 192, 128, 256, 384, 512, 3072, 512, 128,		    
			512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024
		];
		ubyte [] ways = [
			2, 4, 8, 8, 6, 4, 4, 4,
		    4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
		    8, 8, 8, 8, 4, 8, 16, 24,
		    4, 6, 2, 4, 6, 4, 12, 8, 8,
			4, 8, 8, 8, 4, 8, 12, 16, 12, 16
		];
		// Table indices at which the level 2 and level 3 entries begin.
		enum { FIRSTDATA2 = 8, FIRSTDATA3 = 28+9 }
		for (int i=0; i< ids.length; ++i) {
			if (x==ids[i]) {
				// level is zero-based: 0 = L1, 1 = L2, 2 = L3.
				int level = i< FIRSTDATA2 ? 0: i<FIRSTDATA3 ? 1 : 2;
				// On family 0xF model 6, descriptor 0x49 is treated as a level 3 cache.
				if (x==0x49 && family==0xF && model==0x6) level=2;
				datacache[level].size=sizes[i];
				datacache[level].associativity=ways[i];
				// All level 3 descriptors in the table use 64-byte lines.
				// Because level is zero-based, the level 3 test is level==2;
				// a test against 3 could never be true.
				if (level ==2 || x==0x2C || (x>=0x48 && x<=0x80) || x==0x86 || x==0x87
					|| (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E)	){
					datacache[level].lineSize = 64;
				} else datacache[level].lineSize = 32;
			}
		}
	}

	uint[4] a;	
	bool firstTime = true;
	uint numinfos = 1;
	for (;;) {
		asm {
			mov EAX, 2;
			cpuid;
			mov a, EAX;
			mov a+4, EBX;
			mov a+8, ECX;
			mov a+12, EDX;
		}
		if (firstTime) {
			// lsb of a is how many times to loop.
			numinfos = a[0] & 0xFF;
			// and otherwise it should be ignored
			a[0] &= 0xFFFF_FF00;
			firstTime = false;
		}
		for (int c=0; c<4;++c) {
			// high bit set == no info.
			if (a[c] & 0x8000_0000) continue;
			decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
			decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
			decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
			decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
		}
		// Stop once the requested number of queries has been made. A count
		// of 0 is treated like 1 so the counter cannot wrap around and spin
		// for billions of iterations.
		if (numinfos <= 1) break;
		--numinfos;
	}
}

// CPUID4: "Deterministic cache parameters" leaf
//
// Queries CPUID leaf 4 with increasing sub-leaf numbers until the cache type
// field (low 5 bits of EAX) reads 0. For each data or unified cache the
// matching datacache[] entry is filled in, and maxCores is raised to the
// largest core count reported by any sub-leaf.
void getcacheinfoCPUID4()
{
	int cachenum = 0;
	for(;;) {
		uint a, b, number_of_sets;	
		asm {
			mov EAX, 4;
			mov ECX, cachenum;
			cpuid;
			mov a, EAX;
			mov b, EBX;
			mov number_of_sets, ECX;
		}
		++cachenum;
		uint cachetype = a & 0x1F;
		if (cachetype==0) break; // no more caches
		uint numcores = ((a>>26) & 0x3F) + 1;
		if (numcores > maxCores) maxCores = numcores;
		if (cachetype!=1 && cachetype!=3) continue; // we only want data & unified caches
		
		++number_of_sets; // the register holds (number of sets - 1)
		// Convert the 1-based level field to a 0-based datacache index.
		// A field value of 0 wraps to 255 and is rejected by the check below.
		ubyte level = cast(ubyte)(((a>>5)&7)-1);
		// Valid indices are 0 .. datacache.length-1, so the comparison must
		// be >= to keep level == datacache.length out of the array.
		if (level >= datacache.length) continue; // ignore deep caches
		datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1);
		datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size
		uint line_partitions = ((b >> 12)& 0x3FF) + 1;
		// Size = number of sets * associativity * cachelinesize * linepartitions
		// and must convert to Kb, also dividing by the number of cores.
		ulong sz = (datacache[level].associativity< ubyte.max)? number_of_sets *
			datacache[level].associativity : number_of_sets;		
		datacache[level].size = cast(uint)(
				(sz * datacache[level].lineSize * line_partitions ) / (numcores *1024));
		if (level == 0 && cachetype==3) {
			// Halve the size for unified L1 caches
			datacache[level].size/=2;
		}
	}
}

// CPUID8000_0005 & 6
//
// Fills in datacache[0] from extended leaf 0x8000_0005 and, when leaf
// 0x8000_0006 is available, datacache[1] and datacache[2] as well.
// If leaf 0x8000_0008 is available, maxCores is raised to the core count
// it reports.
void getAMDcacheinfo()
{
	uint c5, c6, d6;
	asm {
		mov EAX, 0x8000_0005; // L1 cache
		cpuid;
		// EAX has L1_TLB_4M.
		// EBX has L1_TLB_4K
		// EDX has L1 instruction cache
		mov c5, ECX;
	}

	datacache[0].size = ( (c5>>24) & 0xFF);
	datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
	datacache[0].lineSize = c5 & 0xFF;

	if (max_extended_cpuid >= 0x8000_0006) {
		// AMD K6-III or K6-2+ or later.
		uint numcores = 1;
		if (max_extended_cpuid >=0x8000_0008) {
			uint c8;
			asm {
		    	mov EAX, 0x8000_0008;
		    	cpuid;
		    	mov c8, ECX;
		    }
		    // The low byte of ECX holds (number of cores - 1). The count is
		    // kept in a uint so that a field value of 0xFF yields 256 instead
		    // of wrapping to 0, which would make the L3 size computation
		    // below divide by zero.
		    numcores = (c8 & 0xFF) + 1;
		    if (numcores>maxCores) maxCores = numcores;
		}
		asm {
			mov EAX, 0x8000_0006; // L2/L3 cache
			cpuid;
			mov c6, ECX; // L2 cache info
			mov d6, EDX; // L3 cache info
		}
	
		// Maps the 4-bit encoded associativity field to a number of ways
		// (0xFF = fully associative, matching ubyte.max in CacheInfo).
		ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
		datacache[1].size = (c6>>16) & 0xFFFF;
		datacache[1].associativity = assocmap[(c6>>12)&0xF];
		datacache[1].lineSize = c6 & 0xFF;
		
		// The L3 cache value is TOTAL, not per core.
		datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
		datacache[2].associativity = assocmap[(d6>>12)&0xF];
		datacache[2].lineSize = d6 & 0xFF;
	}
}


// Queries the CPUID instruction and fills in the module-level description of
// the CPU: vendorID, max_cpuid, max_extended_cpuid, probablyIntel, the four
// feature words, stepping/model/family, maxCores, maxThreads, processorName
// and datacache[].
void cpuidX86()
{
    char * venptr = vendorID.ptr;
	asm {
		mov EAX, 0;
		cpuid;
		mov max_cpuid, EAX;
		mov EAX, venptr;
		mov [EAX], EBX;
		mov [EAX + 4], EDX;
		mov [EAX + 8], ECX;
		mov EAX, 0x8000_0000;
		cpuid;
		mov max_extended_cpuid, EAX;
	}
	
	// This assigns the module-level flag that preferAthlon() and
	// preferPentium4() read. Declaring a local of the same name here would
	// leave the module-level flag permanently false.
	probablyIntel = vendorID == "GenuineIntel";
	bool isAMD = vendorID == "AuthenticAMD";
	uint a, c;
	uint apic = 0; // brand index, apic id
	asm {
		mov EAX, 1; // model, stepping
		cpuid;
		mov a, EAX;
		mov apic, EBX;
		mov miscfeatures, ECX;
		mov features, EDX;
	}
	amdfeatures = 0;
	amdmiscfeatures = 0;
	if (max_extended_cpuid >= 0x8000_0001) {
		asm {
			mov EAX, 0x8000_0001;
			cpuid;
			mov amdmiscfeatures, ECX;
			mov amdfeatures, EDX;
		}
	}
	// Try to detect fraudulent vendorIDs
	if (amd3dnow) probablyIntel = false;
	
	stepping = a & 0xF;
	uint fbase = (a >> 8) & 0xF;
	uint mbase = (a >> 4) & 0xF;
	// The extended family field is masked before it is added to the base
	// family. Without the inner parentheses '+' binds tighter than '&' and
	// the mask would be applied to the sum instead.
	family = ((fbase == 0xF) || (fbase == 0)) ? fbase + ((a >> 20) & 0xFF) : fbase;
	model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) ) ?
	     mbase + ((a >> 12) & 0xF0) : mbase;
	
	if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) {
		// determine max number of cores for AMD
		asm {
			mov EAX, 0x8000_0008;
			cpuid;
			mov c, ECX;
		}
		uint apicsize = (c>>12) & 0xF;
		if (apicsize == 0) {
			// use legacy method
			// The low byte of ECX holds (number of cores - 1), the same
			// field getAMDcacheinfo() increments before using it.
			if (hyperThreadingBit)	maxCores = (c & 0xFF) + 1;
			else maxCores = 1;
		} else {
			// maxcores = 2^ apicsize
			maxCores = 1;
			while (apicsize) { maxCores<<=1; --apicsize; }
		}
	}
	
	if (max_extended_cpuid >= 0x8000_0004) {
		char *procptr = processorNameBuffer.ptr;
		asm {
			push ESI;
			mov ESI, procptr;
			mov EAX, 0x8000_0002;
			cpuid;
			mov [ESI], EAX;
			mov [ESI+4], EBX;
			mov [ESI+8], ECX;
			mov [ESI+12], EDX;
			mov EAX, 0x8000_0003;
			cpuid;
			mov [ESI+16], EAX;
			mov [ESI+20], EBX;
			mov [ESI+24], ECX;
			mov [ESI+28], EDX;
			mov EAX, 0x8000_0004;
			cpuid;
			mov [ESI+32], EAX;
			mov [ESI+36], EBX;
			mov [ESI+40], ECX;
			mov [ESI+44], EDX;
			pop ESI;			
		}
		// Intel P4 and PM pad at front with spaces.
		// Other CPUs pad at end with nulls.
		// Both scans are bounded so that a buffer consisting only of
		// padding yields an empty name instead of indexing out of range.
		int start = 0, end = 0;
		while (start < processorNameBuffer.length
			&& processorNameBuffer[start] == ' ') { ++start; }
		while (end < processorNameBuffer.length - start
			&& processorNameBuffer[$-end-1] == 0) { ++end; }
		processorName = processorNameBuffer[start..$-end];
	} else {
		processorName = "Unknown CPU";
	}
	
	// Intel docs specify that they return 0 for 0x8000_0005.
	// AMD docs do not specify the behaviour for 0004 and 0002.
	// Centaur/VIA and most other manufacturers use the AMD method,
	// except Cyrix who apparently used CPUID2.
	// Therefore, we try the AMD method unless it's an Intel chip.
	// If we still have no info, try the Intel methods.
	datacache[0].size = 0;
	if (max_cpuid<2 || !probablyIntel) {
		if (max_extended_cpuid >= 0x8000_0005) {
			getAMDcacheinfo();
		} else if (isAMD) {		
			// According to http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html,
			// this means CPU before K5 model 1. Probably has a tiny cache.
			datacache[0].size = 8;
			datacache[0].associativity = 2;
			datacache[0].lineSize = 32;		
		}
	}	
	if ((datacache[0].size == 0) && max_cpuid>=4) {
		getcacheinfoCPUID4();
	}
	if ((datacache[0].size == 0) && max_cpuid>=2) {
		getcacheinfoCPUID2();
	}
	if (datacache[0].size == 0) {
		// Pentium, PMMX, late model 486, or an obscure CPU
		if (mmx) { // Pentium MMX. Also has 8kB code cache.
			datacache[0].size = 16;
			datacache[0].associativity = 4;
			datacache[0].lineSize = 32;		
		} else { // Pentium 1 (which also has 8kB code cache)
				 // or 486.
			datacache[0].size = 8;
			datacache[0].associativity = 2;
			datacache[0].lineSize = 32;
		}		
	}
	// Bits 16..23 of EBX from leaf 1 are used as the thread count when the
	// HTT flag is set; otherwise one thread per core is assumed.
	if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF;
	else maxThreads = maxCores;
}

// Return true if the cpuid instruction is supported.
//
// The test tries to toggle bit 21 of EFLAGS (mask 0x0020_0000, the ID flag):
// the current flags are saved, the bit is flipped and written back with
// popfd, and the flags are then read again. If the bit reads back changed,
// the function returns true.
// BUG(WONTFIX): Doesn't work for ancient Cyrix processors.
bool hasCPUID()
{
	uint flags;
	asm {
		pushfd;
		pop EAX;
		mov flags, EAX;          // flags = EFLAGS before the test
		xor EAX, 0x0020_0000;    // flip bit 21
		push EAX;
		popfd;                   // try to store the modified value
		pushfd;
		pop EAX;                 // EAX = EFLAGS as actually stored
		xor flags, EAX;          // flags = bits that differ from the saved value
	}
	return (flags & 0x0020_0000) !=0;
}

} else { // inline asm X86

	// Fallback used when x86 inline assembler is not available:
	// CPUID is always reported as unsupported.
	bool hasCPUID()
	{
		return false;
	}

	// Fallback counterpart of the inline-asm cpuidX86(): describes the
	// L1 data cache as 8 KB, 2-way, with 32-byte lines.
	void cpuidX86()
	{
		CacheInfo* l1 = &datacache[0];
		l1.size = 8;
		l1.associativity = 2;
		l1.lineSize = 32;
	}
}

// TODO: Implement this function with OS support
//
// Fills in datacache[0..3] from fixed per-model tables. The model is
// currently hard-coded to PPC603 because it cannot be detected (see below).
void cpuidPPC()
{
	enum : int
	{
		PPC601, PPC603, PPC603E, PPC604, PPC604E, PPC620, PPCG3, PPCG4, PPCG5
	}

	// TODO:
	// asm { mfpvr } returns the CPU version but unfortunately it can
	// only be used in kernel mode. So OS support is required.
	int cputype = PPC603;

	// Tables indexed by the model enum above; sizes are in KB.
	// 601 has a 8KB combined data & code L1 cache.
	uint[]  l1Sizes = [4, 8, 16, 16, 32, 32, 32, 32, 64];
	ubyte[] l1Ways  = [8, 2,  4,  4,  4,  8,  8,  8,  8];
	uint[]  l2Sizes = [0, 0,  0,  0,  0,  0,  0,  256,  512];
	uint[]  l3Sizes = [0, 0,  0,  0,  0,  0,  0,  2048,  0];

	// The line size depends only on the model and is shared by all levels.
	uint line;
	if (cputype == PPCG5)
		line = 128;
	else if (cputype == PPC620 || cputype == PPCG3)
		line = 64;
	else
		line = 32;

	datacache[0].size = l1Sizes[cputype];
	datacache[0].associativity = l1Ways[cputype];
	datacache[0].lineSize = line;

	datacache[1].size = l2Sizes[cputype];
	datacache[1].lineSize = line;

	datacache[2].size = l3Sizes[cputype];
	datacache[2].lineSize = line;
}

// TODO: Implement this function with OS support
//
// Currently an empty stub: it changes no state. The table below records
// known cache figures for reference (presumably sizes in KB, like
// CacheInfo.size -- TODO confirm).
void cpuidSparc()
{
	// UltraSparcIIi : L1 = 16,  2way, L2 = 512, 4 way.
	// UltraSparcIII : L1 = 64,  4way. L2= 4096 or 8192.
	// UltraSparcIIIi: L1 = 64,  4way. L2= 1024, 4 way
	// UltraSparcIV  : L1 = 64,  4way. L2 = 16*1024.
	// UltraSparcIV+ : L1 = 64,  4way. L2 = 2048, L3=32*1024.
	// Sparc64V      : L1 = 128, 2way. L2 = 4096 4way.	
}


// Module constructor: detects the CPU once at start-up, then normalises
// datacache[] so that every level holds usable values.
static this()
{
	// Without CPUID it's a 386 or 486, or a Cyrix 6x86, which probably
	// still has an external cache; nothing is queried in that case.
	if (hasCPUID())
		cpuidX86();

	CacheInfo* l1 = &datacache[0];
	if (l1.size == 0) {
		// Guess same as Pentium 1.
		l1.size = 8;
		l1.associativity = 2;
		l1.lineSize = 32;
	}

	// Count the real cache levels, and make every unused level look like
	// a direct-mapped cache spanning the full address space.
	numCacheLevels = 1;
	for (int level = 1; level < datacache.length; ++level) {
		CacheInfo* cur = &datacache[level];
		if (cur.size != 0) {
			numCacheLevels = level + 1;
			continue;
		}
		cur.size = uint.max/1024;
		cur.associativity = 1;
		cur.lineSize = datacache[level-1].lineSize;
	}
}

public:

import std.string : format;

/// Builds a multi-line, human-readable report of everything this module
/// knows about the CPU: vendor and processor strings, signature, feature
/// list, core/thread counts and the detected data caches.
char[] toString()
{
	// Space-separated list of the supported features.
	char[] featureList;
	if (mmx)
		featureList ~= "MMX ";
	if (sse)
		featureList ~= "SSE ";
	if (sse2)
		featureList ~= "SSE2 ";
	if (sse3)
		featureList ~= "SSE3 ";
	if (ssse3)
		featureList ~= "SSSE3 ";
	if (amd3dnow)
		featureList ~= "3DNow! ";
	if (amd3dnowExt)
		featureList ~= "3DNow!+ ";
	if (amdMmx)
		featureList ~= "MMX+ ";
	if (isX86_64)
		featureList ~= "X86-64 ";
	if (fxsr)
		featureList ~= "FXSR ";
	if (hyperThreading)
		featureList ~= "HTT";

	// One line per detected cache level, numbered from 1.
	char[] caches = "Data caches per CPU:\n";
	int level = 0;
	while (level < numCacheLevels) {
		CacheInfo* c = &datacache[level];
		++level;
		caches ~= format("L%d ways = %d linesize = %d size = %dK\n",
			level, c.associativity, c.lineSize, c.size);
	}

	char[] report = format("Vendor string:    %s\n", vendorID);
	report ~= format("Processor string: %s\n", processorName);
	report ~= format("Signature:        Family = %X Model = %X Stepping = %X\n",
		family, model, stepping);
	report ~= format("Features:         %s\n", featureList);
	report ~= format("Multithreading:   %d cores / %d threads\n",
		coresPerCPU, threadsPerCPU);
	report ~= caches;
	return report;
}

// ---------------

import std.stdio : writefln;

/// Prints the CPU description produced by toString() to standard output.
void main()
{
	// The report is passed as an argument, not as the format string, so
	// that a '%' in the CPU-supplied vendor or processor string cannot be
	// interpreted as a format specifier.
	writefln("%s", toString());
}