/**
 * Identify the characteristics of the host CPU.
 *
 * This code relies on information found in:
 *
 * - "AMD CPUID Specification", Advanced Micro Devices, revision 2.28 (April 2008)
 * - "Intel(R) 64 and IA-32 Architectures Software Developers Manual,
 *    Volume 2A: Instruction Set Reference, A-M" (Nov 2007)
 * - http://www.sandpile.org/ia32/cpuid.htm
 * - http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html,
 * - "What every programmer should know about memory",
 *    Ulrich Drepper, Red Hat, Inc. (Nov 21, 2007).
 *
 * Example:
 * ---
 * import std.cpuid;
 * import std.stdio;
 *
 * void main()
 * {
 *     writefln(std.cpuid.toString());
 * }
 * ---
 *
 * AUTHORS:   Don Clugston,
 *            Tomas Lindquist Olsen <tomas@famolsen.dk>
 *            (slightly altered by Walter Bright)
 * COPYRIGHT: Public Domain
 *
 * BUGS:      Currently only works on x86 CPUs
 *
 * Macros:
 *      WIKI = Phobos/StdCpuid
 *      COPYRIGHT = Public Domain
 */

public:

/// Describes one level of data cache: its size and behaviour.
struct CacheInfo
{
    /// Cache size in kilobytes, per CPU.
    /// A unified (data + code) L1 cache is reported at half its physical
    /// size. Larger unified caches are not halved, since normally
    /// data size >> code size for critical loops.
    uint size;

    /// Number of ways of associativity, eg:
    /// 1 = direct mapped
    /// 2 = 2-way set associative
    /// 3 = 3-way set associative
    /// ubyte.max = fully associative
    ubyte associativity;

    /// Number of bytes read into the cache when a cache miss occurs.
    uint lineSize;
}

public:

/// Returns the vendor string, for display purposes only.
/// Note that some CPUs have programmable vendorIDs.
char[] vendor()
{
    return vendorID;
}

/// Returns the processor string, for display purposes only.
char[] processor()
{
    return processorName;
}

/// The data caches. If there are fewer than 5 physical cache levels,
/// the remaining levels are set to uint.max (== entire memory space).
CacheInfo[5] datacache;

/// Processor type (vendor-dependent)
uint stepping;
/// ditto
uint model;
/// ditto
uint family;

/// Is MMX supported?
bool mmx()
{
    return (features & MMX_BIT) != 0;
}

/// Is SSE supported?
bool sse()
{
    return (features & SSE_BIT) != 0;
}

/// Is SSE2 supported?
bool sse2() { return (features & SSE2_BIT) != 0; }

/// Is SSE3 supported?
bool sse3() { return (miscfeatures & SSE3_BIT) != 0; }

/// Is SSSE3 supported?
bool ssse3() { return (miscfeatures & SSSE3_BIT) != 0; }

/// Is AMD 3DNOW supported?
bool amd3dnow() { return (amdfeatures & AMD_3DNOW_BIT) != 0; }

/// Is AMD 3DNOW Ext supported?
bool amd3dnowExt() { return (amdfeatures & AMD_3DNOW_EXT_BIT) != 0; }

/// Are AMD extensions to MMX supported?
bool amdMmx() { return (amdfeatures & AMD_MMX_BIT) != 0; }

/// Is fxsave/fxrstor supported?
bool fxsr() { return (features & FXSR_BIT) != 0; }

/// Is this an Intel64 or AMD 64?
///
/// Both vendors report 64-bit long mode through AMD64_BIT (bit 29 of EDX
/// from extended leaf 0x8000_0001, stored in amdfeatures by cpuidX86).
/// IA64_BIT (bit 30 of the standard feature flags) is deliberately not
/// consulted: it identifies an Itanium emulating x86, which cannot run
/// x86-64 code.
bool isX86_64() { return (amdfeatures & AMD64_BIT) != 0; }

/// Is hyperthreading supported?
/// True when the CPU exposes more threads than cores.
bool hyperThreading() { return maxThreads > maxCores; }

/// Returns number of threads per CPU
uint threadsPerCPU() { return maxThreads; }

/// Returns number of cores in CPU
uint coresPerCPU() { return maxCores; }

/// Optimisation hints for assembly code.
/// For forward compatibility, the CPU is compared against different
/// microarchitectures. For 32-bit X86, comparisons are made against
/// the Intel PPro/PII/PIII/PM family.
///
/// The major 32-bit x86 microarchitecture 'dynasties' have been:
/// (1) Intel P6 (PentiumPro, PII, PIII, PM, Core, Core2).
/// (2) AMD Athlon (K7, K8, K10).
/// (3) Intel NetBurst (Pentium 4, PentiumD).
/// (4) Intel Pentium1, PMMX.
/// (5) Other (Nx586, AMD K5 & K6, Centaur, Cyrix, Transmeta, etc)
///
/// Within each dynasty, the optimisation techniques are largely
/// identical (eg, use instruction pairing for group 4). Major
/// instruction set improvements occur within each group.

/// Does this CPU perform better on AMD K7 code than PentiumPro..Core2 code?
bool preferAthlon() { return !probablyIntel; }

/// Does this CPU perform better on Pentium4 code than PentiumPro..Core2 code?
bool preferPentium4() { return probablyIntel && family == 0xF; }

/// Does this CPU perform better on Pentium I code than Pentium Pro code?
bool preferPentium1() { return family < 6; }

private:

// Detection results. They are filled in by cpuidX86(), which the module
// constructor calls when hasCPUID() reports that CPUID is available.
bool probablyIntel; // true = _probably_ an Intel processor
char [12] vendorID;
char [] processorName;
char [48] processorNameBuffer;
uint numCacheLevels = 1;
uint features = 0;        // mmx, sse, sse2, hyperthreading, etc
uint miscfeatures = 0;    // popcnt, sse3, etc.
uint amdfeatures = 0;     // 3dnow!, mmxext, etc
uint amdmiscfeatures = 0; // sse4a, sse5, svm, etc
uint maxCores = 1;
uint maxThreads = 1;

// Note that this may indicate multi-core rather than hyperthreading.
bool hyperThreadingBit() { return (features&HTT_BIT)!=0;}

// feature flags
enum : uint
{
    MMX_BIT = 1<<23,
    FXSR_BIT = 1<<24,
    SSE_BIT = 1<<25,
    SSE2_BIT = 1<<26,
    HTT_BIT = 1<<28,
    IA64_BIT = 1<<30
}
// feature flags misc
enum : uint
{
    SSE3_BIT = 1,
    SSSE3_BIT = 1<<9,
    SSE41_BIT = 1<<19,
    SSE42_BIT = 1<<20
}
// AMD feature flags
enum : uint
{
    AMD_MMX_BIT = 1<<22,
    AMD64_BIT = 1<<29,
    AMD_3DNOW_EXT_BIT = 1<<30,
    AMD_3DNOW_BIT = 1<<31
}
// AMD misc feature flags
enum : uint
{
    SSE4A_BIT = 1<<6,
    SSE5_BIT = 1<<11
}

version(D_InlineAsm_X86) {
    // Note that this code will also work for Itanium, after changing the
    // register names in the asm code.

    // Highest standard and extended CPUID leaf numbers, read by cpuidX86().
    uint max_cpuid, max_extended_cpuid;

    // CPUID2: "cache and tlb information"
    //
    // Executes CPUID leaf 2 and feeds every descriptor byte it returns to
    // decipherCpuid2, which fills in the matching datacache[] entry.
    void getcacheinfoCPUID2()
    {
        // CPUID2 is a dog's breakfast. What was Intel thinking???
        // We are only interested in the data caches
        // We only use this for old Intel CPUs, so we can assume a single-core system.

        // Translates one descriptor byte into a datacache[] entry.
        // A zero byte and any byte not listed in ids[] are ignored.
        void decipherCpuid2(ubyte x)
        {
            if (x==0) return;
            // Values from http://www.sandpile.org/ia32/cpuid.htm
            // ids, sizes and ways are parallel tables: entry i of each
            // describes the same descriptor byte.
            ubyte [] ids = [
                0x0A, 0x0C, 0x2C, 0x60, 0x0E, 0x66, 0x67, 0x68,
                // level 2 cache
                0x41, 0x42, 0x43, 0x44, 0x45, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7F,
                0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x49, 0x4E,
                0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x48, 0x80, 0x81,
                // level 3 cache
                0x22, 0x23, 0x25, 0x29, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D
            ];
            uint [] sizes = [
                8, 16, 32, 16, 24, 8, 16, 32,
                128, 256, 512, 1024, 2048, 1024, 128, 256, 512, 1024, 2048, 512,
                256, 512, 1024, 2048, 512, 1024, 4096, 6*1024,
                128, 192, 128, 256, 384, 512, 3072, 512, 128,
                512, 1024, 2048, 4096, 4096, 8192, 6*1024, 8192, 12*1024, 16*1024
            ];
            ubyte [] ways = [
                2, 4, 8, 8, 6, 4, 4, 4,
                4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 2,
                8, 8, 8, 8, 4, 8, 16, 24,
                4, 6, 2, 4, 6, 4, 12, 8, 8,
                4, 8, 8, 8, 4, 8, 12, 16, 12, 16
            ];
            // Index of the first level-2 and the first level-3 entry in ids[].
            enum { FIRSTDATA2 = 8, FIRSTDATA3 = 28+9 }
            for (int i=0; i< ids.length; ++i) {
                if (x==ids[i]) {
                    // NOTE(review): the next statement is damaged in this
                    // copy of the file. The text between `i` and `=0x48` is
                    // missing, so FIRSTDATA3, sizes[] and ways[] are never
                    // referenced and the statement does not parse. It is
                    // reproduced unchanged; restore it from a pristine copy
                    // of the file before building.
                    int level = i< FIRSTDATA2 ? 0: i=0x48 && x<=0x80) || x==0x86 || x==0x87 || (x>=0x66 && x<=0x68) || (x>=0x39 && x<=0x3E) ){
                        datacache[level].lineSize = 64;
                    } else datacache[level].lineSize = 32;
                }
            }
        }

        uint[4] a;
        bool firstTime = true;
        uint numinfos = 1;
        do {
            asm {
                mov EAX, 2;
                cpuid;
                mov a, EAX;
                mov a+4, EBX;
                mov a+8, ECX;
                mov a+12, EDX;
            }
            if (firstTime) {
                // lsb of a is how many times to loop.
                numinfos = a[0] & 0xFF;
                // and otherwise it should be ignored
                a[0] &= 0xFFFF_FF00;
                firstTime = false;
            }
            for (int c=0; c<4;++c) {
                // high bit set == no info.
                if (a[c] & 0x8000_0000) continue;
                decipherCpuid2(cast(ubyte)(a[c] & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c]>>8) & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c]>>16) & 0xFF));
                decipherCpuid2(cast(ubyte)((a[c]>>24) & 0xFF));
            }
            // A reported count of 0 must not be decremented: it would wrap
            // to uint.max and keep the loop spinning.
            if (numinfos != 0) --numinfos;
        } while (numinfos != 0);
    }

    // CPUID4: "Deterministic cache parameters" leaf
    //
    // Walks the sub-leaves of CPUID leaf 4 until one reports cache type 0,
    // recording every data or unified cache in datacache[] and raising
    // maxCores to the largest core count reported.
    void getcacheinfoCPUID4()
    {
        int cachenum = 0;
        for(;;) {
            uint a, b, number_of_sets;
            asm {
                mov EAX, 4;
                mov ECX, cachenum;
                cpuid;
                mov a, EAX;
                mov b, EBX;
                mov number_of_sets, ECX;
            }
            ++cachenum;
            uint cachetype = a & 0x1F;
            if (cachetype == 0) break; // no more caches
            uint numcores = ((a>>26) & 0x3F) + 1;

            if (numcores > maxCores) maxCores = numcores;
            if (cachetype != 1 && cachetype != 3) continue; // we only want data & unified caches

            ++number_of_sets;
            // A level field of 0 wraps to 255 here and is rejected below.
            ubyte level = cast(ubyte)(((a>>5)&7)-1);
            // '>=' rather than '>': level == datacache.length is already
            // one past the last valid index.
            if (level >= datacache.length) continue; // ignore deep caches
            datacache[level].associativity = a & 0x200 ? ubyte.max :cast(ubyte)((b>>22)+1);
            datacache[level].lineSize = (b & 0xFFF)+ 1; // system coherency line size
            uint line_partitions = ((b >> 12)& 0x3FF) + 1;
            // Size = number of sets * associativity * cachelinesize * linepartitions
            // and must convert to Kb, also dividing by the number of cores.
            ulong sz = (datacache[level].associativity< ubyte.max)
                ? number_of_sets * datacache[level].associativity
                : number_of_sets;
            datacache[level].size = cast(uint)(
                (sz * datacache[level].lineSize * line_partitions ) / (numcores *1024));
            if (level == 0 && cachetype == 3) {
                // Halve the size for unified L1 caches
                datacache[level].size/=2;
            }
        }
    }

    // CPUID8000_0005 & 6
    //
    // Reads the L1 data cache from extended leaf 0x8000_0005 and, when
    // available, the L2 and L3 caches from leaf 0x8000_0006.
    void getAMDcacheinfo()
    {
        uint c5, c6, d6;
        asm {
            mov EAX, 0x8000_0005; // L1 cache
            cpuid;
            // EAX has L1_TLB_4M.
            // EBX has L1_TLB_4K
            // EDX has L1 instruction cache
            mov c5, ECX;
        }

        datacache[0].size = ( (c5>>24) & 0xFF);
        datacache[0].associativity = cast(ubyte)( (c5 >> 16) & 0xFF);
        datacache[0].lineSize = c5 & 0xFF;

        if (max_extended_cpuid >= 0x8000_0006) {
            // AMD K6-III or K6-2+ or later.
            ubyte numcores = 1;
            if (max_extended_cpuid >=0x8000_0008) {
                asm {
                    mov EAX, 0x8000_0008;
                    cpuid;
                    mov numcores, CL;
                }
                // CL holds (number of cores - 1).
                ++numcores;
                if (numcores>maxCores) maxCores = numcores;
            }
            asm {
                mov EAX, 0x8000_0006; // L2/L3 cache
                cpuid;
                mov c6, ECX; // L2 cache info
                mov d6, EDX; // L3 cache info
            }

            // Maps the 4-bit associativity code to a number of ways.
            ubyte [] assocmap = [ 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, 0xFF ];
            datacache[1].size = (c6>>16) & 0xFFFF;
            datacache[1].associativity = assocmap[(c6>>12)&0xF];
            datacache[1].lineSize = c6 & 0xFF;

            // The L3 cache value is TOTAL, not per core.
            datacache[2].size = ((d6>>18)*512)/numcores; // could be up to 2 * this, -1.
            datacache[2].associativity = assocmap[(d6>>12)&0xF];
            datacache[2].lineSize = d6 & 0xFF;
        }
    }

    // Queries CPUID and fills in every piece of module state: vendor and
    // processor strings, feature flags, family/model/stepping, core and
    // thread counts, and the datacache[] table.
    void cpuidX86()
    {
        char * venptr = vendorID.ptr;
        asm {
            mov EAX, 0;
            cpuid;
            mov max_cpuid, EAX;
            mov EAX, venptr;
            mov [EAX], EBX;
            mov [EAX + 4], EDX;
            mov [EAX + 8], ECX;
            mov EAX, 0x8000_0000;
            cpuid;
            mov max_extended_cpuid, EAX;
        }

        // Assign the module-level flag. Declaring a local of the same name
        // here would leave the module variable false forever, and
        // preferAthlon()/preferPentium4() read the module variable.
        probablyIntel = vendorID == "GenuineIntel";
        bool isAMD = vendorID == "AuthenticAMD";
        uint a, c;
        uint apic = 0; // brand index, apic id
        asm {
            mov EAX, 1; // model, stepping
            cpuid;
            mov a, EAX;
            mov apic, EBX;
            mov miscfeatures, ECX;
            mov features, EDX;
        }
        amdfeatures = 0;
        amdmiscfeatures = 0;
        if (max_extended_cpuid >= 0x8000_0001) {
            asm {
                mov EAX, 0x8000_0001;
                cpuid;
                mov amdmiscfeatures, ECX;
                mov amdfeatures, EDX;
            }
        }
        // Try to detect fraudulent vendorIDs
        if (amd3dnow) probablyIntel = false;

        stepping = a & 0xF;
        uint fbase = (a >> 8) & 0xF;
        uint mbase = (a >> 4) & 0xF;
        // The extended family field is masked before it is added: '+' binds
        // tighter than '&', so the mask needs its own parentheses.
        family = ((fbase == 0xF) || (fbase == 0))
            ? fbase + ((a >> 20) & 0xFF)
            : fbase;
        model = ((fbase == 0xF) || (fbase == 6 && probablyIntel) )
            ? mbase + ((a >> 12) & 0xF0)
            : mbase;

        if (!probablyIntel && max_extended_cpuid >= 0x8000_0008) {
            // determine max number of cores for AMD
            asm {
                mov EAX, 0x8000_0008;
                cpuid;
                mov c, ECX;
            }
            uint apicsize = (c>>12) & 0xF;
            if (apicsize == 0) {
                // use legacy method
                // The low byte of ECX is (number of cores - 1), the same
                // encoding getAMDcacheinfo() decodes with ++numcores.
                if (hyperThreadingBit) maxCores = (c & 0xFF) + 1;
                else maxCores = 1;
            } else {
                // maxcores = 2^ apicsize
                maxCores = 1;
                while (apicsize) { maxCores<<=1; --apicsize; }
            }
        }

        if (max_extended_cpuid >= 0x8000_0004) {
            char *procptr = processorNameBuffer.ptr;
            asm {
                push ESI;
                mov ESI, procptr;
                mov EAX, 0x8000_0002;
                cpuid;
                mov [ESI], EAX;
                mov [ESI+4], EBX;
                mov [ESI+8], ECX;
                mov [ESI+12], EDX;
                mov EAX, 0x8000_0003;
                cpuid;
                mov [ESI+16], EAX;
                mov [ESI+20], EBX;
                mov [ESI+24], ECX;
                mov [ESI+28], EDX;
                mov EAX, 0x8000_0004;
                cpuid;
                mov [ESI+32], EAX;
                mov [ESI+36], EBX;
                mov [ESI+40], ECX;
                mov [ESI+44], EDX;
                pop ESI;
            }
            // Intel P4 and PM pad at front with spaces.
            // Other CPUs pad at end with nulls.
            // Both scans are bounded so that a buffer holding nothing but
            // padding yields an empty name instead of indexing outside it.
            int start = 0, end = 0;
            while (start < processorNameBuffer.length
                   && processorNameBuffer[start] == ' ') { ++start; }
            while (end < processorNameBuffer.length - start
                   && processorNameBuffer[$-end-1] == 0) { ++end; }
            processorName = processorNameBuffer[start..$-end];
        } else {
            processorName = "Unknown CPU";
        }

        // Intel docs specify that they return 0 for 0x8000_0005.
        // AMD docs do not specify the behaviour for 0004 and 0002.
        // Centaur/VIA and most other manufacturers use the AMD method,
        // except Cyrix who apparently used CPUID2.
        // Therefore, we try the AMD method unless it's an Intel chip.
        // If we still have no info, try the Intel methods.
        datacache[0].size = 0;
        if (max_cpuid<2 || !probablyIntel) {
            if (max_extended_cpuid >= 0x8000_0005) {
                getAMDcacheinfo();
            } else if (isAMD) {
                // According to http://grafi.ii.pw.edu.pl/gbm/x86/cpuid.html,
                // this means CPU before K5 model 1. Probably has a tiny cache.
                datacache[0].size = 8;
                datacache[0].associativity = 2;
                datacache[0].lineSize = 32;
            }
        }
        if ((datacache[0].size == 0) && max_cpuid>=4) {
            getcacheinfoCPUID4();
        }
        if ((datacache[0].size == 0) && max_cpuid>=2) {
            getcacheinfoCPUID2();
        }
        if (datacache[0].size == 0) {
            // Pentium, PMMX, late model 486, or an obscure CPU
            if (mmx) { // Pentium MMX. Also has 8kB code cache.
                datacache[0].size = 16;
                datacache[0].associativity = 4;
                datacache[0].lineSize = 32;
            } else { // Pentium 1 (which also has 8kB code cache)
                     // or 486.
                datacache[0].size = 8;
                datacache[0].associativity = 2;
                datacache[0].lineSize = 32;
            }
        }
        if (hyperThreadingBit) maxThreads = (apic>>>16) & 0xFF;
        else maxThreads = maxCores;
    }

    // Return true if the cpuid instruction is supported.
    // Flips bit 0x0020_0000 of the flags register and reports whether the
    // change was retained.
    // BUG(WONTFIX): Doesn't work for ancient Cyrix processors.
    bool hasCPUID()
    {
        uint flags;
        asm {
            pushfd;
            pop EAX;
            mov flags, EAX;
            xor EAX, 0x0020_0000;
            push EAX;
            popfd;
            pushfd;
            pop EAX;
            xor flags, EAX;
        }
        return (flags & 0x0020_0000) !=0;
    }

} else { // inline asm X86

    // Without x86 inline asm CPUID cannot be executed.
    bool hasCPUID() { return false; }

    // Fallback used when no detection is possible: fixed L1 values.
    void cpuidX86()
    {
        datacache[0].size = 8;
        datacache[0].associativity = 2;
        datacache[0].lineSize = 32;
    }
}

// TODO: Implement this function with OS support
// Fills datacache[] from per-model tables. The model is currently
// hard-coded to PPC603 because it cannot be queried from user mode.
void cpuidPPC()
{
    enum :int { PPC601, PPC603, PPC603E, PPC604,
                PPC604E, PPC620, PPCG3, PPCG4, PPCG5 };

    // TODO:
    // asm { mfpvr } returns the CPU version but unfortunately it can
    // only be used in kernel mode. So OS support is required.
    int cputype = PPC603;

    // 601 has a 8KB combined data & code L1 cache.
    uint sizes[] = [4, 8, 16, 16, 32, 32, 32, 32, 64];
    ubyte ways[] = [8, 2, 4, 4, 4, 8, 8, 8, 8];
    uint L2size[]= [0, 0, 0, 0, 0, 0, 0, 256, 512];
    uint L3size[]= [0, 0, 0, 0, 0, 0, 0, 2048, 0];

    datacache[0].size = sizes[cputype];
    datacache[0].associativity = ways[cputype];
    datacache[0].lineSize = (cputype==PPCG5)? 128 :
        (cputype == PPC620 || cputype == PPCG3)? 64 : 32;
    datacache[1].size = L2size[cputype];
    datacache[2].size = L3size[cputype];
    datacache[1].lineSize = datacache[0].lineSize;
    datacache[2].lineSize = datacache[0].lineSize;
}

// TODO: Implement this function with OS support
// Currently a placeholder that records known cache sizes as comments only.
void cpuidSparc()
{
    // UltaSparcIIi  : L1 = 16,  2way. L2 = 512, 4 way.
    // UltraSparcIII : L1 = 64,  4way. L2= 4096 or 8192.
    // UltraSparcIIIi: L1 = 64,  4way. L2= 1024, 4 way
    // UltraSparcIV  : L1 = 64,  4way. L2 = 16*1024.
    // UltraSparcIV+ : L1 = 64,  4way. L2 = 2048, L3=32*1024.
    // Sparc64V      : L1 = 128, 2way. L2 = 4096 4way.
}

// Module constructor: runs the detection once, then makes sure every
// datacache[] entry holds usable values.
static this()
{
    if (hasCPUID()) {
        cpuidX86();
    } else {
        // it's a 386 or 486, or a Cyrix 6x86.
        //Probably still has an external cache.
    }
    if (datacache[0].size==0) {
        // Guess same as Pentium 1.
        datacache[0].size = 8;
        datacache[0].associativity = 2;
        datacache[0].lineSize = 32;
    }
    numCacheLevels = 1;
    // And now fill up all the unused levels with full memory space.
    for (int i=1; i< datacache.length; ++i) {
        if (datacache[i].size==0) {
            // Set all remaining levels of cache equal to full address space.
            datacache[i].size = uint.max/1024;
            datacache[i].associativity = 1;
            datacache[i].lineSize = datacache[i-1].lineSize;
        } else numCacheLevels = i+1;
    }
}

public:

import std.string : format;

// NOTE(review): the text of toString() is cut off in the middle of its
// final for statement in this copy of the file. The visible part is kept
// exactly as found; the remainder must come from a pristine copy.
/// Returns everything as a printable string
char[] toString()
{
    char[] feats;
    if (mmx) feats ~= "MMX ";
    if (sse) feats ~= "SSE ";
    if (sse2) feats ~= "SSE2 ";
    if (sse3) feats ~= "SSE3 ";
    if (ssse3) feats ~= "SSSE3 ";
    if (amd3dnow) feats ~= "3DNow! ";
    if (amd3dnowExt) feats ~= "3DNow!+ ";
    if (amdMmx) feats ~= "MMX+ ";
    if (isX86_64) feats ~= "X86-64 ";
    if (fxsr) feats ~= "FXSR ";
    if (hyperThreading) feats ~= "HTT";

    char [] cachestr = "Data caches per CPU:\n";
    for (int i=0; i