| Thread overview | ||||||||
|---|---|---|---|---|---|---|---|---|
|
February 16, 2011 std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
While doing some benchmarks on the Clay language I've found something interesting.
I have timed a zip() done on two short arrays, and I have found this D version too much slow compared to normal array code. Higher order functions like zip, map, filter, etc are going to stay, they aren't toys, so I think the D compiler has to digest them better.
(There is lot of of asm at the tail of this post, I don't know if some of t will get cut away.)
Bye,
bearophile
Timings, seconds, best of 3:
#1: 1.95
#2: 61.50
#3: 2.25
#4: 1.52
----------------------
Binary sizes, bytes:
#1: 33_016
#2: 284_188
#3: 103_964
#4: 103_964
----------------------
Compilers and command line used:
DMD 2.051
clay compiler (hg id dec41ba07d58 tip, llvm r122866, Jan 5 2011)
clay -inline test1.clay -o test1.exe
dmd -O -release -inline test2.d
dmd -O -release -inline test3.d
To produce asm code with Clay:
clay -inline -asm test1.clay -o test1.s
Output: -1149672960 for all versions.
----------------------
Code used:
// program #1 (Clay)
main() {
var a = Vector([18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28]);
var b = Vector([9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25]);
var tot = 0; // 32 bit signed int
for (i in range(0, 50*1000*1000))
for (x, y in zipped(a, b))
tot += x + y;
println(tot);
}
----------------------
// program #2 (D2)
import std.c.stdio: printf;
import std.range: zip;
void main() {
auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
int tot = 0;
foreach (i; 0 .. 50_000_000)
foreach (xy; zip(a, b))
tot += xy[0] + xy[1];
printf("%d\n", tot);
}
----------------------
// program #3 (D2)
import std.c.stdio: printf;
void main() {
auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
int tot = 0;
foreach (i; 0 .. 50_000_000)
foreach (j; 0 .. a.length)
tot += a[j] + b[j];
printf("%d\n", tot);
}
----------------------
// program #4 (D2)
import std.c.stdio: printf;
void main() {
auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
int tot = 0;
foreach (i; 0 .. 50_000_000)
foreach (j; 0 .. 30)
tot += a[j] + b[j];
printf("%d\n", tot);
}
----------------------
Just loops program #1:
movl $50000000, %eax # imm = 0x2FAF080
.align 16, 0x90
LBB2_4: # %bb.nph.i.i
# =>This Loop Header: Depth=1
# Child Loop BB2_5 Depth 2
xorl %edx, %edx
.align 16, 0x90
LBB2_5: # %return410.i.i
# Parent Loop BB2_4 Depth=1
# => This Inner Loop Header: Depth=2
addl (%esi,%edx,4), %ecx
addl (%edi,%edx,4), %ecx
incl %edx
cmpl $30, %edx
jne LBB2_5
# BB#3: # %return272.loopexit.i.i
# in Loop: Header=BB2_4 Depth=1
decl %eax
jne LBB2_4
----------------------
Just loops program #2:
LCE: push dword ptr 018h[ESP]
mov ESI,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip6__initZ
push dword ptr 018h[ESP]
push dword ptr 028h[ESP]
push dword ptr 028h[ESP]
push 0
lea EDI,078h[ESP]
movsd
movsd
movsd
movsd
movsd
lea EAX,078h[ESP]
call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip
mov ESI,EAX
lea EDI,044h[ESP]
movsd
movsd
movsd
movsd
movsd
lea ESI,044h[ESP]
lea EDI,024h[ESP]
movsd
movsd
movsd
movsd
movsd
lea EAX,024h[ESP]
call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
xor AL,1
je L153
L11C: lea EAX,024h[ESP]
call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
mov 07Ch[ESP],EAX
mov ECX,07Ch[ESP]
lea EAX,024h[ESP]
mov 080h[ESP],EDX
add ECX,080h[ESP]
add EBX,ECX
call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv
lea EAX,024h[ESP]
call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
xor AL,1
jne L11C
L153: inc EBP
cmp EBP,02FAF080h
jb LCE
_D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip comdat
push EBX
mov ECX,8[ESP]
mov EBX,014h[ESP]
push ESI
mov ESI,EAX
mov EDX,01Ch[ESP]
mov 4[ESI],EDX
mov EDX,014h[ESP]
mov [ESI],EBX
mov EBX,010h[ESP]
mov 010h[ESI],ECX
mov 8[ESI],EBX
mov 0Ch[ESI],EDX
pop ESI
pop EBX
ret 014h
_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
L0: sub ESP,05Ch
mov ECX,010h[EAX]
test ECX,ECX
push EBX
push ESI
mov ESI,EAX
je L21
cmp ECX,1
je LC3
cmp ECX,2
je L55
jmp near ptr LC3
L21: mov EBX,[ESI]
mov EDX,4[ESI]
mov 0Ch[ESP],EBX
mov 010h[ESP],EDX
cmp dword ptr 0Ch[ESP],0
je L4A
mov EBX,8[ESI]
mov EDX,0Ch[ESI]
mov 014h[ESP],EBX
mov 018h[ESP],EDX
cmp dword ptr 014h[ESP],0
jne LC3
L4A: pop ESI
mov EAX,1
pop EBX
add ESP,05Ch
ret
L55: mov EBX,[ESI]
mov EDX,4[ESI]
mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
mov 01Ch[ESP],EBX
mov 020h[ESP],EDX
mov 02Ch[ESP],EAX
mov 030h[ESP],ECX
cmp dword ptr 01Ch[ESP],0
setz DL
mov EBX,8[ESI]
mov 024h[ESP],EBX
mov ECX,0Ch[ESI]
mov 028h[ESP],ECX
cmp dword ptr 024h[ESP],0
setz CL
cmp DL,CL
mov EDX,1
je L98
xor EDX,EDX
L98: xor DL,1
je LC3
push dword ptr FLAT:_DATA[054h]
push dword ptr FLAT:_DATA[050h]
push 0ADEh
mov EAX,038h[ESP]
mov EDX,03Ch[ESP]
mov EBX,038h[ESP]
call EDX
push EDX
push EAX
call near ptr _D3std9exception7bailOutFAyaixAaZv
LC3: pop ESI
xor EAX,EAX
pop EBX
add ESP,05Ch
ret
_D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple comdat
assume CS:_D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
L0: sub ESP,028h
mov EDX,4[EAX]
push EBX
push ESI
mov ESI,EAX
mov EBX,[ESI]
push EDI
mov 014h[ESP],EBX
mov 018h[ESP],EDX
cmp dword ptr 014h[ESP],0
je L31
lea EDX,0Ch[ESP]
mov EBX,[ESI]
push EDX
mov EDX,4[ESI]
mov EAX,[EDX]
push 4
call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
jmp short L41
L31: lea ECX,0Ch[ESP]
mov EDI,4
push ECX
push EDI
call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
L41: mov EBX,8[ESI]
mov ECX,0Ch[ESI]
test EBX,EBX
je L61
lea EDI,010h[ESP]
mov EDX,0Ch[ESI]
mov EAX,8[ESI]
push EDI
mov EAX,[EDX]
push 4
call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
jmp short L71
L61: lea ECX,010h[ESP]
mov ESI,4
push ECX
push ESI
call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
L71: mov EDX,010h[ESP]
mov EAX,0Ch[ESP]
pop EDI
pop ESI
pop EBX
add ESP,028h
ret
_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv comdat
L0: sub ESP,04Ch
mov ECX,010h[EAX]
test ECX,ECX
push EBX
push EBP
push ESI
push EDI
mov EDI,EAX
je L1F
cmp ECX,1
je L4A
cmp ECX,2
je L8C
jmp near ptr L142
L1F: mov ESI,[EDI]
mov EDX,4[EDI]
dec ESI
mov EBX,[EDI]
add EDX,4
lea EBP,8[EDI]
mov [EDI],ESI
mov 4[EDI],EDX
mov ESI,0[EBP]
mov EDX,4[EBP]
dec ESI
mov EBX,0[EBP]
add EDX,4
mov 0[EBP],ESI
mov 4[EBP],EDX
jmp near ptr L142
L4A: mov ESI,[EDI]
mov ECX,4[EDI]
test ESI,ESI
je L63
mov ESI,[EDI]
mov EDX,4[EDI]
dec ESI
mov EBX,[EDI]
add EDX,4
mov [EDI],ESI
mov 4[EDI],EDX
L63: mov ESI,8[EDI]
mov ECX,0Ch[EDI]
test ESI,ESI
je L142
lea EBP,8[EDI]
mov ESI,0[EBP]
mov EDX,4[EBP]
dec ESI
mov EBX,0[EBP]
add EDX,4
mov 0[EBP],ESI
mov 4[EBP],EDX
jmp near ptr L142
L8C: mov EBX,[EDI]
mov EDX,4[EDI]
mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral831MFZAxa
mov 014h[ESP],EBX
mov 018h[ESP],EDX
mov 01Ch[ESP],EAX
mov 020h[ESP],ECX
cmp dword ptr 014h[ESP],0
setnz DL
xor DL,1
je LD7
mov EDX,ECX
push dword ptr FLAT:_DATA[054h]
push dword ptr FLAT:_DATA[050h]
push 0B86h
mov EAX,028h[ESP]
mov EBX,028h[ESP]
call EDX
push EDX
push EAX
call near ptr _D3std9exception7bailOutFAyaixAaZv
LD7: mov EAX,[EDI]
mov EDX,4[EDI]
dec EAX
mov EBX,[EDI]
add EDX,4
mov 4[EDI],EDX
mov EDX,0Ch[EDI]
mov EBX,EDI
mov [EDI],EAX
mov EAX,8[EDI]
mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral832MFZAxa
mov 024h[ESP],EAX
mov 028h[ESP],EDX
mov 02Ch[ESP],EBX
mov 030h[ESP],ECX
cmp dword ptr 024h[ESP],0
setnz DL
xor DL,1
je L12F
push dword ptr FLAT:_DATA[054h]
push dword ptr FLAT:_DATA[050h]
push 0B86h
mov EAX,038h[ESP]
call ECX
push EDX
push EAX
call near ptr _D3std9exception7bailOutFAyaixAaZv
L12F: lea ESI,8[EDI]
mov EBX,[ESI]
mov EDX,4[ESI]
dec EBX
mov EAX,[ESI]
mov [ESI],EBX
add EDX,4
mov 4[ESI],EDX
L142: pop EDI
pop ESI
pop EBP
pop EBX
add ESP,04Ch
ret
_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
L0: sub ESP,05Ch
mov ECX,010h[EAX]
test ECX,ECX
push EBX
push ESI
mov ESI,EAX
je L21
cmp ECX,1
je LC3
cmp ECX,2
je L55
jmp near ptr LC3
L21: mov EBX,[ESI]
mov EDX,4[ESI]
mov 0Ch[ESP],EBX
mov 010h[ESP],EDX
cmp dword ptr 0Ch[ESP],0
je L4A
mov EBX,8[ESI]
mov EDX,0Ch[ESI]
mov 014h[ESP],EBX
mov 018h[ESP],EDX
cmp dword ptr 014h[ESP],0
jne LC3
L4A: pop ESI
mov EAX,1
pop EBX
add ESP,05Ch
ret
L55: mov EBX,[ESI]
mov EDX,4[ESI]
mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
mov 01Ch[ESP],EBX
mov 020h[ESP],EDX
mov 02Ch[ESP],EAX
mov 030h[ESP],ECX
cmp dword ptr 01Ch[ESP],0
setz DL
mov EBX,8[ESI]
mov 024h[ESP],EBX
mov ECX,0Ch[ESI]
mov 028h[ESP],ECX
cmp dword ptr 024h[ESP],0
setz CL
cmp DL,CL
mov EDX,1
je L98
xor EDX,EDX
L98: xor DL,1
je LC3
push dword ptr FLAT:_DATA[054h]
push dword ptr FLAT:_DATA[050h]
push 0ADEh
mov EAX,038h[ESP]
mov EDX,03Ch[ESP]
mov EBX,038h[ESP]
call EDX
push EDX
push EAX
call near ptr _D3std9exception7bailOutFAyaixAaZv
LC3: pop ESI
xor EAX,EAX
pop EBX
add ESP,05Ch
ret
_D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi comdat
L0: push EAX
mov EDX,offset FLAT:_D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi14__dgliteral829MFZC6object9Throwable
push EBX
push ESI
cmp dword ptr 010h[ESP],4
sbb ECX,ECX
inc ECX
push ECX
lea EBX,0Ch[ESP]
push EDX
push EBX
call near ptr _D3std9exception14__T7enforceTbZ7enforceFbLC6object9ThrowableZb
mov ECX,014h[ESP]
mov EAX,014h[ESP]
mov ESI,8[ESP]
mov [ECX],ESI
pop ESI
pop EBX
pop ECX
ret 8
(I have not included all the code, like _D3std9exception7bailOutFAyaixAaZv).
----------------------
Just loops program #3:
LDE: xor EBX,EBX
cmp 010h[ESP],BL
je LF5
LE6: mov ECX,[EBX*4][EAX]
add ECX,[EBX*4][EDI]
add ESI,ECX
inc EBX
cmp EBX,014h[ESP]
jb LE6
LF5: inc EDX
cmp EDX,02FAF080h
jb LDE
----------------------
Just loops program #4:
LBF: xor EBX,EBX
LC1: mov ECX,[EBX*4][EAX]
add ECX,[EBX*4][EDI]
add ESI,ECX
inc EBX
cmp EBX,01Eh
jb LC1
inc EDX
cmp EDX,02FAF080h
jb LBF
----------------------
| ||||
February 16, 2011 Re: std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | I have done one more test, with a much simpler zip(). The code is now faster, but it seems there's no hope to have an acceptable performance. Maybe D needs a built-in zip as Clay.
Bye,
bearophile
-----------------------
Timings, seconds, best of 3:
#1: 1.95
#2: 61.50
#3: 2.25
#4: 1.52
#5: 15.50
Note: removing the this() the program #5 gets about 2-3 seconds faster.
-----------------------
// program #5 (D2)
import std.c.stdio: printf;
struct zip {
int[] arr1, arr2;
size_t i;
static struct Element { int _0, _1; }
this(int[] a1, int[] a2)
in {
assert(a1.length == a2.length);
} body {
arr1 = a1;
arr2 = a2;
}
bool empty() {
return i >= arr1.length;
}
@property Element front() {
return Element(arr1[i], arr2[i]);
}
void popFront() {
i++;
}
}
void main() {
auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
int tot = 0;
foreach (i; 0 .. 50_000_000)
foreach (xy; zip(a, b))
tot += xy._0 + xy._1;
printf("%d\n", tot);
}
----------------------
Just loops program #5:
LCE: mov 024h[ESP],EBX
lea EBX,054h[ESP]
xor EDX,EDX
mov [EBX],EDX
mov EAX,014h[ESP]
lea ESI,054h[ESP]
mov 4[EBX],EDX
lea EDI,034h[ESP]
mov 8[EBX],EDX
mov 0Ch[EBX],EDX
mov 010h[EBX],EDX
mov EDX,018h[ESP]
mov 058h[ESP],EDX
mov EDX,020h[ESP]
mov 054h[ESP],EAX
mov EAX,01Ch[ESP]
mov 05Ch[ESP],EAX
mov 060h[ESP],EDX
movsd
movsd
movsd
movsd
movsd
mov ECX,044h[ESP]
cmp ECX,034h[ESP]
jae L167
L11D: mov EBX,044h[ESP]
mov EDX,038h[ESP]
mov ESI,[EBX*4][EDX]
mov EAX,034h[ESP]
mov EDX,040h[ESP]
mov ECX,[EBX*4][EDX]
mov 078h[ESP],ECX
mov EAX,03Ch[ESP]
mov EDX,078h[ESP]
mov 074h[ESP],ESI
mov EAX,074h[ESP]
mov EBX,074h[ESP]
mov 070h[ESP],EDX
add EBX,070h[ESP]
add EBP,EBX
inc dword ptr 044h[ESP]
mov ESI,044h[ESP]
mov 06Ch[ESP],EAX
cmp ESI,034h[ESP]
jb L11D
L167: mov EBX,024h[ESP]
inc EBX
cmp EBX,02FAF080h
jb LCE
----------------------
| |||
February 16, 2011 Re: std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | Initial: 58 seconds.
Eliminated the switch in popFront: 53s.
Replaced emplace with assignment: 23s.
Specialized emplace for non-struct types, reinserted: 23s.
Eliminated the loop in empty (replaced with return ranges[0].empty;): 17s.
I'm sure there are ways to further improve this, but there are a few difficulties. Each pass through the loop the code must transport values from the two arrays into a specific format and then distribute them for further calculation. Then, upon each popFront two words must be touched because arrays hold pointer+length, not pointer+pointer (as probably would be better since ranges are around).
Nice analysis!
Andrei
On 2/15/11 6:24 PM, bearophile wrote:
> While doing some benchmarks on the Clay language I've found something interesting.
>
> I have timed a zip() done on two short arrays, and I have found this D version too much slow compared to normal array code. Higher order functions like zip, map, filter, etc are going to stay, they aren't toys, so I think the D compiler has to digest them better.
>
> (There is lot of of asm at the tail of this post, I don't know if some of t will get cut away.)
>
> Bye,
> bearophile
>
>
> Timings, seconds, best of 3:
> #1: 1.95
> #2: 61.50
> #3: 2.25
> #4: 1.52
>
> ----------------------
>
> Binary sizes, bytes:
> #1: 33_016
> #2: 284_188
> #3: 103_964
> #4: 103_964
>
> ----------------------
>
> Compilers and command line used:
>
> DMD 2.051
> clay compiler (hg id dec41ba07d58 tip, llvm r122866, Jan 5 2011)
>
> clay -inline test1.clay -o test1.exe
> dmd -O -release -inline test2.d
> dmd -O -release -inline test3.d
>
> To produce asm code with Clay:
> clay -inline -asm test1.clay -o test1.s
>
> Output: -1149672960 for all versions.
>
> ----------------------
>
> Code used:
>
>
> // program #1 (Clay)
> main() {
> var a = Vector([18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28]);
> var b = Vector([9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25]);
> var tot = 0; // 32 bit signed int
> for (i in range(0, 50*1000*1000))
> for (x, y in zipped(a, b))
> tot += x + y;
> println(tot);
> }
>
> ----------------------
>
> // program #2 (D2)
> import std.c.stdio: printf;
> import std.range: zip;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (xy; zip(a, b))
> tot += xy[0] + xy[1];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #3 (D2)
> import std.c.stdio: printf;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (j; 0 .. a.length)
> tot += a[j] + b[j];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> // program #4 (D2)
> import std.c.stdio: printf;
> void main() {
> auto a = [18,10,17,14,19,23,11,0,6,0,17,25,5,4,19,21,17,13,5,7,11,22,23,17,24,7,11,11,1,28];
> auto b = [9,12,1,4,1,18,11,6,5,18,24,15,26,14,24,8,17,26,23,17,3,28,27,0,9,27,0,19,13,25];
> int tot = 0;
> foreach (i; 0 .. 50_000_000)
> foreach (j; 0 .. 30)
> tot += a[j] + b[j];
> printf("%d\n", tot);
> }
>
> ----------------------
>
> Just loops program #1:
>
> movl $50000000, %eax # imm = 0x2FAF080
> .align 16, 0x90
> LBB2_4: # %bb.nph.i.i
> # =>This Loop Header: Depth=1
> # Child Loop BB2_5 Depth 2
> xorl %edx, %edx
> .align 16, 0x90
> LBB2_5: # %return410.i.i
> # Parent Loop BB2_4 Depth=1
> # => This Inner Loop Header: Depth=2
> addl (%esi,%edx,4), %ecx
> addl (%edi,%edx,4), %ecx
> incl %edx
> cmpl $30, %edx
> jne LBB2_5
> # BB#3: # %return272.loopexit.i.i
> # in Loop: Header=BB2_4 Depth=1
> decl %eax
> jne LBB2_4
>
> ----------------------
>
> Just loops program #2:
>
> LCE: push dword ptr 018h[ESP]
> mov ESI,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip6__initZ
> push dword ptr 018h[ESP]
> push dword ptr 028h[ESP]
> push dword ptr 028h[ESP]
> push 0
> lea EDI,078h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea EAX,078h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip
> mov ESI,EAX
> lea EDI,044h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea ESI,044h[ESP]
> lea EDI,024h[ESP]
> movsd
> movsd
> movsd
> movsd
> movsd
> lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
> xor AL,1
> je L153
> L11C: lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
> mov 07Ch[ESP],EAX
> mov ECX,07Ch[ESP]
> lea EAX,024h[ESP]
> mov 080h[ESP],EDX
> add ECX,080h[ESP]
> add EBX,ECX
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv
> lea EAX,024h[ESP]
> call near ptr _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb
> xor AL,1
> jne L11C
> L153: inc EBP
> cmp EBP,02FAF080h
> jb LCE
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip6__ctorMFNcAiAiE3std5range14StoppingPolicyZS3std5range14__T3ZipTAiTAiZ3Zip comdat
> push EBX
> mov ECX,8[ESP]
> mov EBX,014h[ESP]
> push ESI
> mov ESI,EAX
> mov EDX,01Ch[ESP]
> mov 4[ESI],EDX
> mov EDX,014h[ESP]
> mov [ESI],EBX
> mov EBX,010h[ESP]
> mov 010h[ESI],ECX
> mov 8[ESI],EBX
> mov 0Ch[ESI],EDX
> pop ESI
> pop EBX
> ret 014h
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
> L0: sub ESP,05Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push ESI
> mov ESI,EAX
> je L21
> cmp ECX,1
> je LC3
> cmp ECX,2
> je L55
> jmp near ptr LC3
> L21: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov 0Ch[ESP],EBX
> mov 010h[ESP],EDX
> cmp dword ptr 0Ch[ESP],0
> je L4A
> mov EBX,8[ESI]
> mov EDX,0Ch[ESI]
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> jne LC3
> L4A: pop ESI
> mov EAX,1
> pop EBX
> add ESP,05Ch
> ret
> L55: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
> mov 01Ch[ESP],EBX
> mov 020h[ESP],EDX
> mov 02Ch[ESP],EAX
> mov 030h[ESP],ECX
> cmp dword ptr 01Ch[ESP],0
> setz DL
> mov EBX,8[ESI]
> mov 024h[ESP],EBX
> mov ECX,0Ch[ESI]
> mov 028h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setz CL
> cmp DL,CL
> mov EDX,1
> je L98
> xor EDX,EDX
> L98: xor DL,1
> je LC3
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0ADEh
> mov EAX,038h[ESP]
> mov EDX,03Ch[ESP]
> mov EBX,038h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3: pop ESI
> xor EAX,EAX
> pop EBX
> add ESP,05Ch
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple comdat
> assume CS:_D3std5range14__T3ZipTAiTAiZ3Zip5frontMFNdZS3std8typecons14__T5TupleTiTiZ5Tuple
> L0: sub ESP,028h
> mov EDX,4[EAX]
> push EBX
> push ESI
> mov ESI,EAX
> mov EBX,[ESI]
> push EDI
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> je L31
> lea EDX,0Ch[ESP]
> mov EBX,[ESI]
> push EDX
> mov EDX,4[ESI]
> mov EAX,[EDX]
> push 4
> call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
> jmp short L41
> L31: lea ECX,0Ch[ESP]
> mov EDI,4
> push ECX
> push EDI
> call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L41: mov EBX,8[ESI]
> mov ECX,0Ch[ESI]
> test EBX,EBX
> je L61
> lea EDI,010h[ESP]
> mov EDX,0Ch[ESI]
> mov EAX,8[ESI]
> push EDI
> mov EAX,[EDX]
> push 4
> call near ptr _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi
> jmp short L71
> L61: lea ECX,010h[ESP]
> mov ESI,4
> push ECX
> push ESI
> call near ptr _D3std4conv14__T7emplaceTiZ7emplaceFAvZPi
> L71: mov EDX,010h[ESP]
> mov EAX,0Ch[ESP]
> pop EDI
> pop ESI
> pop EBX
> add ESP,028h
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv comdat
> L0: sub ESP,04Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push EBP
> push ESI
> push EDI
> mov EDI,EAX
> je L1F
> cmp ECX,1
> je L4A
> cmp ECX,2
> je L8C
> jmp near ptr L142
> L1F: mov ESI,[EDI]
> mov EDX,4[EDI]
> dec ESI
> mov EBX,[EDI]
> add EDX,4
> lea EBP,8[EDI]
> mov [EDI],ESI
> mov 4[EDI],EDX
> mov ESI,0[EBP]
> mov EDX,4[EBP]
> dec ESI
> mov EBX,0[EBP]
> add EDX,4
> mov 0[EBP],ESI
> mov 4[EBP],EDX
> jmp near ptr L142
> L4A: mov ESI,[EDI]
> mov ECX,4[EDI]
> test ESI,ESI
> je L63
> mov ESI,[EDI]
> mov EDX,4[EDI]
> dec ESI
> mov EBX,[EDI]
> add EDX,4
> mov [EDI],ESI
> mov 4[EDI],EDX
> L63: mov ESI,8[EDI]
> mov ECX,0Ch[EDI]
> test ESI,ESI
> je L142
> lea EBP,8[EDI]
> mov ESI,0[EBP]
> mov EDX,4[EBP]
> dec ESI
> mov EBX,0[EBP]
> add EDX,4
> mov 0[EBP],ESI
> mov 4[EBP],EDX
> jmp near ptr L142
> L8C: mov EBX,[EDI]
> mov EDX,4[EDI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral831MFZAxa
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> mov 01Ch[ESP],EAX
> mov 020h[ESP],ECX
> cmp dword ptr 014h[ESP],0
> setnz DL
> xor DL,1
> je LD7
> mov EDX,ECX
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0B86h
> mov EAX,028h[ESP]
> mov EBX,028h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LD7: mov EAX,[EDI]
> mov EDX,4[EDI]
> dec EAX
> mov EBX,[EDI]
> add EDX,4
> mov 4[EDI],EDX
> mov EDX,0Ch[EDI]
> mov EBX,EDI
> mov [EDI],EAX
> mov EAX,8[EDI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip8popFrontMFZv14__dgliteral832MFZAxa
> mov 024h[ESP],EAX
> mov 028h[ESP],EDX
> mov 02Ch[ESP],EBX
> mov 030h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setnz DL
> xor DL,1
> je L12F
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0B86h
> mov EAX,038h[ESP]
> call ECX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> L12F: lea ESI,8[EDI]
> mov EBX,[ESI]
> mov EDX,4[ESI]
> dec EBX
> mov EAX,[ESI]
> mov [ESI],EBX
> add EDX,4
> mov 4[ESI],EDX
> L142: pop EDI
> pop ESI
> pop EBP
> pop EBX
> add ESP,04Ch
> ret
>
>
> _D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb comdat
> L0: sub ESP,05Ch
> mov ECX,010h[EAX]
> test ECX,ECX
> push EBX
> push ESI
> mov ESI,EAX
> je L21
> cmp ECX,1
> je LC3
> cmp ECX,2
> je L55
> jmp near ptr LC3
> L21: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov 0Ch[ESP],EBX
> mov 010h[ESP],EDX
> cmp dword ptr 0Ch[ESP],0
> je L4A
> mov EBX,8[ESI]
> mov EDX,0Ch[ESI]
> mov 014h[ESP],EBX
> mov 018h[ESP],EDX
> cmp dword ptr 014h[ESP],0
> jne LC3
> L4A: pop ESI
> mov EAX,1
> pop EBX
> add ESP,05Ch
> ret
> L55: mov EBX,[ESI]
> mov EDX,4[ESI]
> mov ECX,offset FLAT:_D3std5range14__T3ZipTAiTAiZ3Zip5emptyMFZb14__dgliteral828MFZAxa
> mov 01Ch[ESP],EBX
> mov 020h[ESP],EDX
> mov 02Ch[ESP],EAX
> mov 030h[ESP],ECX
> cmp dword ptr 01Ch[ESP],0
> setz DL
> mov EBX,8[ESI]
> mov 024h[ESP],EBX
> mov ECX,0Ch[ESI]
> mov 028h[ESP],ECX
> cmp dword ptr 024h[ESP],0
> setz CL
> cmp DL,CL
> mov EDX,1
> je L98
> xor EDX,EDX
> L98: xor DL,1
> je LC3
> push dword ptr FLAT:_DATA[054h]
> push dword ptr FLAT:_DATA[050h]
> push 0ADEh
> mov EAX,038h[ESP]
> mov EDX,03Ch[ESP]
> mov EBX,038h[ESP]
> call EDX
> push EDX
> push EAX
> call near ptr _D3std9exception7bailOutFAyaixAaZv
> LC3: pop ESI
> xor EAX,EAX
> pop EBX
> add ESP,05Ch
> ret
>
>
> _D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi comdat
> L0: push EAX
> mov EDX,offset FLAT:_D3std4conv16__T7emplaceTiTiZ7emplaceFAviZPi14__dgliteral829MFZC6object9Throwable
> push EBX
> push ESI
> cmp dword ptr 010h[ESP],4
> sbb ECX,ECX
> inc ECX
> push ECX
> lea EBX,0Ch[ESP]
> push EDX
> push EBX
> call near ptr _D3std9exception14__T7enforceTbZ7enforceFbLC6object9ThrowableZb
> mov ECX,014h[ESP]
> mov EAX,014h[ESP]
> mov ESI,8[ESP]
> mov [ECX],ESI
> pop ESI
> pop EBX
> pop ECX
> ret 8
>
>
> (I have not included all the code, like _D3std9exception7bailOutFAyaixAaZv).
>
> ----------------------
>
> Just loops program #3:
>
> LDE: xor EBX,EBX
> cmp 010h[ESP],BL
> je LF5
> LE6: mov ECX,[EBX*4][EAX]
> add ECX,[EBX*4][EDI]
> add ESI,ECX
> inc EBX
> cmp EBX,014h[ESP]
> jb LE6
> LF5: inc EDX
> cmp EDX,02FAF080h
> jb LDE
>
> ----------------------
>
> Just loops program #4:
>
> LBF: xor EBX,EBX
> LC1: mov ECX,[EBX*4][EAX]
> add ECX,[EBX*4][EDI]
> add ESI,ECX
> inc EBX
> cmp EBX,01Eh
> jb LC1
> inc EDX
> cmp EDX,02FAF080h
> jb LBF
>
> ----------------------
| |||
February 16, 2011 Re: std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Andrei Alexandrescu | On 02/16/2011 03:36 AM, Andrei Alexandrescu wrote:
> Initial: 58 seconds.
>
> Eliminated the switch in popFront: 53s.
>
> Replaced emplace with assignment: 23s.
>
> Specialized emplace for non-struct types, reinserted: 23s.
>
> Eliminated the loop in empty (replaced with return ranges[0].empty;): 17s.
>
> I'm sure there are ways to further improve this, but there are a few
> difficulties. Each pass through the loop the code must transport values from
> the two arrays into a specific format and then distribute them for further
> calculation. Then, upon each popFront two words must be touched because arrays
> hold pointer+length, not pointer+pointer (as probably would be better since
> ranges are around).
>
>
> Nice analysis!
Bearophile, is clay's zip func lazy. Else, it could explain part of its performance, don't you think?
Denis
| |||
February 16, 2011 Re: std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to spir | Am 16.02.2011 10:25, schrieb spir:
> On 02/16/2011 03:36 AM, Andrei Alexandrescu wrote:
>> Initial: 58 seconds.
>>
>> Eliminated the switch in popFront: 53s.
>>
>> Replaced emplace with assignment: 23s.
>>
>> Specialized emplace for non-struct types, reinserted: 23s.
>>
>> Eliminated the loop in empty (replaced with return ranges[0].empty;): 17s.
>>
>> I'm sure there are ways to further improve this, but there are a few
>> difficulties. Each pass through the loop the code must transport values from
>> the two arrays into a specific format and then distribute them for further
>> calculation. Then, upon each popFront two words must be touched because arrays
>> hold pointer+length, not pointer+pointer (as probably would be better since
>> ranges are around).
>>
>>
>> Nice analysis!
>
> Bearophile, is clay's zip func lazy. Else, it could explain part of its
> performance, don't you think?
>
> Denis
why should it - what can lazyness help here?
| |||
February 16, 2011 Re: std.range.zip performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Andrei Alexandrescu | Andrei: > I'm sure there are ways to further improve this, but there are a few difficulties. Each pass through the loop the code must transport values from the two arrays into a specific format and then distribute them for further calculation. Then, upon each popFront two words must be touched because arrays hold pointer+length, not pointer+pointer (as probably would be better since ranges are around).< HOFs as zip/map/filter aren't D built-ins as in Python, but they are basic language constructs. If the D front-end becomes a bit aware of what a zip is, it probably becomes able to optimize away most or all those data movements. The performance difference between the #3, #4, #5 and #5b (without zip constructor) shows there's more space for improvements, optimization-wise. In the Haskell Prelude (a standard module imported and compiled before any user code) shows the implementation of zip(l1,l2) and zip3(l1,l2,l3): zip :: [a] -> [b] -> [(a,b)] zip = zipWith (,) zip3 :: [a] -> [b] -> [c] -> [(a,b,c)] zip3 = zipWith3 (,,) zipWith :: (a->b->c) -> [a]->[b]->[c] zipWith z (a:as) (b:bs) = z a b : zipWith z as bs zipWith _ _ _ = [] zipWith3 :: (a->b->c->d) -> [a]->[b]->[c]->[d] zipWith3 z (a:as) (b:bs) (c:cs) = z a b c : zipWith3 z as bs cs zipWith3 _ _ _ _ = [] They are tiny compared to Phobos code. They are efficient enough, and they have no corner cases. ---------------------- spir: >is clay's zip func lazy.< It seems lazy, it's not an array of records, and the printing function refuses to print it. Bye, bearophile | |||
Copyright © 1999-2021 by the D Language Foundation
Permalink
Reply