| Thread overview | |||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
November 27, 2010 String compare performance | ||||
|---|---|---|---|---|
| ||||
While translating a common Python script to D, I have found something interesting, so I have reduced it to a little benchmark.
Below there is the reduced Python2 code (it uses Psyco), and a little program to generate some test data. The timing of the first D2 version is not good compared to the Python-Psyco program (the generation of the *300 array is a quick thing), so I have created two more D2 versions to show myself that D2 wasn't broken :-)
The reduced code looks like a syntetic benchmark, but it has about the same performance profile of a 60 lines long Python script (the original code was using xrange(0,len(...)-3,3) instead of xrange(len(...)-3), but the situation doesn't change much).
Timings, dmd compiler, best of 4, seconds:
D #1: 5.72
Psy: 1.59
D #2: 0.55
D #3: 0.34
2.3 GHz CPU.
dmd 2.050, ldc based on DMD v1.057 and llvm 2.6, gdc head based on GCC 4.4.5.
--------------------------
# Python2 version
import psyco
def test(data):
count = 0
for i in xrange(len(data) - 3):
codon = data[i : i + 3]
if codon == "TAG" or codon == "TGA" or codon == "TAA":
count += 1
return count
def main():
data = open("data.txt").read()
data = data * 300
print test(data)
psyco.full()
main()
--------------------------
To produce some test data:
# Python2 code
from itertools import islice
def rnd(ia = 3877, ic = 29573, im = 139968):
seed = 42
imf = float(im)
while 1:
seed = (seed * ia + ic) % im
r = seed / imf
yield "ACGT"[int(r * 4)]
def main():
data = "".join(islice(rnd(), 0, 120000))
file("data.txt", "w").write(data)
main()
--------------------------
A D2 traslation (I have used printf to reduce a lot the asm produced):
// D2 version #1
import std.file: read;
import std.c.stdio: printf;
int test(char[] data) {
int count;
foreach (i; 0 .. data.length - 3) {
char[] codon = data[i .. i + 3];
if (codon == "TAG" || codon == "TGA" || codon == "TAA")
count++;
}
return count;
}
void main() {
char[] data0 = cast(char[])read("data.txt");
int n = 300;
char[] data = new char[data0.length * n];
for (size_t pos; pos < data.length; pos += data0.length)
data[pos .. pos+data0.length] = data0;
printf("%d\n", test(data));
}
--------------------------
// D2 version #2
import std.file: read;
import std.c.stdio: printf;
bool cmp3(char[] s1, string s2) {
return s1[0] == s2[0] && s1[1] == s2[1] && s1[2] == s2[2];
}
int test(char[] data) {
int count;
foreach (i; 0 .. data.length - 3) {
char[] codon = data[i .. i + 3];
if (cmp3(codon, "TAG") || cmp3(codon, "TGA") || cmp3(codon, "TAA"))
count++;
}
return count;
}
void main() {
char[] data0 = cast(char[])read("data.txt");
int n = 300;
char[] data = new char[data0.length * n];
for (size_t pos; pos < data.length; pos += data0.length)
data[pos .. pos+data0.length] = data0;
printf("%d\n", test(data));
}
--------------------------
// D2 version #3
import std.file: read;
import std.c.stdio: printf;
int test(char[] data) {
int count;
foreach (i; 0 .. data.length - 3) {
if (data[i] == 'T') {
if (data[i+1] == 'A') {
if (data[i+2] == 'G') {
count++;
} else if (data[i+2] == 'A') {
count++;
}
} else if (data[i+1] == 'G') {
if (data[i+2] == 'A')
count++;
}
}
}
return count;
}
void main() {
char[] data0 = cast(char[])read("data.txt");
int n = 300;
char[] data = new char[data0.length * n];
for (size_t pos; pos < data.length; pos += data0.length)
data[pos .. pos+data0.length] = data0;
printf("%d\n", test(data));
}
--------------------------
dmd -O -release -inline test1.d
_D5test14testFAaZi comdat
L0: sub ESP,024h
push EBX
xor EBX,EBX
push EBP
mov EBP,030h[ESP]
push ESI
xor ESI,ESI
add EBP,0FFFFFFFDh
push EDI
je LA8
mov EDX,03Ch[ESP]
mov EAX,038h[ESP]
mov EDI,EDX
L22: mov ECX,offset FLAT:_D11TypeInfo_Aa6__initZ
lea EAX,3[EBX]
sub EAX,EBX
push ECX
lea EDX,[EDI][EBX]
push dword ptr FLAT:_DATA[0Ch]
push dword ptr FLAT:_DATA[08h]
mov 020h[ESP],EAX
mov 024h[ESP],EDX
push EDX
push EAX
call near ptr __adEq2
add ESP,014h
test EAX,EAX
jne L9E
mov ECX,offset FLAT:_D11TypeInfo_Aa6__initZ
push ECX
push dword ptr FLAT:_DATA[01Ch]
push dword ptr FLAT:_DATA[018h]
push dword ptr 024h[ESP]
push dword ptr 024h[ESP]
call near ptr __adEq2
add ESP,014h
test EAX,EAX
jne L9E
mov EDX,offset FLAT:_D11TypeInfo_Aa6__initZ
push EDX
push dword ptr FLAT:_DATA[02Ch]
push dword ptr FLAT:_DATA[028h]
push dword ptr 024h[ESP]
push dword ptr 024h[ESP]
call near ptr __adEq2
add ESP,014h
test EAX,EAX
je L9F
L9E: inc ESI
L9F: inc EBX
cmp EBX,EBP
jb L22
LA8: pop EDI
mov EAX,ESI
pop ESI
pop EBP
pop EBX
add ESP,024h
ret 8
--------------------------
dmd -O -release -inline test2.d
_D5test24testFAaZi comdat
sub ESP,03Ch
mov ECX,040h[ESP]
push EBX
push EBP
push ESI
xor ESI,ESI
push EDI
xor EDI,EDI
add ECX,0FFFFFFFDh
mov 01Ch[ESP],ECX
je LC5
mov EDX,054h[ESP]
mov EAX,050h[ESP]
mov EBP,EDX
L26: lea EBX,3[ESI]
sub EBX,ESI
lea ECX,[EBP][ESI]
mov 028h[ESP],ECX
mov DL,[ECX]
mov ECX,FLAT:_DATA[0Ch]
mov 024h[ESP],EBX
mov EAX,FLAT:_DATA[08h]
cmp DL,[ECX]
mov 010h[ESP],ECX
jne L65
mov EDX,028h[ESP]
mov EAX,EBX
mov EBX,010h[ESP]
mov CL,1[EDX]
cmp CL,1[EBX]
jne L65
mov DL,2[EDX]
cmp DL,2[EBX]
je LB9
L65: mov EDX,028h[ESP]
mov CL,[EDX]
mov 018h[ESP],EDX
mov EAX,024h[ESP]
mov EDX,FLAT:_DATA[01Ch]
cmp CL,[EDX]
mov EAX,FLAT:_DATA[018h]
jne L96
mov EBX,018h[ESP]
mov AL,1[EBX]
cmp AL,1[EDX]
jne L96
mov AL,2[EBX]
cmp AL,2[EDX]
je LB9
L96: mov EDX,FLAT:_DATA[02Ch]
mov EAX,FLAT:_DATA[028h]
cmp CL,[EDX]
jne LBA
mov ECX,018h[ESP]
mov BL,1[ECX]
cmp BL,1[EDX]
jne LBA
mov CL,2[ECX]
cmp CL,2[EDX]
jne LBA
LB9: inc EDI
LBA: inc ESI
cmp ESI,01Ch[ESP]
jb L26
LC5: mov EAX,EDI
pop EDI
pop ESI
pop EBP
pop EBX
add ESP,03Ch
ret 8
--------------------------
dmd -O -release -inline test3.d
_D5test34testFAaZi comdat
push EAX
xor EDX,EDX
push EBX
xor EBX,EBX
push ESI
mov ESI,010h[ESP]
add ESI,0FFFFFFFDh
je L4A
mov 8[ESP],EDX
mov EDX,014h[ESP]
mov ECX,EDX
mov EAX,010h[ESP]
mov EDX,8[ESP]
L22: cmp [ECX][EDX],054h
jne L45
mov AL,1[ECX][EDX]
cmp AL,041h
jne L39
cmp byte ptr 2[ECX][EDX],047h
je L44
jmp short L3D
L39: cmp AL,047h
jne L45
L3D: cmp byte ptr 2[ECX][EDX],041h
jne L45
L44: inc EBX
L45: inc EDX
cmp EDX,ESI
jb L22
L4A: pop ESI
mov EAX,EBX
pop EBX
pop ECX
ret 8
--------------------------
gdc -O3 -frelease -inline -S test1.d -o test1.s
_D5test14testFAaZi:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $12, %esp
movl 8(%ebp), %eax
movl 12(%ebp), %edx
movl $0, -24(%ebp)
subl $3, %eax
movl %edx, -20(%ebp)
movl %eax, -16(%ebp)
je .L5
xorl %edx, %edx
jmp .L8
.p2align 4,,7
.p2align 3
.L6:
addl $1, -24(%ebp)
addl $1, %edx
cmpl %edx, -16(%ebp)
jbe .L5
.L8:
movl -20(%ebp), %eax
movl $.LC0, %edi
movl $3, %ecx
addl %edx, %eax
movl %eax, %esi
repz cmpsb
je .L6
movl %eax, %esi
movl $.LC1, %edi
movl $3, %ecx
repz cmpsb
je .L6
movl $.LC2, %edi
movl %eax, %esi
movl $3, %ecx
repz cmpsb
je .L6
addl $1, %edx
cmpl %edx, -16(%ebp)
ja .L8
.L5:
movl -24(%ebp), %eax
addl $12, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
--------------------------------------
gdc -O3 -frelease -inline -S test2.d -o test2.s
_D5test24testFAaZi:
pushl %ebp
xorl %edx, %edx
movl %esp, %ebp
xorl %eax, %eax
pushl %edi
pushl %esi
pushl %ebx
subl $16, %esp
movl 8(%ebp), %ebx
movl 12(%ebp), %esi
movl %ebx, %edi
subl $3, %edi
jne .L18
jmp .L9
.p2align 4,,7
.p2align 3
.L10:
movl -16(%ebp), %ecx
movzbl .LC1, %ebx
cmpb (%ecx), %bl
je .L15
.L12:
movzbl .LC2, %ebx
cmpb (%ecx), %bl
je .L21
.L13:
addl $1, %edx
cmpl %edx, %edi
jbe .L9
.p2align 4,,7
.p2align 3
.L18:
leal (%esi,%edx), %ecx
movzbl .LC0, %ebx
movl $3, -20(%ebp)
movl %ecx, -16(%ebp)
cmpb (%ecx), %bl
jne .L10
movzbl .LC0+1, %ebx
cmpb 1(%ecx), %bl
jne .L10
movzbl .LC0+2, %ebx
cmpb 2(%ecx), %bl
jne .L10
.L11:
addl $1, %edx
addl $1, %eax
cmpl %edx, %edi
ja .L18
.L9:
addl $16, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.p2align 4,,7
.p2align 3
.L15:
movzbl .LC1+1, %ebx
cmpb 1(%ecx), %bl
jne .L12
movzbl .LC1+2, %ebx
cmpb 2(%ecx), %bl
je .L11
movzbl .LC2, %ebx
cmpb (%ecx), %bl
jne .L13
.p2align 4,,7
.p2align 3
.L21:
movzbl .LC2+1, %ebx
cmpb 1(%ecx), %bl
jne .L13
movzbl .LC2+2, %ebx
cmpb 2(%ecx), %bl
jne .L13
jmp .L11
--------------------------------------
gdc -O3 -frelease -inline -S test3.d -o test3.s
_D5test34testFAaZi:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
pushl %esi
movl 8(%ebp), %esi
pushl %ebx
movl 12(%ebp), %ebx
subl $3, %esi
je .L6
movl $1, %edx
jmp .L7
.p2align 4,,7
.p2align 3
.L3:
cmpl %edx, %esi
leal 1(%edx), %ecx
jbe .L6
.L11:
movl %ecx, %edx
.L7:
cmpb $84, -1(%ebx,%edx)
jne .L3
movzbl (%ebx,%edx), %ecx
cmpb $65, %cl
je .L10
cmpb $71, %cl
jne .L3
xorl %ecx, %ecx
cmpb $65, 1(%ebx,%edx)
sete %cl
addl %ecx, %eax
cmpl %edx, %esi
leal 1(%edx), %ecx
ja .L11
.p2align 4,,7
.p2align 3
.L6:
popl %ebx
popl %esi
popl %ebp
ret
---------------------------
ldc -O3 -release -inline -output-s test1.d
_D5test14testFAaZi:
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
subl $20, %esp
movl 40(%esp), %esi
cmpl $3, %esi
je .LBB1_8
addl $4294967293, %esi
xorl %ebx, %ebx
movl %ebx, %edi
.align 16
.LBB1_2:
movl 44(%esp), %eax
leal (%eax,%ebx), %ebp
movl %ebp, 4(%esp)
movl $_D11TypeInfo_Aa6__initZ, 16(%esp)
movl $.str, 12(%esp)
movl $3, 8(%esp)
movl $3, (%esp)
call _adEq
testl %eax, %eax
jne .LBB1_5
movl %ebp, 4(%esp)
movl $_D11TypeInfo_Aa6__initZ, 16(%esp)
movl $.str1, 12(%esp)
movl $3, 8(%esp)
movl $3, (%esp)
call _adEq
testl %eax, %eax
jne .LBB1_5
movl %ebp, 4(%esp)
movl $_D11TypeInfo_Aa6__initZ, 16(%esp)
movl $.str2, 12(%esp)
movl $3, 8(%esp)
movl $3, (%esp)
call _adEq
testl %eax, %eax
je .LBB1_6
.LBB1_5:
incl %edi
.LBB1_6:
incl %ebx
cmpl %esi, %ebx
jb .LBB1_2
.LBB1_7:
movl %edi, %eax
addl $20, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
ret $8
.LBB1_8:
xorl %edi, %edi
jmp .LBB1_7
---------------------------
ldc -O3 -release -inline -output-s test2.d
_D5test24testFAaZi:
pushl %ebx
pushl %esi
movl 12(%esp), %ecx
cmpl $3, %ecx
je .LBB2_14
movl 16(%esp), %edx
addl $4294967293, %ecx
xorl %eax, %eax
movl %eax, %esi
.align 16
.LBB2_2:
movb (%edx,%esi), %bl
cmpb $84, %bl
jne .LBB2_12
movb 1(%edx,%esi), %bh
cmpb $65, %bh
jne .LBB2_5
cmpb $71, 2(%esi,%edx)
je .LBB2_11
.LBB2_5:
cmpb $84, %bl
jne .LBB2_12
cmpb $71, %bh
jne .LBB2_8
cmpb $65, 2(%esi,%edx)
je .LBB2_11
.LBB2_8:
cmpb $65, %bh
setne %bh
cmpb $84, %bl
jne .LBB2_12
testb %bh, %bh
jne .LBB2_12
cmpb $65, 2(%esi,%edx)
jne .LBB2_12
.LBB2_11:
incl %eax
.LBB2_12:
incl %esi
cmpl %ecx, %esi
jb .LBB2_2
.LBB2_13:
popl %esi
popl %ebx
ret $8
.LBB2_14:
xorl %eax, %eax
jmp .LBB2_13
---------------------------
ldc -O3 -release -inline -output-s test3.d
_D5test34testFAaZi:
pushl %ebx
pushl %edi
pushl %esi
movl 16(%esp), %ecx
cmpl $3, %ecx
je .LBB1_12
movl 20(%esp), %edx
addl $4294967293, %ecx
xorl %eax, %eax
movl %eax, %esi
.align 16
.LBB1_2:
cmpb $84, (%edx,%esi)
jne .LBB1_10
movb 1(%edx,%esi), %bl
cmpb $71, %bl
je .LBB1_8
cmpb $65, %bl
jne .LBB1_10
movb 2(%esi,%edx), %bl
cmpb $71, %bl
jne .LBB1_7
incl %eax
jmp .LBB1_10
.LBB1_7:
cmpb $65, %bl
jmp .LBB1_9
.LBB1_8:
cmpb $65, 2(%esi,%edx)
.LBB1_9:
sete %bl
movzbl %bl, %edi
addl %edi, %eax
.LBB1_10:
incl %esi
cmpl %ecx, %esi
jb .LBB1_2
.LBB1_11:
popl %esi
popl %edi
popl %ebx
ret $8
.LBB1_12:
xorl %eax, %eax
jmp .LBB1_11
---------------------------
Bye,
bearophile
| ||||
November 27, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | how well does dmd perform when you use a switch statement? | |||
November 27, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Ellery Newcomer | Ellery Newcomer:
> how well does dmd perform when you use a switch statement?
4th D version:
import std.file: read;
import std.c.stdio: printf;
int test(char[] data) {
int count;
foreach (i; 0 .. data.length - 3) {
switch (data[i .. i + 3]) {
case "TAG", "TGA", "TAA":
count++;
default:
}
}
return count;
}
void main() {
char[] data0 = cast(char[])read("data.txt");
int n = 300;
char[] data = new char[data0.length * n];
for (size_t pos; pos < data.length; pos += data0.length)
data[pos .. pos+data0.length] = data0;
printf("%d\n", test(data));
}
Timings, dmd compiler, best of 4, seconds:
D #1: 5.72
D #4: 1.84
Psy: 1.59
D #2: 0.55
D #3: 0.34
Bye,
bearophile
| |||
November 27, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | On Sat, 27 Nov 2010 11:53:06 -0500
bearophile <bearophileHUGS@lycos.com> wrote:
> Timings, dmd compiler, best of 4, seconds:
> D #1: 5.72
> Psy: 1.59
> D #2: 0.55
> D #3: 0.34
>
>
> 2.3 GHz CPU.
> dmd 2.050, ldc based on DMD v1.057 and llvm 2.6, gdc head based on GCC 4.4.5.
>
> --------------------------
>
> # Python2 version
> import psyco
>
> def test(data):
> count = 0
> for i in xrange(len(data) - 3):
> codon = data[i : i + 3]
> if codon == "TAG" or codon == "TGA" or codon == "TAA":
> count += 1
> return count
>
> def main():
> data = open("data.txt").read()
> data = data * 300
> print test(data)
>
> psyco.full()
> main()
>
> --------------------------
>
> To produce some test data:
>
>
> # Python2 code
> from itertools import islice
>
> def rnd(ia = 3877, ic = 29573, im = 139968):
> seed = 42
> imf = float(im)
> while 1:
> seed = (seed * ia + ic) % im
> r = seed / imf
> yield "ACGT"[int(r * 4)]
>
>
> def main():
> data = "".join(islice(rnd(), 0, 120000))
> file("data.txt", "w").write(data)
>
> main()
>
> --------------------------
>
> A D2 traslation (I have used printf to reduce a lot the asm produced):
>
>
> // D2 version #1
> import std.file: read;
> import std.c.stdio: printf;
>
> int test(char[] data) {
> int count;
> foreach (i; 0 .. data.length - 3) {
> char[] codon = data[i .. i + 3];
> if (codon == "TAG" || codon == "TGA" || codon == "TAA")
> count++;
> }
> return count;
> }
>
> void main() {
> char[] data0 = cast(char[])read("data.txt");
> int n = 300;
> char[] data = new char[data0.length * n];
> for (size_t pos; pos < data.length; pos += data0.length)
> data[pos .. pos+data0.length] = data0;
>
> printf("%d\n", test(data));
> }
>
> --------------------------
>
> // D2 version #2
> import std.file: read;
> import std.c.stdio: printf;
>
> bool cmp3(char[] s1, string s2) {
> return s1[0] == s2[0] && s1[1] == s2[1] && s1[2] == s2[2];
> }
>
> int test(char[] data) {
> int count;
> foreach (i; 0 .. data.length - 3) {
> char[] codon = data[i .. i + 3];
> if (cmp3(codon, "TAG") || cmp3(codon, "TGA") || cmp3(codon, "TAA"))
> count++;
> }
> return count;
> }
>
> void main() {
> char[] data0 = cast(char[])read("data.txt");
> int n = 300;
> char[] data = new char[data0.length * n];
> for (size_t pos; pos < data.length; pos += data0.length)
> data[pos .. pos+data0.length] = data0;
>
> printf("%d\n", test(data));
> }
Impressive!
What's your diagnostic on why basic string compare parforms the way it does?
And have you tried a naive hand-written strComp (just for the sake of completeness) like:
bool strComp (string s1, string s2) {
auto l = s1.length;
if (l != s2.length)
return false;
foreach (uint i ; 0..l)
if (s1[i] != s2[i])
return false;
return true;
}
(I bet this will perform at about the same order of magnitude as your version D#1.)
Also, is there a way to bit-compare given memory areas at much higher speed than element per element (I mean for arrays in general)?
Denis
-- -- -- -- -- -- --
vit esse estrany ☣
spir.wikidot.com
| |||
November 27, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to spir | denis.spir > What's your diagnostic on why basic string compare parforms the way it does? I prefer the numbers and asm listings speak for themselves, because my diagnostic sometimes is wrong. But I guess it's caused by three not inlined calls to __adEq2 each loop, done to compare 3 char long arrays. > And have you tried a naive hand-written strComp (just for the sake of completeness) like: Here it is: Timings, dmd compiler, best of 4, seconds: D #1: 5.72 D #4: 1.84 D #5: 1.73 Psy: 1.59 D #2: 0.55 D #3: 0.34 // D version #5 import std.file: read; import std.c.stdio: printf; bool cmp2(char[] s1, string s2) { if (s1.length != s2.length) return false; foreach (uint i ; 0 .. s1.length) if (s1[i] != s2[i]) return false; return true; } int test(char[] data) { int count; foreach (i; 0 .. data.length - 3) { char[] codon = data[i .. i + 3]; if (cmp2(codon, "TAG") || cmp2(codon, "TGA") || cmp2(codon, "TAA")) count++; } return count; } void main() { char[] data0 = cast(char[])read("data.txt"); int n = 300; char[] data = new char[data0.length * n]; for (size_t pos; pos < data.length; pos += data0.length) data[pos .. pos+data0.length] = data0; printf("%d\n", test(data)); } > Also, is there a way to bit-compare given memory areas at much higher speed than element per element (I mean for arrays in general)? I don't know. I think you can't. Bye, bearophile | |||
November 27, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | On Saturday 27 November 2010 14:46:00 bearophile wrote:
> denis.spir
> > Also, is there a way to bit-compare given memory areas at much higher speed than element per element (I mean for arrays in general)?
>
> I don't know. I think you can't.
You might be able to do that for primitive types and for structs with no opEquals() and with no member variables with an opEquals(), but you certainly can't do it in the general case, since in the general case, you'd have to be calling opEquals() on each element individually.
- Jonathan M Davis
| |||
November 28, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to bearophile | On 11/27/2010 11:53 AM, bearophile wrote:
> While translating a common Python script to D, I have found something interesting, so I have reduced it to a little benchmark.
>
> Below there is the reduced Python2 code (it uses Psyco), and a little program to generate some test data. The timing of the first D2 version is not good compared to the Python-Psyco program (the generation of the *300 array is a quick thing), so I have created two more D2 versions to show myself that D2 wasn't broken :-)
>
> The reduced code looks like a syntetic benchmark, but it has about the same performance profile of a 60 lines long Python script (the original code was using xrange(0,len(...)-3,3) instead of xrange(len(...)-3), but the situation doesn't change much).
>
It's not clear to me if the point of your post is "how do I make this go faster?" or "Waa! D underperforms Python by default".
If it is the former, a state machine is probably the right answer for long input strings.
start -> E
E A,C,G:-> E
E T:-> T
T A:-> TA
T C:-> E
T G:-> TG
T T:-> T
TA A,G:-> E, {++count}
TA C:-> E
TA T:-> T
TG A:-> E, {++count}
TG C,G:-> E
TG T:-> T
Since the probability of failure is about .75, 0.5, {.5 or .75}, the overall comparison cost is about 1 + .25 + .125 = 1.375, times compare+loop times n, plus all the function call overhead, etc.
Using a DFA should reduce the cost to 1 * index, times n, assuming you are using a small alphabet (I'm guessing that your codons are ascii+uppercase only, table-size = 26).
=Austin
| |||
November 28, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Austin Hastings | On Saturday 27 November 2010 22:13:39 Austin Hastings wrote:
> On 11/27/2010 11:53 AM, bearophile wrote:
> > While translating a common Python script to D, I have found something interesting, so I have reduced it to a little benchmark.
> >
> > Below there is the reduced Python2 code (it uses Psyco), and a little program to generate some test data. The timing of the first D2 version is not good compared to the Python-Psyco program (the generation of the *300 array is a quick thing), so I have created two more D2 versions to show myself that D2 wasn't broken :-)
> >
> > The reduced code looks like a syntetic benchmark, but it has about the
> > same performance profile of a 60 lines long Python script (the original
> > code was using xrange(0,len(...)-3,3) instead of xrange(len(...)-3),
> > but the situation doesn't change much).
>
> It's not clear to me if the point of your post is "how do I make this go faster?" or "Waa! D underperforms Python by default".
What I think is pretty clear from this is that at some point, some work needs to be done to optimize array comparisons when memcmp can be used instead of having to call the individual opEquals() of each element. It's not the sort of thing that's likely to be a priority, but it really should be done at some point.
- Jonathan M Davis
| |||
November 28, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
On Saturday 27 November 2010 22:23:39 Jonathan M Davis wrote: > On Saturday 27 November 2010 22:13:39 Austin Hastings wrote: > > On 11/27/2010 11:53 AM, bearophile wrote: > > > While translating a common Python script to D, I have found something interesting, so I have reduced it to a little benchmark. > > > > > > Below there is the reduced Python2 code (it uses Psyco), and a little program to generate some test data. The timing of the first D2 version is not good compared to the Python-Psyco program (the generation of the *300 array is a quick thing), so I have created two more D2 versions to show myself that D2 wasn't broken :-) > > > > > > The reduced code looks like a syntetic benchmark, but it has about the > > > same performance profile of a 60 lines long Python script (the original > > > code was using xrange(0,len(...)-3,3) instead of xrange(len(...)-3), > > > but the situation doesn't change much). > > > > It's not clear to me if the point of your post is "how do I make this go faster?" or "Waa! D underperforms Python by default". > > What I think is pretty clear from this is that at some point, some work needs to be done to optimize array comparisons when memcmp can be used instead of having to call the individual opEquals() of each element. It's not the sort of thing that's likely to be a priority, but it really should be done at some point. Enhancement Request: http://d.puremagic.com/issues/show_bug.cgi?id=5282 | ||||
November 28, 2010 Re: String compare performance | ||||
|---|---|---|---|---|
| ||||
Posted in reply to Austin Hastings | Austin Hastings: > It's not clear to me if the point of your post is "how do I make this go faster?" or "Waa! D underperforms Python by default". The third D version in my post is fast enough, so the point of that post is that a very common string compare operation, done in the natural way, is a bit too much slow in D/DMD. > If it is the former, a state machine is probably the right answer for long input strings. This case is very simple, so a simple tree (as in the third D version of my post) is probably good enough. In some similar but more complex situations of genomic processing I've seen that in C it's good to have computed gotos, to implement faster state machines. See my recent post "explicitly unimplemented computed gotos" :-) This was my use case beside the desire to create interpreters in D (it's a similar thing). > Since the probability of failure is about .75, 0.5, {.5 or .75}, the overall comparison cost is about 1 + .25 + .125 = 1.375, times compare+loop times n, plus all the function call overhead, etc. All 3 codons start with T, so if you test that first letter before the 3 string compare you speed up the original code a lot. But I was looking for a more general solution. Implementing a state machine is good, but in this case I'd like the compiler to do more optimizations by itself. A simple but good enough optimization for the compiler is to replace tests with strings (known to be short at compile-time) with inlined comparisons of their single chars. > Using a DFA should reduce the cost to 1 * index, times n, assuming you are using a small alphabet (I'm guessing that your codons are ascii+uppercase only, table-size = 26). In this case the alphabet was just 4 symbols: {'A', 'C', 'G', 'T'} (with no undecided or modified nucleosides). Bye, bearophile | |||
Copyright © 1999-2021 by the D Language Foundation
Permalink
Reply