July 02, 2010 [phobos] std.algorithm.sort slow as molasses | ||||
---|---|---|---|---|
| ||||
Posted in reply to David Simcha | Couldn't some of the overhead come from the overhead of calling back()?
Andrei
David Simcha wrote:
> Sorry, the list wouldn't post my accidental double paste anyhow because it was too big. Here's the disassembly of partition, showing only back() is getting called and pred seems to be getting inlined, contrary to what I had believed before. So changing enforce() in back() to assert() gets rid of about half the overhead, but I have no idea where the other half is coming from.
>
> _D3std9algorithm129__T8sortImplS793std10functional54__T13binaryFunImplVAyaa5_61203c2062VAyaa1_610889BE3F182A6EA5575A2D298A20D100
> PROC NEAR
> ; COMDEF
> _D3std9algorithm129__T8sortImplS793std10functional54__T13binaryFunImplVAyaa5_61203c2062VAyaa1_610889BE3F182A6EA5575A2D298A20D100
> sub esp, 36 ; 0000 _ 83. EC, 24
> mov ecx, dword ptr [esp+2CH] ; 0003 _ 8B. 4C
> 24, 2C
> push ebx ; 0007 _ 53
> mov ebx, dword ptr [esp+2CH] ; 0008 _ 8B. 5C
> 24, 2C
> test ebx, ebx ; 000C _ 85. DB
> push ebp ; 000E _ 55
> push esi ; 000F _ 56
> mov esi, eax ; 0010 _ 89. C6
> push edi ; 0012 _ 57
> jnz ?_020 ; 0013 _ 75, 0E
> pop edi ; 0015 _ 5F
> mov eax, ebx ; 0016 _ 8B. C3
> mov edx, ecx ; 0018 _ 8B. D1
> pop esi ; 001A _ 5E
> pop ebp ; 001B _ 5D
> pop ebx ; 001C _ 5B
> add esp, 36 ; 001D _ 83. C4, 24
> ret 8 ; 0020 _ C2, 0008
>
> ?_020: mov dword ptr [esp+3CH], ecx ; 0023 _ 89. 4C
> 24, 3C
> mov edx, dword ptr [esp+3CH] ; 0027 _ 8B. 54
> 24, 3C
> mov dword ptr [esp+38H], ebx ; 002B _ 89. 5C
> 24, 38
> mov ebx, dword ptr [esp+38H] ; 002F _ 8B. 5C
> 24, 38
> mov dword ptr [esp+14H], ebx ; 0033 _ 89. 5C
> 24, 14
> mov dword ptr [esp+18H], edx ; 0037 _ 89. 54
> 24, 18
> cmp dword ptr [esp+38H], 0 ; 003B _ 83. 7C
> 24, 38, 00
> je ?_023 ; 0040 _ 0F 84,
> 00000088
> ?_021: mov edx, dword ptr [esp+3CH] ; 0046 _ 8B. 54
> 24, 3C
> mov eax, dword ptr [esp+38H] ; 004A _ 8B. 44
> 24, 38
> mov ebx, dword ptr [edx] ; 004E _ 8B. 1A
> push dword ptr [esi+0CH] ; 0050 _ FF. 76, 0C
> push dword ptr [esi+8H] ; 0053 _ FF. 76, 08
> mov dword ptr [esp+18H], edx ; 0056 _ 89. 54
> 24, 18
> call _D3std5array12__T4backTAiZ4backFNcAiZi ; 005A _ E8,
> 00000000(rel)
> cmp dword ptr [eax], ebx ; 005F _ 39. 18
> jle ?_022 ; 0061 _ 7E, 31
> mov edi, dword ptr [esp+38H] ; 0063 _ 8B. 7C
> 24, 38
> mov ecx, dword ptr [esp+10H] ; 0067 _ 8B. 4C
> 24, 10
> dec edi ; 006B _ 4F
> lea edx, [ecx+4H] ; 006C _ 8D. 51, 04
> mov ebx, dword ptr [esp+14H] ; 006F _ 8B. 5C
> 24, 14
> dec ebx ; 0073 _ 4B
> mov dword ptr [esp+3CH], edx ; 0074 _ 89. 54
> 24, 3C
> mov edx, dword ptr [esp+18H] ; 0078 _ 8B. 54
> 24, 18
> mov eax, dword ptr [esp+14H] ; 007C _ 8B. 44
> 24, 14
> mov dword ptr [esp+38H], edi ; 0080 _ 89. 7C
> 24, 38
> add edx, 4 ; 0084 _ 83. C2, 04
> mov dword ptr [esp+14H], ebx ; 0087 _ 89. 5C
> 24, 14
> mov dword ptr [esp+18H], edx ; 008B _ 89. 54
> 24, 18
> jmp ?_025 ; 008F _ E9,
> 000000AA
>
> ?_022: push dword ptr [esp+3CH] ; 0094 _ FF. 74
> 24, 3C
> push dword ptr [esp+3CH] ; 0098 _ FF. 74
> 24, 3C
> call _D3std5array12__T4backTAiZ4backFNcAiZi ; 009C _ E8,
> 00000000(rel)
> mov ebx, dword ptr [eax] ; 00A1 _ 8B. 18
> push dword ptr [esi+0CH] ; 00A3 _ FF. 76, 0C
> push dword ptr [esi+8H] ; 00A6 _ FF. 76, 08
> call _D3std5array12__T4backTAiZ4backFNcAiZi ; 00A9 _ E8,
> 00000000(rel)
> cmp dword ptr [eax], ebx ; 00AE _ 39. 18
> jg ?_024 ; 00B0 _ 7F, 2E
> mov edi, dword ptr [esp+38H] ; 00B2 _ 8B. 7C
> 24, 38
> dec edi ; 00B6 _ 4F
> mov eax, dword ptr [esp+38H] ; 00B7 _ 8B. 44
> 24, 38
> mov dword ptr [esp+38H], edi ; 00BB _ 89. 7C
> 24, 38
> mov edx, dword ptr [esp+3CH] ; 00BF _ 8B. 54
> 24, 3C
> mov dword ptr [esp+3CH], edx ; 00C3 _ 89. 54
> 24, 3C
> cmp dword ptr [esp+38H], 0 ; 00C7 _ 83. 7C
> 24, 38, 00
> jnz ?_022 ; 00CC _ 75, C6
> ?_023: mov edx, dword ptr [esp+18H] ; 00CE _ 8B. 54
> 24, 18
> mov eax, dword ptr [esp+14H] ; 00D2 _ 8B. 44
> 24, 14
> pop edi ; 00D6 _ 5F
> pop esi ; 00D7 _ 5E
> pop ebp ; 00D8 _ 5D
> pop ebx ; 00D9 _ 5B
> add esp, 36 ; 00DA _ 83. C4, 24
> ret 8 ; 00DD _ C2, 0008
>
> ?_024: push dword ptr [esp+3CH] ; 00E0 _ FF. 74
> 24, 3C
> push dword ptr [esp+3CH] ; 00E4 _ FF. 74
> 24, 3C
> call _D3std5array12__T4backTAiZ4backFNcAiZi ; 00E8 _ E8,
> 00000000(rel)
> mov edi, eax ; 00ED _ 89. C7
> mov edx, dword ptr [esp+3CH] ; 00EF _ 8B. 54
> 24, 3C
> mov ebp, edx ; 00F3 _ 89. D5
> mov eax, dword ptr [esp+38H] ; 00F5 _ 8B. 44
> 24, 38
> ; Note: Zero displacement could be omitted
> mov ecx, dword ptr [edx] ; 00F9 _ 8B. 4A, 00
> mov ebx, dword ptr [edi] ; 00FC _ 8B. 1F
> lea edx, [edx+4H] ; 00FE _ 8D. 52, 04
> mov dword ptr [ebp], ebx ; 0101 _ 89. 5D, 00
> mov eax, dword ptr [esp+38H] ; 0104 _ 8B. 44
> 24, 38
> dec eax ; 0108 _ 48
> mov dword ptr [edi], ecx ; 0109 _ 89. 0F
> mov ebx, dword ptr [esp+14H] ; 010B _ 8B. 5C
> 24, 14
> mov ecx, dword ptr [esp+18H] ; 010F _ 8B. 4C
> 24, 18
> mov dword ptr [esp+38H], eax ; 0113 _ 89. 44
> 24, 38
> dec ebx ; 0117 _ 4B
> mov eax, dword ptr [esp+14H] ; 0118 _ 8B. 44
> 24, 14
> mov dword ptr [esp+14H], ebx ; 011C _ 89. 5C
> 24, 14
> add ecx, 4 ; 0120 _ 83. C1, 04
> mov ebx, dword ptr [esp+38H] ; 0123 _ 8B. 5C
> 24, 38
> mov dword ptr [esp+18H], ecx ; 0127 _ 89. 4C
> 24, 18
> dec ebx ; 012B _ 4B
> mov eax, dword ptr [esp+38H] ; 012C _ 8B. 44
> 24, 38
> mov dword ptr [esp+3CH], edx ; 0130 _ 89. 54
> 24, 3C
> mov ecx, edx ; 0134 _ 8B. CA
> mov dword ptr [esp+38H], ebx ; 0136 _ 89. 5C
> 24, 38
> mov dword ptr [esp+3CH], ecx ; 013A _ 89. 4C
> 24, 3C
> ?_025: cmp dword ptr [esp+38H], 0 ; 013E _ 83. 7C
> 24, 38, 00
> jne ?_021 ; 0143 _ 0F 85,
> FFFFFEFD
> ; Note: Immediate operand could be made smaller by sign extension
> jmp ?_023 ; 0149 _ E9,
> FFFFFF80
> _D3std9algorithm129__T8sortImplS793std10functional54__T13binaryFunImplVAyaa5_61203c2062VAyaa1_610889BE3F182A6EA5575A2D298A20D100
> ENDP
>
> On Fri, Jul 2, 2010 at 5:25 PM, David Simcha <dsimcha at gmail.com <mailto:dsimcha at gmail.com>> wrote:
>
> Sorry for the double paste. Please ignore everything before the
> second time you see
> _D3std9algorithm129__T8sortImplS793std10functional54__T13binaryFunImplVAyaa5_61203c2...
>
>
>
>
> ------------------------------------------------------------------------
>
> _______________________________________________
> phobos mailing list
> phobos at puremagic.com
> http://lists.puremagic.com/mailman/listinfo/phobos
|
July 02, 2010 [phobos] std.algorithm.sort slow as molasses | ||||
---|---|---|---|---|
| ||||
Posted in reply to Brad Roberts | Brad Roberts, el 2 de julio a las 13:43 me escribiste: > On Fri, 2 Jul 2010, Leandro Lucarella wrote: > > > See bug 859 http://d.puremagic.com/issues/show_bug.cgi?id=859 > > > > Is not a great bug report but there is a little test case. This is mostly D1 because of ranges, but using opApply with the current state of inlining is almost impossible for performance-aware applications. > > That one is on my radar already. I'm going to finish working on the migration of the dmd test suite before digging back into ticket, but this one is high on my list for what to tackle next. Thanks (for both =). > Is it directly impacting this specific case (I haven't looked at the code involved)? I don't know, I've hit this problem working in the GC, and as I said in this post, the (lack of) inlining is really showing: http://www.llucax.com.ar/blog/blog/post/-611dc288 I think my main problem was with opApply(), since the delegate passed to opApply() is used in a loop by definition, inlining that delegate is really important. Another reason to inline it (even when is a large and heavy function is it will be always an anonymous delegate (unless you call opApply() manually), so you are not even saving code size here. opApply() delegates should be inlined *always* IMHO, I don't know if it's easy to make a special case though. -- Leandro Lucarella (AKA luca) http://llucax.com.ar/ ---------------------------------------------------------------------- GPG Key: 5F5A8D05 (F8CD F9A7 BF00 5431 4145 104C 949E BFB6 5F5A 8D05) ---------------------------------------------------------------------- Wenn ist das nunst?ck git und slotermeyer? Ja! Beiherhund das oder die Flipperwaldt gersput! -- Monty Python (no leer si sab?s alem?n) |
July 02, 2010 [phobos] std.algorithm.sort slow as molasses | ||||
---|---|---|---|---|
| ||||
Posted in reply to David Simcha | On Friday, July 02, 2010 13:57:33 David Simcha wrote:
> bool evalPred(alias pred)(uint num) {
> // This line is to make sure that this function doesn't get inlined
> into // main. DMD doesn't inline functions that could throw.
> if(num == uint.max) {
> throw new Exception("");
> }
>
> return pred(num);
> }
DMD doesn't inline functions that could throw? Isn't that rather restrictive? I'm assuming that that's functions with an actual throw statement rather than all functions that aren't nothrow, since that would _really_ be restrictive. But still, you could have a really small function that threw if a condition failed but otherwise just returned a value, and I would have thought that that would be worth inlining. I'm far from an expert once you're dealing with stuff that low- level, so there could be a very good reason that functions that could throw aren't inlined, but it does strike me as rather restrictive if you can't inline such functions.
- Jonathan M Davis
|
July 02, 2010 [phobos] std.algorithm.sort slow as molasses | ||||
---|---|---|---|---|
| ||||
Posted in reply to Jonathan M Davis | See http://d.puremagic.com/issues/show_bug.cgi?id=3490 On 7/2/2010 6:10 PM, Jonathan M Davis wrote: > On Friday, July 02, 2010 13:57:33 David Simcha wrote: > > >> bool evalPred(alias pred)(uint num) { >> // This line is to make sure that this function doesn't get inlined >> into // main. DMD doesn't inline functions that could throw. >> if(num == uint.max) { >> throw new Exception(""); >> } >> >> return pred(num); >> } >> > DMD doesn't inline functions that could throw? Isn't that rather restrictive? I'm assuming that that's functions with an actual throw statement rather than all functions that aren't nothrow, since that would _really_ be restrictive. But still, you could have a really small function that threw if a condition failed but otherwise just returned a value, and I would have thought that that would be worth inlining. I'm far from an expert once you're dealing with stuff that low- level, so there could be a very good reason that functions that could throw aren't inlined, but it does strike me as rather restrictive if you can't inline such functions. > > - Jonathan M Davis > _______________________________________________ > phobos mailing list > phobos at puremagic.com > http://lists.puremagic.com/mailman/listinfo/phobos > > |
July 03, 2010 [phobos] std.algorithm.sort slow as molasses | ||||
---|---|---|---|---|
| ||||
Posted in reply to Jonathan M Davis | On 7/2/2010 6:10 PM, Jonathan M Davis wrote: > > DMD doesn't inline functions that could throw? Isn't that rather restrictive? I'm assuming that that's functions with an actual throw statement rather than all functions that aren't nothrow, since that would _really_ be restrictive. But still, you could have a really small function that threw if a condition failed but otherwise just returned a value, and I would have thought that that would be worth inlining. I'm far from an expert once you're dealing with stuff that low- level, so there could be a very good reason that functions that could throw aren't inlined, but it does strike me as rather restrictive if you can't inline such functions. > > - Jonathan M Davis > Yeah, I've started looking into this a little more. DMD will not inline anything with a throw statement, though it will inline functions that are not nothrow. However, in practice calling enforce() seems to prevent inlining of any function that uses enforce() because enforce() also has a lazy parameter, which isn't always visible because it has a default. For example: import std.contracts : enforce; void main(string[] args) { getNum(); } __gshared bool dummy = true; uint getNum() { enforce(dummy); return 1; } The fact that getNum() isn't inlined seems ridiculous at first, until you see the disassembly of it. enforce() is inlined because it doesn't contain a throw statement. std.contracts.bailOut() actually throws the exception. Here's the horribly complicated disassembly of getNum(): _D5test96getNumFZk PROC NEAR ; COMDEF _D5test96getNumFZk sub esp, 24 ; 0000 _ 83. EC, 18 xor eax, eax ; 0003 _ 31. C0 mov ecx, offset FLAT:_D5test96getNumFZk14__dgliteral610MFZAxa; 0005 _ B9, 00000000(segrel) push ebx ; 000A _ 53 mov dl, byte ptr [_D5test95dummyb] ; 000B _ 8A. 15, 00000000(segrel) xor dl, 01H ; 0011 _ 80. F2, 01 mov dword ptr [esp+4H], eax ; 0014 _ 89. 44 24, 04 mov dword ptr [esp+8H], ecx ; 0018 _ 89. 4C 24, 08 jz ?_007 ; 001C _ 74, 21 mov edx, ecx ; 001E _ 8B. D1 push dword ptr [?_003] ; 0020 _ FF. 35, 00000014(segrel) push dword ptr [?_002] ; 0026 _ FF. 35, 00000010(segrel) push 10 ; 002C _ 6A, 0A mov eax, dword ptr [esp+10H] ; 002E _ 8B. 44 24, 10 mov ebx, dword ptr [esp+10H] ; 0032 _ 8B. 5C 24, 10 call edx ; 0036 _ FF. D2 push edx ; 0038 _ 52 push eax ; 0039 _ 50 call _D3std9contracts7bailOutFAyaixAaZv ; 003A _ E8, 00000000(rel) ?_007: mov eax, 1 ; 003F _ B8, 00000001 pop ebx ; 0044 _ 5B add esp, 24 ; 0045 _ 83. C4, 18 ret ; 0048 _ C3 _D5test96getNumFZk ENDP |
Copyright © 1999-2021 by the D Language Foundation