On Wed, 2008-04-16 at 09:19 +0200, Christian Schoenebeck wrote:
> But if you're totally sceptical, ...
Yes, it bothers me if the compiler can beat "hand-optimized" ASM :-D
It turns out that I get different results on consecutive runs of the
same binary, look:
Benchmarking mixdown (WITH coeff):
pure C++ : 380 ms
ASM SSE : 150 ms
GCC vector extensions : 140 ms <-- best
Benchmarking mixdown (WITH coeff):
pure C++ : 390 ms
ASM SSE : 140 ms <-- best
GCC vector extensions : 160 ms
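For reference, the GCC vector extensions variant is shaped roughly like
this (a minimal sketch; the v4sf typedef and the scalar tail handling
are my assumptions, not necessarily the exact code in the tree):

--8<-----------------------------------------
typedef float v4sf __attribute__((vector_size(16)));

void vec_mix_buffers_with_gain(float *dst, const float *src,
                               unsigned int nframes, float gain)
{
        /* splat the gain into all four lanes */
        v4sf g = { gain, gain, gain, gain };

        /* main loop: 4 frames per iteration; assumes dst/src are
           16-byte aligned */
        unsigned int n = nframes & ~3u;
        for (unsigned int i = 0; i < n; i += 4) {
                v4sf s = *(const v4sf *)(src + i);
                v4sf d = *(v4sf *)(dst + i);
                *(v4sf *)(dst + i) = d + s * g;
        }

        /* scalar tail for the remaining 0-3 frames */
        for (unsigned int i = n; i < nframes; ++i)
                dst[i] += src[i] * gain;
}
--8<-----------------------------------------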
This is with the ASM slightly unrolled to match the -funroll-loops option:
--8<-----------------------------------------
x86_sse_mix_buffers_with_gain --
...
.MBWG_SSE:
cmp $8, %ecx #; we know it's not zero, but if it's not >=8, then
jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
#; copy gain to fill %xmm1
movss 20(%ebp), %xmm1
shufps $0x00, %xmm1, %xmm1
.MBWG_SSELOOP:
movaps (%esi), %xmm0 #; source => xmm0
addl $16, %esi #; src+=4
movaps (%esi), %xmm2 #; ++source => xmm2
addl $16, %esi #; src+=4
mulps %xmm1, %xmm0 #; apply gain to source
mulps %xmm1, %xmm2 #; apply gain to source
addps (%edi), %xmm0 #; mix with destination
movaps %xmm0, (%edi) #; copy result to destination
addl $16, %edi #; dst+=4
addps (%edi), %xmm2 #; mix with destination
movaps %xmm2, (%edi) #; copy result to destination
addl $16, %edi #; dst+=4
subl $8, %ecx #; nframes-=8
cmp $8, %ecx
jge .MBWG_SSELOOP
cmp $0, %ecx
je .MBWG_END
#; if there are remaining frames, the nonalign code will do nicely
#; for the remaining 1-7 frames.
...
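
For comparison, the same unrolled loop expressed with SSE intrinsics
would look something like this (a sketch, assuming 16-byte aligned
src/dst; the function name is made up and the 1-7 frame tail is done
in scalar code here rather than jumping to the nonalign path):

--8<-----------------------------------------
#include <xmmintrin.h>

void sse_mix_buffers_with_gain(float *dst, const float *src,
                               unsigned int nframes, float gain)
{
        __m128 g = _mm_set1_ps(gain);          /* the shufps gain splat */

        while (nframes >= 8) {                 /* two vectors per iteration */
                __m128 s0 = _mm_load_ps(src);      /* movaps (%esi), %xmm0 */
                __m128 s1 = _mm_load_ps(src + 4);  /* movaps (%esi), %xmm2 */
                __m128 d0 = _mm_load_ps(dst);
                __m128 d1 = _mm_load_ps(dst + 4);
                _mm_store_ps(dst,     _mm_add_ps(d0, _mm_mul_ps(s0, g)));
                _mm_store_ps(dst + 4, _mm_add_ps(d1, _mm_mul_ps(s1, g)));
                src += 8; dst += 8; nframes -= 8;
        }

        while (nframes--)                      /* remaining 0-7 frames */
                *dst++ += *src++ * gain;
}
--8<-----------------------------------------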