Jussi!
Could you try this out with your proposed compiler options on your own
hardware?
Admittedly, the recycled PIII here is very unrepresentative, outdated
and old-skool (although it seems to shine when paired up with icc :) ).
--8<-----------------------------
// include everything just in case we need it ...
#include <unistd.h>
#include <stdio.h>
#include <sched.h>
#include <time.h>
#include <stdlib.h>
// Number of complex elements held by every benchmark buffer below.
#define N 1024
#include <complex.h>

// Layout 1: interleaved pairs -- element k is stored as [k][0], [k][1].
// Every buffer is forced onto a 16-byte boundary.
float ffta[N][2] __attribute__ ((aligned(16)));
float fftb[N][2] __attribute__ ((aligned(16)));
float data[N][2] __attribute__ ((aligned(16)));

// Layout 2: the compiler's native single-precision complex type.
_Complex float cxA[N] __attribute__ ((aligned(16)));
_Complex float cxB[N] __attribute__ ((aligned(16)));
_Complex float cxD[N] __attribute__ ((aligned(16)));

// Layout 3: split storage -- one array for the r components and a
// separate one for the i components, each 16-byte aligned.
typedef struct
{
    float r[N] __attribute__ ((aligned(16)));
    float i[N] __attribute__ ((aligned(16)));
} cvec_t;

cvec_t cA;
cvec_t cB;
cvec_t cD;
/*
 * Return the processor time consumed since `start` (a previous clock()
 * reading) in whole milliseconds.
 *
 * The conversion goes through CLOCKS_PER_SEC rather than assuming a
 * particular tick rate, and the result is a long so that it can be printed
 * with a matching "%ld" conversion.
 */
static long elapsed_ms(clock_t start)
{
    return (long)((double)(clock() - start) * 1000.0 / (double)CLOCKS_PER_SEC);
}

/*
 * Micro-benchmark: time n passes of an element-wise complex
 * multiply-accumulate (D[i] += A[i] * B[i]) over N elements for three
 * storage layouts -- _Complex float arrays, a struct of separate r/i
 * arrays (cvec_t), and float [N][2] arrays -- and print the elapsed CPU
 * time of each variant to stderr.
 *
 * The buffers are file-scope objects that this function never fills in,
 * so they hold their zero initial values when the loops start.
 *
 * Returns 0.
 */
int main(void)
{
    const int n = 1000000;   /* number of passes over each buffer set */
    int i, j;
    const char *s;           /* label printed next to each timing */
    clock_t clk;

    s = "(_Complex)";
    clk = clock();
    for (j = 0; j < n; ++j) {
        for (i = 0; i < N; ++i) {
            cxD[i] += cxA[i] * cxB[i];
        }
    }
    fprintf(stderr, "> clock: %ld ms %s\n", elapsed_ms(clk), s);

    s = "(cvec_t)";
    clk = clock();
    for (j = 0; j < n; ++j) {
        for (i = 0; i < N; ++i) {
            cD.r[i] += cA.r[i] * cB.r[i] - cA.i[i] * cB.i[i];
            cD.i[i] += cA.r[i] * cB.i[i] + cA.i[i] * cB.r[i];
        }
    }
    fprintf(stderr, "> clock: %ld ms %s\n", elapsed_ms(clk), s);

    s = "(original float array[N][2])";
    clk = clock();
    for (j = 0; j < n; ++j) {
        for (i = 0; i < N; ++i) {
            data[i][0] += ffta[i][0] * fftb[i][0] - ffta[i][1] * fftb[i][1];
            data[i][1] += ffta[i][0] * fftb[i][1] + ffta[i][1] * fftb[i][0];
        }
    }
    fprintf(stderr, "> clock: %ld ms %s\n", elapsed_ms(clk), s);

    return 0;
}
On Mon, 2008-05-05 at 19:15 +0300, Jussi Laako wrote:
Jussi Laako wrote:
I would propose something like
"-march=prescott -O3 -ftree-vectorize" or
"-O3 -sse3 -ftree-vectorize".
Sorry, typo, "-O3 -msse3 -ftree-vectorize" of course...
- Jussi
--