Hello,
I'm working with CodeWarrior Version: 10.1.8 Build Id:158.
I've tried to write an optimized function that calculates output = A' x A, i.e. output = DotProduct(A, A):
/*
 * DOT_XX_40Bits - accumulate X_macd(x, x) over the whole input buffer and
 * return the combined sum as a Word40.
 *
 * a_pV1  Input buffer. Asserted to be 8-byte aligned; it is re-read below
 *        through an int pointer.
 * N      Asserted to be >= 64 and a multiple of 8. The loop visits N/2 ints,
 *        so N is presumably the length in Word16 elements with two Word16
 *        packed per int - TODO confirm for the target.
 *
 * Returns X_asr() of the sum of the two partial accumulators.
 *
 * NOTE(review): X_extend, X_macd, X_add, X_asr, setnosat, readSR and writeSR
 * are compiler intrinsics that are not defined in this snippet; their exact
 * arithmetic is not visible here.
 */
Word40 DOT_XX_40Bits(Word16 * restrict a_pV1, int N)
{
/* Function-level request for optimisation level O3. */
#pragma opt_level = "O3"
int i;
/* Two independent accumulators, both started from X_extend(0). */
Word40 Results, Sum1 = X_extend(0), Sum2= X_extend(0);
/* View the Word16 buffer as ints so each access fetches one int. */
int* pV32_1 = (int*)a_pV1;
/* Save the value returned by readSR() so it can be written back before
 * returning; setnosat() presumably switches saturation off - confirm. */
int __SR__ = readSR(); setnosat();
/* Preconditions on alignment and length, checked through cw_assert. */
cw_assert((int)a_pV1%8==0); cw_assert(N>=64);cw_assert( N%8 == 0 );
/* Each pass consumes two consecutive ints, one per accumulator. */
for(i=0; i<N/2; i+=2)
{
Sum1 = X_macd(Sum1, pV32_1[i+0], pV32_1[i+0]);
Sum2 = X_macd(Sum2, pV32_1[i+1], pV32_1[i+1]);
}
/* Merge the partial sums, then apply X_asr to the total. */
Sum1 = X_add (Sum1, Sum2);
Results = X_asr (Sum1);
/* Put back the status-register value saved on entry. */
writeSR(__SR__);
return Results;
}
The inner Assembly loop is as follows:
LOOPSTART3
[
macd d2,d2,d4
macd d3,d3,d6
move.2l (r0)+,d2:d3
]
LOOPEND3
50% efficiency!
I've tried to unroll the C loop (more):
/*
 * Four-way unrolled accumulation loop: each pass consumes the four
 * consecutive ints pV32_1[i+0] .. pV32_1[i+3], one per accumulator.
 * Sum3 and Sum4 have to be declared and initialised the same way as
 * Sum1 and Sum2, and added into the final result after the loop.
 */
for(i=0; i<N/2; i+=4)
{
Sum1 = X_macd(Sum1, pV32_1[i+0], pV32_1[i+0]);
Sum2 = X_macd(Sum2, pV32_1[i+1], pV32_1[i+1]);
/* With a stride of 4 the remaining offsets must be +2 and +3. Using +3 and
 * +4 would skip element i+2, accumulate element i+4 twice, and read one int
 * past the end of the buffer on the last pass. */
Sum3 = X_macd(Sum3, pV32_1[i+2], pV32_1[i+2]);
Sum4 = X_macd(Sum4, pV32_1[i+3], pV32_1[i+3]);
}
But the result was worse (lack of registers in the inner loop?).
- Note that the unroll pragma doesn't work (and I'd be glad to understand why).
Please help me understand how I should do it right (100% efficiency in the inner loop).
Thanks,
Perry Shoham
Solved! Go to Solution.
Hi Perry,
I think you have run into a so-called "mixed optimization level" case.
In your project, I think, the optimization level is set to O0 on the command line (Project Settings), while in the file it is set to O3 via the pragma.
In this case register allocation is done by the high-level optimizer (because of the -O0 command-line option) and not by the low-level optimizer (which is active only with -O3 on the command line).
To fix this problem, please go to the project properties or the file properties (if you want to change it only for the file that contains the function) and change the optimization level to O3.
At the same time you can comment out the #pragma opt_level = "O3" line in the function body.
In this way you will obtain:
LOOPSTART3
DW10 TYPE debugsymbol
[
macd d0,d0,d9 ;[43,1] 1%=1 [0]
macd d1,d1,d4 ;[44,1] 1%=1 [0]
macd d2,d2,d5 ;[46,1] 1%=1 [0]
macd d3,d3,d8 ;[0,1] 1%=1 [0]
move.2l (r0)+n3,d0:d1 ;[43,1] 0%=0 [1]
move.2l (r1)+n3,d2:d3 ;[46,1] 0%=0 [1]
]
LOOPEND3
Hope this will help you.
Best regards,
Hello,
Have you tried to rewrite the loop in order to use 2 restrict pointers (pV32_1 and pV32_2) in order to compute Sum1 and Sum2?
In this case the compiler should generate 2 moves in parallel in the inner loop.
Best regards.
Hi,
Here is the result:
LOOPSTART3
[
macd d10,d10,d6 ;[114,1] 2%=1 [0]
macd d11,d11,d8 ;[0,1] 2%=1 [0]
move.2l (r0)+n3,d10:d11 ;[112,1] 0%=0 [1]
]
[
macd d10,d10,d2 ;[112,1] 1%=0 [1]
macd d11,d11,d4 ;[113,1] 1%=0 [1]
move.2l (r1)+n3,d10:d11 ;[114,1] 1%=0 [1]
]
LOOPEND3
Here is the source code:
#define DOT_XX_40Bits_UNROOL_LOOP 4Word40 DOT_XX_40Bits(Word16 * restrict a_pV1, Word16 * restrict a_pV2, int N){#pragma opt_level = "O3"#pragma noinline int i; Word40 Results, Sum1, Sum2, Sum3, Sum4; int* pV32_1 = (int*)a_pV1; int* pV32_2 = (int*)a_pV2; int __SR__ = readSR(); setnosat(); Sum1 = X_extend(0); Sum2 = X_extend(0); Sum3 = X_extend(0); Sum4 = X_extend(0); cw_assert((int)a_pV1%8==0); cw_assert((int)a_pV2%8==0); cw_assert((int)pV32_1%8==0); cw_assert((int)pV32_2%8==0); cw_assert(N>=64); cw_assert( N%8 == 0 ); for(i=0; i<N/2; i += DOT_XX_40Bits_UNROOL_LOOP) { Sum1 = X_macd(Sum1, pV32_1[i+0], pV32_1[i+0]); Sum2 = X_macd(Sum2, pV32_1[i+1], pV32_1[i+1]);#if(DOT_XX_40Bits_UNROOL_LOOP==4) Sum3 = X_macd(Sum3, pV32_2[i+2], pV32_2[i+2]); Sum4 = X_macd(Sum4, pV32_2[i+3], pV32_2[i+3]);#endif }#if(DOT_XX_40Bits_UNROOL_LOOP==2) Sum1 = X_add (Sum1, Sum2); Results = X_asr (Sum1);#elif(DOT_XX_40Bits_UNROOL_LOOP==4) Sum1 = X_add (Sum1, Sum2); Sum3 = X_add (Sum3, Sum4); Sum1 = X_asr (Sum1); Sum3 = X_asr (Sum3); Results = X_add (Sum1, Sum3);#endif writeSR(__SR__); if( (_readEMR() & (1<<2)) ) stop(); return Results;}
Best regards,
Perry
Hi Perry,
I think you have run into a so-called "mixed optimization level" case.
In your project, I think, the optimization level is set to O0 on the command line (Project Settings), while in the file it is set to O3 via the pragma.
In this case register allocation is done by the high-level optimizer (because of the -O0 command-line option) and not by the low-level optimizer (which is active only with -O3 on the command line).
To fix this problem, please go to the project properties or the file properties (if you want to change it only for the file that contains the function) and change the optimization level to O3.
At the same time you can comment out the #pragma opt_level = "O3" line in the function body.
In this way you will obtain:
LOOPSTART3
DW10 TYPE debugsymbol
[
macd d0,d0,d9 ;[43,1] 1%=1 [0]
macd d1,d1,d4 ;[44,1] 1%=1 [0]
macd d2,d2,d5 ;[46,1] 1%=1 [0]
macd d3,d3,d8 ;[0,1] 1%=1 [0]
move.2l (r0)+n3,d0:d1 ;[43,1] 0%=0 [1]
move.2l (r1)+n3,d2:d3 ;[46,1] 0%=0 [1]
]
LOOPEND3
Hope this will help you.
Best regards,