Thank you for your help. I wasn't sure if anyone would respond to my
message. I'm trying to give the compiler the maximum amount of information
so it can inline the function at the call site. I'm normally not too
concerned with performance but this type of code will be used in an
application that typically has a long runtime. Here's an example. In one
of my test cases, the main search function gets called approx. 500 billion
times, so any optimizations that I can find will greatly lower the runtime.
The compiler seems to be generating code for a generic Comp function (not
specialized for sz==4). Because this type of code will be literally called
more than a trillion times, I think having an inlined version at the call
site would be faster. I tried the Microsoft compiler and it does inline the
function. I'm not pushing to use the Microsoft compiler but I'm wondering
if I can get the Borland compiler to do the same. I would like to stick
with one compiler (the Borland one).
#include <stdio.h>
/**
 * Fixed-size block of T unsigned words plus a helper that ORs a leading
 * run of them together.
 */
template <unsigned int T>
struct Test {
    unsigned int buf[T];    // the words combined by Comp()
public:
    /**
     * Returns the bitwise OR of buf[0] .. buf[sz-1].
     *
     * sz  Number of leading words to combine; defaults to T (all of them).
     *     Precondition: sz <= T. A larger value indexes past the end of buf.
     *
     * Returns 0 when sz is 0.
     */
    inline unsigned int Comp ( unsigned int sz = T ) const {
        unsigned int ret = 0;
        switch (sz) {
        // Explicit empty case: without it sz == 0 would reach `default`,
        // where the unsigned sz-1 wraps around, buf is read far out of
        // bounds and the recursion never terminates.
        case 0: break;
        // Cases 4..1 fall through on purpose so that entering at `sz`
        // ORs in every word below it.
        case 4: ret |= buf[3];  // fall through
        case 3: ret |= buf[2];  // fall through
        case 2: ret |= buf[1];  // fall through
        case 1: ret |= buf[0]; break;
        // sz > 4: peel off the top word and recurse until sz drops to 4.
        default: return buf[sz-1] | Comp(sz-1);
        }
        return ret;
    }
};
int main(int argc, char* argv[]) {
Test<4> test;
printf("Hello, %d", test.Comp());
return 0;
}
//==========================================================
//================== Borland Compiler ======================
//==========================================================
00401200 /. 55 PUSH EBP ; int
main(int argc, char* argv[])
00401201 |. 8BEC MOV EBP,ESP
00401203 |. 83C4 F0 ADD ESP,-10
00401206 |. 6A 04 PUSH 4 ; /Arg2
= 00000004
00401208 |. 8D45 F0 LEA EAX,DWORD PTR SS:[EBP-10] ; |
0040120B |. 50 PUSH EAX ; |Arg1
0040120C |. E8 17000000 CALL Project2.00401228 ;
\Project2.00401228
00401211 |. 83C4 08 ADD ESP,8
00401214 |. 50 PUSH EAX ; /Arg2
00401215 |. 68 ACB04000 PUSH Project2.0040B0AC ; |Arg1
= 0040B0AC ASCII "Hello, %d"
0040121A |. E8 D52F0000 CALL Project2.004041F4 ;
\Project2.004041F4
0040121F |. 83C4 08 ADD ESP,8
00401222 |. 33C0 XOR EAX,EAX
00401224 |. 8BE5 MOV ESP,EBP
00401226 |. 5D POP EBP
00401227 \. C3 RETN
00401228 /$ 55 PUSH EBP ;
Test<2>::Comp()
00401229 |. 8BEC MOV EBP,ESP
0040122B |. 53 PUSH EBX
0040122C |. 56 PUSH ESI
0040122D |. 8B75 0C MOV ESI,DWORD PTR SS:[EBP+C]
00401230 |. 8B5D 08 MOV EBX,DWORD PTR SS:[EBP+8]
00401233 |. 33C0 XOR EAX,EAX
00401235 |. 8BD6 MOV EDX,ESI
00401237 |. 4A DEC EDX ;
Switch (cases 1..4)
00401238 |. 74 12 JE SHORT Project2.0040124C
0040123A |. 4A DEC EDX
0040123B |. 74 0C JE SHORT Project2.00401249
0040123D |. 4A DEC EDX
0040123E |. 74 06 JE SHORT Project2.00401246
00401240 |. 4A DEC EDX
00401241 |. 75 0D JNZ SHORT Project2.00401250
00401243 |. 0B43 0C OR EAX,DWORD PTR DS:[EBX+C] ; Case
4 of switch 00401237
00401246 |> 0B43 08 OR EAX,DWORD PTR DS:[EBX+8] ; Case
3 of switch 00401237
00401249 |> 0B43 04 OR EAX,DWORD PTR DS:[EBX+4] ; Case
2 of switch 00401237
0040124C |> 0B03 OR EAX,DWORD PTR DS:[EBX] ; Case
1 of switch 00401237
0040124E |. EB 11 JMP SHORT Project2.00401261
00401250 |> 8BCE MOV ECX,ESI ;
Default case of switch 00401237
00401252 |. 49 DEC ECX
00401253 |. 51 PUSH ECX ; /Arg2
00401254 |. 53 PUSH EBX ; |Arg1
00401255 |. E8 CEFFFFFF CALL Project2.00401228 ;
\Project2.00401228
0040125A |. 83C4 08 ADD ESP,8
0040125D |. 0B44B3 FC OR EAX,DWORD PTR DS:[EBX+ESI*4-4]
00401261 |> 5E POP ESI
00401262 |. 5B POP EBX
00401263 |. 5D POP EBP
00401264 \. C3 RETN
//==========================================================
//================== Microsoft Compiler ====================
//==========================================================
004017E0 /$ 8B4424 FC MOV EAX,DWORD PTR SS:[ESP-4]
004017E4 |. 0B4424 F8 OR EAX,DWORD PTR SS:[ESP-8]
004017E8 |. 8B4C24 F0 MOV ECX,DWORD PTR SS:[ESP-10]
004017EC |. 0B4424 F4 OR EAX,DWORD PTR SS:[ESP-C]
004017F0 |. 83EC 10 SUB ESP,10
004017F3 |. 0BC8 OR ECX,EAX
004017F5 |. 51 PUSH ECX ; /<%d>
004017F6 |. 68 F4204000 PUSH test.004020F4 ;
|format = "Hello, %d"
004017FB |. FF15 A0204000 CALL DWORD PTR DS:[<&MSVCR80.printf>] ;
\printf
00401801 |. 33C0 XOR EAX,EAX
00401803 |. 83C4 18 ADD ESP,18
00401806 \. C3 RETN
Post by unknown
Post by Mike King
if (sz) return Comp(sz-1) | buf[sz];
else return buf[0];
If you don't want recursion, use while() or for().
switch( sz )
return buf[0];
case 1;
return buf[0] | buf[1];
return Comp(this, sz-2) | buf[sz-1] | buf[sz];
};
Which is pretty much optimized for your Test<2>
For the compiler to remove the "default:" code,
you would need a mechanism to assure the compiler
that it can never be called with a value larger than T-1.
Ahh, perhaps?
if( sz > T-1 ) return 0;
{ return Comp(sz-1) | buf[sz];
Post by Mike King
}else{ return buf[0];
int ret =0;
switch( sz )
ret |= buf[T-1];
ret |= buf[T-2];
};
return ret;
Or, in general
int ret =0;
switch( sz )
{
<expand while T > 0 >
ret |= buf[sz--];
<end macro expansion>
default: // handle too big sz
};
return ret;
int ret = buf[0];
switch( sz )
{
<expand while T > 1 >
ret |= buf[sz--];
<end macro expansion>
};
return ret;
Now, how you do that <expand>, is up to the template gurus to explain.