Is there a preferred integer size? I have a program with a lot of 8-bit data. Is there an advantage to switching over to 32-bit integers? Specifically, I want to know if there is a penalty for 8-bit versus 32-bit access. Recently I have heard that the new ARM parts do not have a penalty for accessing 8-bit data.
Thanks
Solved! Go to Solution.
While there may be no 'penalty for access' to 8/16-bit data (as you can see below, where PktCnts is 16 bytes and PktRSSI is 16 half-words), what the Cortex-M4 doesn't seem to have are any 8/16-bit MATH operations (ignoring the SIMD instructions...). That is to say, for any address-operation/compare/math, the number needs to be a 32-bit 'full native' quantity. Often, values declared as a 'smaller size' incur the penalty of S/UXTB or S/UXTH instructions to extend them to 32 bits for any such usage. You can see that the 'load' of bytes from the arrays for some math on them is 'directly extended' by the LDRB.W construct (and LDRH.W and STRH.W for half-words). So, as you might see, 'it depends on what you are doing' whether there is a performance hit for <32 bits. For counters, I do like '(u)int_fast8_t' for portability, but for IAR at least that equates to uint32_t, as per these loop examples, where R4 is my counter 'i':
uint_fast8_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c08: 0x2010 MOVS R0, #16 ; 0x10
0x8c0a: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c0c: 0x2c00 CMP R4, #0
0x8c0e: 0xd017 BEQ.N ??RFTestMain_46 ; 0x8c40
if( PktCnts[i-1] != 0 )
0x8c10: 0x4897 LDR.N R0, ??DataTable4_10 ; PktCnts
0x8c12: 0x1820 ADDS R0, R4, R0
0x8c14: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c18: 0x2800 CMP R0, #0
0x8c1a: 0xd00f BEQ.N ??RFTestMain_47 ; 0x8c3c
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c1c: 0x4893 LDR.N R0, ??DataTable4_9 ; PktRSSI
0x8c1e: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c22: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c26: 0x4992 LDR.N R1, ??DataTable4_10 ; PktCnts
0x8c28: 0x1861 ADDS R1, R4, R1
0x8c2a: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c2e: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c32: 0x498e LDR.N R1, ??DataTable4_9 ; PktRSSI
0x8c34: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c38: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c3c: 0x1e64 SUBS R4, R4, #1
0x8c3e: 0xe7e5 B.N ??RFTestMain_45 ; 0x8c0c
int32_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c08: 0x2010 MOVS R0, #16 ; 0x10
0x8c0a: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c0c: 0x2c01 CMP R4, #1
0x8c0e: 0xdb17 BLT.N ??RFTestMain_46 ; 0x8c40
if( PktCnts[i-1] != 0 )
0x8c10: 0x4897 LDR.N R0, ??DataTable4_10 ; PktCnts
0x8c12: 0x1820 ADDS R0, R4, R0
0x8c14: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c18: 0x2800 CMP R0, #0
0x8c1a: 0xd00f BEQ.N ??RFTestMain_47 ; 0x8c3c
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c1c: 0x4893 LDR.N R0, ??DataTable4_9 ; PktRSSI
0x8c1e: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c22: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c26: 0x4992 LDR.N R1, ??DataTable4_10 ; PktCnts
0x8c28: 0x1861 ADDS R1, R4, R1
0x8c2a: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c2e: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c32: 0x498e LDR.N R1, ??DataTable4_9 ; PktRSSI
0x8c34: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c38: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c3c: 0x1e64 SUBS R4, R4, #1
0x8c3e: 0xe7e5 B.N ??RFTestMain_45 ; 0x8c0c
uint16_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c1c: 0x2010 MOVS R0, #16 ; 0x10
0x8c1e: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c20: 0xb2a4 UXTH R4, R4
0x8c22: 0x2c01 CMP R4, #1
0x8c24: 0xd31b BCC.N ??RFTestMain_46 ; 0x8c5e
if( PktCnts[i-1] != 0 )
0x8c26: 0xb2a4 UXTH R4, R4
0x8c28: 0x48a4 LDR.N R0, ??DataTable4_6 ; PktCnts
0x8c2a: 0x1820 ADDS R0, R4, R0
0x8c2c: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c30: 0x2800 CMP R0, #0
0x8c32: 0xd012 BEQ.N ??RFTestMain_47 ; 0x8c5a
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c34: 0xb2a4 UXTH R4, R4
0x8c36: 0x48a0 LDR.N R0, ??DataTable4_5 ; PktRSSI
0x8c38: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c3c: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c40: 0xb2a4 UXTH R4, R4
0x8c42: 0x499e LDR.N R1, ??DataTable4_6 ; PktCnts
0x8c44: 0x1861 ADDS R1, R4, R1
0x8c46: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c4a: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c4e: 0xb2a4 UXTH R4, R4
0x8c50: 0x4999 LDR.N R1, ??DataTable4_5 ; PktRSSI
0x8c52: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c56: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c5a: 0x1e64 SUBS R4, R4, #1
0x8c5c: 0xe7e0 B.N ??RFTestMain_45 ; 0x8c20
int8_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c1c: 0x2010 MOVS R0, #16 ; 0x10
0x8c1e: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c20: 0xb264 SXTB R4, R4
0x8c22: 0x2c01 CMP R4, #1
0x8c24: 0xdb1b BLT.N ??RFTestMain_46 ; 0x8c5e
if( PktCnts[i-1] != 0 )
0x8c26: 0xb264 SXTB R4, R4
0x8c28: 0x48a4 LDR.N R0, ??DataTable4_6 ; PktCnts
0x8c2a: 0x1820 ADDS R0, R4, R0
0x8c2c: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c30: 0x2800 CMP R0, #0
0x8c32: 0xd012 BEQ.N ??RFTestMain_47 ; 0x8c5a
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c34: 0xb264 SXTB R4, R4
0x8c36: 0x48a0 LDR.N R0, ??DataTable4_5 ; PktRSSI
0x8c38: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c3c: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c40: 0xb264 SXTB R4, R4
0x8c42: 0x499e LDR.N R1, ??DataTable4_6 ; PktCnts
0x8c44: 0x1861 ADDS R1, R4, R1
0x8c46: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c4a: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c4e: 0xb264 SXTB R4, R4
0x8c50: 0x4999 LDR.N R1, ??DataTable4_5 ; PktRSSI
0x8c52: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c56: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c5a: 0x1e64 SUBS R4, R4, #1
0x8c5c: 0xe7e0 B.N ??RFTestMain_45 ; 0x8c20
Of course the code-result looks 'much better' with some optimization turned on! 'Maximum speed' simply unrolls the loop, while 'minimum size' gets this loop result, where R0 is now 'i', and the array 'static addresses' are pre-loaded outside the loop in R7:
uint32_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x6834: 0x2010 MOVS R0, #16 ; 0x10
??RFTestMain_40:
0x6836: 0x19c1 ADDS R1, R0, R7
0x6838: 0x7bc9 LDRB R1, [R1, #0xf]
0x683a: 0xb129 CBZ R1, ??RFTestMain_41 ; 0x6848
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x683c: 0xeb07 0x0240 ADD.W R2, R7, R0, LSL #1
0x6840: 0x8c53 LDRH R3, [R2, #0x22]
0x6842: 0xfb93 0xf1f1 SDIV R1, R3, R1
0x6846: 0x8451 STRH R1, [R2, #0x22]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_41:
0x6848: 0x1e40 SUBS R0, R0, #1
for(i=sizeof(PktCnts);i>0;i--)
0x684a: 0xd1f4 BNE.N ??RFTestMain_40 ; 0x6836
While there may be no 'penalty for access' to 8/16-bit data (as you can see below, where PktCnts is 16 bytes and PktRSSI is 16 half-words), what the Cortex-M4 doesn't seem to have are any 8/16-bit MATH operations (ignoring the SIMD instructions...). That is to say, for any address-operation/compare/math, the number needs to be a 32-bit 'full native' quantity. Often, values declared as a 'smaller size' incur the penalty of S/UXTB or S/UXTH instructions to extend them to 32 bits for any such usage. You can see that the 'load' of bytes from the arrays for some math on them is 'directly extended' by the LDRB.W construct (and LDRH.W and STRH.W for half-words). So, as you might see, 'it depends on what you are doing' whether there is a performance hit for <32 bits. For counters, I do like '(u)int_fast8_t' for portability, but for IAR at least that equates to uint32_t, as per these loop examples, where R4 is my counter 'i':
uint_fast8_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c08: 0x2010 MOVS R0, #16 ; 0x10
0x8c0a: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c0c: 0x2c00 CMP R4, #0
0x8c0e: 0xd017 BEQ.N ??RFTestMain_46 ; 0x8c40
if( PktCnts[i-1] != 0 )
0x8c10: 0x4897 LDR.N R0, ??DataTable4_10 ; PktCnts
0x8c12: 0x1820 ADDS R0, R4, R0
0x8c14: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c18: 0x2800 CMP R0, #0
0x8c1a: 0xd00f BEQ.N ??RFTestMain_47 ; 0x8c3c
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c1c: 0x4893 LDR.N R0, ??DataTable4_9 ; PktRSSI
0x8c1e: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c22: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c26: 0x4992 LDR.N R1, ??DataTable4_10 ; PktCnts
0x8c28: 0x1861 ADDS R1, R4, R1
0x8c2a: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c2e: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c32: 0x498e LDR.N R1, ??DataTable4_9 ; PktRSSI
0x8c34: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c38: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c3c: 0x1e64 SUBS R4, R4, #1
0x8c3e: 0xe7e5 B.N ??RFTestMain_45 ; 0x8c0c
int32_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c08: 0x2010 MOVS R0, #16 ; 0x10
0x8c0a: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c0c: 0x2c01 CMP R4, #1
0x8c0e: 0xdb17 BLT.N ??RFTestMain_46 ; 0x8c40
if( PktCnts[i-1] != 0 )
0x8c10: 0x4897 LDR.N R0, ??DataTable4_10 ; PktCnts
0x8c12: 0x1820 ADDS R0, R4, R0
0x8c14: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c18: 0x2800 CMP R0, #0
0x8c1a: 0xd00f BEQ.N ??RFTestMain_47 ; 0x8c3c
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c1c: 0x4893 LDR.N R0, ??DataTable4_9 ; PktRSSI
0x8c1e: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c22: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c26: 0x4992 LDR.N R1, ??DataTable4_10 ; PktCnts
0x8c28: 0x1861 ADDS R1, R4, R1
0x8c2a: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c2e: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c32: 0x498e LDR.N R1, ??DataTable4_9 ; PktRSSI
0x8c34: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c38: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c3c: 0x1e64 SUBS R4, R4, #1
0x8c3e: 0xe7e5 B.N ??RFTestMain_45 ; 0x8c0c
uint16_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c1c: 0x2010 MOVS R0, #16 ; 0x10
0x8c1e: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c20: 0xb2a4 UXTH R4, R4
0x8c22: 0x2c01 CMP R4, #1
0x8c24: 0xd31b BCC.N ??RFTestMain_46 ; 0x8c5e
if( PktCnts[i-1] != 0 )
0x8c26: 0xb2a4 UXTH R4, R4
0x8c28: 0x48a4 LDR.N R0, ??DataTable4_6 ; PktCnts
0x8c2a: 0x1820 ADDS R0, R4, R0
0x8c2c: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c30: 0x2800 CMP R0, #0
0x8c32: 0xd012 BEQ.N ??RFTestMain_47 ; 0x8c5a
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c34: 0xb2a4 UXTH R4, R4
0x8c36: 0x48a0 LDR.N R0, ??DataTable4_5 ; PktRSSI
0x8c38: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c3c: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c40: 0xb2a4 UXTH R4, R4
0x8c42: 0x499e LDR.N R1, ??DataTable4_6 ; PktCnts
0x8c44: 0x1861 ADDS R1, R4, R1
0x8c46: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c4a: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c4e: 0xb2a4 UXTH R4, R4
0x8c50: 0x4999 LDR.N R1, ??DataTable4_5 ; PktRSSI
0x8c52: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c56: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c5a: 0x1e64 SUBS R4, R4, #1
0x8c5c: 0xe7e0 B.N ??RFTestMain_45 ; 0x8c20
int8_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x8c1c: 0x2010 MOVS R0, #16 ; 0x10
0x8c1e: 0x0004 MOVS R4, R0
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_45:
0x8c20: 0xb264 SXTB R4, R4
0x8c22: 0x2c01 CMP R4, #1
0x8c24: 0xdb1b BLT.N ??RFTestMain_46 ; 0x8c5e
if( PktCnts[i-1] != 0 )
0x8c26: 0xb264 SXTB R4, R4
0x8c28: 0x48a4 LDR.N R0, ??DataTable4_6 ; PktCnts
0x8c2a: 0x1820 ADDS R0, R4, R0
0x8c2c: 0xf810 0x0c01 LDRB.W R0, [R0, #-0x1]
0x8c30: 0x2800 CMP R0, #0
0x8c32: 0xd012 BEQ.N ??RFTestMain_47 ; 0x8c5a
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x8c34: 0xb264 SXTB R4, R4
0x8c36: 0x48a0 LDR.N R0, ??DataTable4_5 ; PktRSSI
0x8c38: 0xeb10 0x0044 ADDS.W R0, R0, R4, LSL #1
0x8c3c: 0xf830 0x0c02 LDRH.W R0, [R0, #-0x2]
0x8c40: 0xb264 SXTB R4, R4
0x8c42: 0x499e LDR.N R1, ??DataTable4_6 ; PktCnts
0x8c44: 0x1861 ADDS R1, R4, R1
0x8c46: 0xf811 0x1c01 LDRB.W R1, [R1, #-0x1]
0x8c4a: 0xfb90 0xf0f1 SDIV R0, R0, R1
0x8c4e: 0xb264 SXTB R4, R4
0x8c50: 0x4999 LDR.N R1, ??DataTable4_5 ; PktRSSI
0x8c52: 0xeb11 0x0144 ADDS.W R1, R1, R4, LSL #1
0x8c56: 0xf821 0x0c02 STRH.W R0, [R1, #-0x2]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_47:
0x8c5a: 0x1e64 SUBS R4, R4, #1
0x8c5c: 0xe7e0 B.N ??RFTestMain_45 ; 0x8c20
Of course the code-result looks 'much better' with some optimization turned on! 'Maximum speed' simply unrolls the loop, while 'minimum size' gets this loop result, where R0 is now 'i', and the array 'static addresses' are pre-loaded outside the loop in R7:
uint32_t i;
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_39:
0x6834: 0x2010 MOVS R0, #16 ; 0x10
??RFTestMain_40:
0x6836: 0x19c1 ADDS R1, R0, R7
0x6838: 0x7bc9 LDRB R1, [R1, #0xf]
0x683a: 0xb129 CBZ R1, ??RFTestMain_41 ; 0x6848
PktRSSI[i-1] = PktRSSI[i-1]/PktCnts[i-1];
0x683c: 0xeb07 0x0240 ADD.W R2, R7, R0, LSL #1
0x6840: 0x8c53 LDRH R3, [R2, #0x22]
0x6842: 0xfb93 0xf1f1 SDIV R1, R3, R1
0x6846: 0x8451 STRH R1, [R2, #0x22]
for(i=sizeof(PktCnts);i>0;i--)
??RFTestMain_41:
0x6848: 0x1e40 SUBS R0, R0, #1
for(i=sizeof(PktCnts);i>0;i--)
0x684a: 0xd1f4 BNE.N ??RFTestMain_40 ; 0x6836
In theory, #include <stdint.h>, then use (u)int_fast8_t as the type, and your compiler will do the correct thing for the given architecture.
See: <stdint.h> documentation.
Alas, sometimes the theory fails. It also depends on how much RAM you have and how much you can stand to waste using 32 bits to store 8.
Always trade offs.. :-(