Specific 'Bit Banding', as per that ARM document, IS enabled in (at least) the Kinetis 'K' M4 devices. Pretty easy to use on peripherals; there is even a macro for it in the Kinetis headers as 'BITBAND_REG'. It is a little harder to use in RAM, mostly because the tools (compilers and linkers) won't help! But here is an example for a graphics display RAM, in this case using the least-significant-byte of each 32-bit word as screen data (so that the upper bits can be preset for a SPI DMA refresh).
So, you have to pre-allocate the space yourself: place the buffer at a fixed address, and remove that region from the memory range the linker is allowed to allocate:
//Frame buffer for the display, pinned to a fixed address so that its bit-band
// alias addresses can be computed from it. '#pragma location' and '__no_init'
// are toolchain extensions (they look like IAR syntax -- confirm for your
// compiler); the linker configuration must leave this region unallocated.
#pragma location=0x2000EF00
__no_init u32_t Display_RAM[Y_PITCH*8]; //Pre-allocated fixed-space for Display RAM: Y_PITCH words per page, 8 pages
//Necessary, in SRAM-U, to use bit-banding! (i.e. at or above 0x20000000, the base the alias macro subtracts)
Then the SRAM bit-band macro is pretty similar to theirs:
//Address of the bit-band alias word for bit 'Bit' of the SRAM location 'Addr'.
// Each byte above the 0x20000000 base occupies 32 bytes of alias space starting
// at 0x22000000 (8 bits x one 4-byte alias word per bit), and each bit within
// the addressed location adds a further 4 bytes.
// Yields an integer; cast the result to a pointer before dereferencing.
#define BITBAND_SRAM_A(Addr,Bit) \
    (0x22000000u \
     + (((u32_t)(Addr) - (u32_t)0x20000000u) * 32u) \
     + (((u32_t)(Bit)) * 4u))
And here is a line-drawing routine that uses that bit-banding to avoid a detailed read-modify-write (RMW) of display RAM bits, instead writing each pixel as a single bit-store:
///Draw a line from (xstrt,ystrt) to (xfin,yfin) using the 32-bit dashing 'pattern'.
// Modified Bresenham algorithm, using ARM M4 'bit banding' RAM access so that each
// pixel is a single word store to the bit-band alias (no read-modify-write).
//
// xstrt,ystrt  First pixel. Clamped to 0..127 / 0..63.
// xfin,yfin    Last pixel. Clamped the same way (the line is re-aimed at the
//              clamped point, it is not clipped).
// pattern      Dash pattern: bit 0 is written to each pixel in turn, then the
//              pattern is rotated right by one, so it repeats every 32 pixels.
//
// Display_RAM layout as addressed here: one 32-bit word per column, Y_PITCH words
// per 8-row page, and pixel row y held in bit (y&7) of the word in page (y>>3).
#define Y_PITCH 132
void OLED_Line(u16_t xstrt, u16_t ystrt, u16_t xfin, u16_t yfin, u32_t pattern)
{
    enum { X_MAX = 127, Y_MAX = 63 };        //Last valid column and row
    //Alias words to add after stepping off bit 7 of one page to land on bit 0 of
    // the same column in the next page: a whole page of words (Y_PITCH*32 alias
    // words) less the 8 single-bit steps already taken. Negated for upward moves.
    const s32_t pagestep = (Y_PITCH<<5)-8;
    //The store lands in the bit-band alias region and the hardware updates the
    // underlying Display_RAM bit, which the compiler cannot see: keep it volatile.
    volatile u32_t _PTR_ offset;
    s32_t stepx = 1<<5;                      //Alias words per Display_RAM word, i.e. one column in x
    s32_t diffx;
    s32_t diffy;
    s32_t major;                             //Larger of |dx|,|dy|: one step per pixel along this axis
    s32_t minor;                             //Smaller of |dx|,|dy|: stepped when the error term overflows
    s32_t eterm;                             //Error-term from fractional accumulations
    s32_t remaining;                         //Steps still to take after the current pixel
    u16_t ybit;                              //Bit number (row within the page) of the current pixel
    int yneg;                                //Non-zero when the line runs towards smaller y
    int xmajor;                              //Non-zero for 'mostly horizontal' lines
    int minorstep;

    if (xstrt > X_MAX) { xstrt = X_MAX; }    //A little sanity checking!
    if (xfin  > X_MAX) { xfin  = X_MAX; }
    if (ystrt > Y_MAX) { ystrt = Y_MAX; }
    if (yfin  > Y_MAX) { yfin  = Y_MAX; }

    //Locate the word for the start column within its page, then convert to the
    // bit-band alias address of the start row's bit within that word.
    ybit = ystrt&7;
    offset = (volatile u32_t _PTR_)BITBAND_SRAM_A(&Display_RAM[xstrt] + (ystrt>>3)*Y_PITCH, ybit);

    diffx = (s32_t)xfin - (s32_t)xstrt;
    diffy = (s32_t)yfin - (s32_t)ystrt;
    if (diffx < 0)
    {
        stepx = -stepx;
        diffx = -diffx;
    }
    yneg = (diffy < 0);
    if (yneg)
    {
        diffy = -diffy;
    }

    xmajor = (diffx > diffy);                //'mostly horizontal': integer x, fractional y
    major = xmajor ? diffx : diffy;
    minor = xmajor ? diffy : diffx;

    //Start the error term at half the major delta and step the minor axis on '>=':
    // this centres the steps and makes exactly 'minor' of them over the 'major'
    // moves, so the last pixel drawn is (xfin,yfin).
    eterm = major/2;

    for (remaining = major; ; remaining--)
    {
        *offset = pattern;                   //LSB-only is written
        pattern = (pattern>>1) | ((pattern&1)<<31);   //Rotate-right 1
        if (remaining == 0)
        {
            break;                           //Last pixel done: do not step the pointer past the line
        }

        eterm += minor;
        minorstep = (eterm >= major);
        if (minorstep)
        {
            eterm -= major;
        }

        if (xmajor || minorstep)             //x moves every pixel when major, else only on overflow
        {
            offset += stepx;
        }
        if (!xmajor || minorstep)            //y moves every pixel when major, else only on overflow
        {
            //Each step in y moves to the next bit (one alias word, since 'offset'
            // is a u32 pointer). Once the bit number wraps, move to the same
            // column in the page below/above.
            if (yneg)
            {
                offset -= 1;
                ybit = (ybit-1)&7;
                if (ybit == 7)
                {
                    offset -= pagestep;
                }
            }
            else
            {
                offset += 1;
                ybit = (ybit+1)&7;
                if (ybit == 0)
                {
                    offset += pagestep;
                }
            }
        }
    }
}