Negative effect of enabling Prefetcher for XIP from NOR flash memory

k_farhangi · ‎04-01-2020

Using an IMXRT1062 on an IMXRT1060 Evaluation Kit, I'm trying to run code directly from NOR flash memory (XIP). I noticed that if I disable the prefetcher (AHBCR->PREFETCHEN = 0), the code runs slightly faster (about 12%, to be exact). I'm a bit confused, because I expected disabling the prefetcher to have the opposite effect. Can someone explain under what conditions this can happen, or what I am doing wrong?

The flash memory configuration is as follows:

/*
 * FlexSPI serial NOR configuration block.
 *
 * The object is emitted into the ".boot_hdr.conf" linker section and marked
 * "used" so it is kept even though nothing in the program references it.
 * NOTE(review): presumably this is the block the boot ROM reads to set up
 * FlexSPI for XIP -- confirm against the i.MX RT1060 reference manual.
 *
 * Summary of what the initializer sets:
 *   - quad (4-pad) serial NOR on port A1, 8 MB, 133 MHz serial clock
 *   - read sampling clock looped back from the DQS pad
 *   - a 64-entry lookup table laid out as 16 sequences of 4 entries each;
 *     unused entries are explicit STOP/STOP fillers
 *   - 256-byte pages, 4096-byte sectors, 0x10000-byte (64 KiB) blocks
 *   - IP commands clocked at 30 MHz
 */
static const FlexSPINorConfig __attribute__((section(".boot_hdr.conf"), used)) g_flash_config =
{
    .memConfig = {
        .tag                  = FLEXSPI_CFG_BLK_TAG,
        .version              = FLEXSPI_CFG_BLK_VERSION,
        .readSampleClkSrc     = kFlexSPIReadSampleClk_LoopbackFromDqsPad,
        .csHoldTime           = 3u,
        .csSetupTime          = 3u,
        .columnAddressWidth   = 0u,
        .configCmdEnable      = 0u,
        // Only the "safe config frequency enable" misc option bit is set.
        .controllerMiscOption = (1 << kFlexSpiMiscOffset_SafeConfigFreqEnable),
        .deviceType           = kFlexSpiDeviceType_SerialNOR,
        .sflashPadType        = kSerialFlash_4Pads,
        .serialClkFreq        = kFlexSpiSerialClk_133MHz,
        .lutCustomSeqEnable   = 0u,
        .sflashA1Size         = 0x00800000u, /* 8MB/64Mbit */
        // Lookup table: sequence N occupies entries [4*N] .. [4*N + 3].
        .lookupTable =
            {
                // 0 Fast read sequence:
                //   command 0xEB on 1 pad, address (operand 0x18) on 4 pads,
                //   then dummy (operand 0x06) and data read on 4 pads.
                //   NOTE(review): 0x18 is presumably the address width in bits (24)
                //   and 0x06 the dummy cycle count -- confirm against the flash datasheet.
                [0] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0xEB, RADDR_SDR, FLEXSPI_4PAD,0x18),
                [1] = FLEXSPI_LUT_SEQ(DUMMY_SDR, FLEXSPI_4PAD, 0x06, READ_SDR, FLEXSPI_4PAD, 0x04),
                [2] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [3] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 1 Read Status sequence: command 0x05, then read on 1 pad
                [4] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0x05, READ_SDR, FLEXSPI_1PAD, 0x04),
                [5] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [6] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [7] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 2 Reserved
                [8]  = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [9]  = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [10] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [11] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 3 Write Enable sequence: command 0x06 only
                [12] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0x06, STOP, 0, 0),
                [13] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [14] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [15] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 4 Reserved
                [16] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [17] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [18] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [19] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 5 Erase sector sequence: command 0x20, then address on 1 pad
                [20] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0x20, RADDR_SDR, FLEXSPI_1PAD, 0x18),
                [21] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [22] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [23] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 6 Reserved
                [24] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [25] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [26] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [27] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 7 Reserved
                [28] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [29] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [30] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [31] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 8 Not reserved: this slot is populated with command 0xD8 followed by
                //   an address on 1 pad.
                //   NOTE(review): presumably the block erase command matching
                //   .blockSize below -- confirm against the flash datasheet.
                [32] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0xD8, RADDR_SDR, FLEXSPI_1PAD, 0x18),
                [33] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [34] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [35] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 9 Page program sequence: command 0x02, address, then write on 1 pad
                [36] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0x02, RADDR_SDR, FLEXSPI_1PAD, 0x18),
                [37] = FLEXSPI_LUT_SEQ(WRITE_SDR, FLEXSPI_1PAD, 0x04, STOP, 0, 0),
                [38] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [39] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 10 Reserved
                [40] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [41] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [42] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [43] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 11 Chip erase sequence: command 0x60 only
                [44] = FLEXSPI_LUT_SEQ(CMD_SDR, FLEXSPI_1PAD, 0x60, STOP, 0, 0),
                [45] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [46] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [47] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 12 Reserved
                [48] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [49] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [50] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [51] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 13 Reserved
                [52] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [53] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [54] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [55] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 14 Reserved
                [56] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [57] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [58] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [59] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                // 15 Reserved
                [60] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [61] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [62] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
                [63] = FLEXSPI_LUT_SEQ(STOP, 0, 0, STOP, 0, 0),
            },
    },
    // Flash geometry, in bytes.
    .pageSize = 256,
    .sectorSize = 4096,
    // Clock used for IP (non-AHB) commands; lower than the 133 MHz read clock above.
    .ipcmdSerialClkFreq = kFlexSpiSerialClk_30MHz,
    .isUniformBlockSize = 0,
    .serialNorType = 0,
    .needExitNoCmdMode = 0,
    .halfClkForNonReadCmd = 0,
    .needRestoreNoCmdMode = 0,
    .blockSize = 0x10000, // 64 KiB
};

jeremyzhou · ‎04-12-2020

Hi Kamran Farhangi ,

Sorry for the late reply.
I ran a test similar to yours (Fig 1), comparing the execution time of an mbedtls library function with the AHB Read Prefetch feature disabled and enabled; however, my result is the opposite of yours.
I've attached the testing demo, maybe you can give a try.

Fig 1

Have a great day,
TIC

-------------------------------------------------------------------------------
Note:
- If this post answers your question, please click the "Mark Correct" button. Thank you!

- We are following threads for 7 weeks after the last post, later replies are ignored
Please open a new thread and refer to the closed one, if you have a related question at a later point in time.
-------------------------------------------------------------------------------

在原帖中查看解决方案

k_farhangi · ‎04-06-2020

Hi jeremyzhou‌,

Sorry for the late reply,

The code that I mentioned earlier is part of a bigger project, and I cannot upload the whole project here. I isolated the part that is related to my question and attached it to this reply. You will probably need to include the cmake file and possibly change a few things so that it compiles on your setup. Thank you in advance.

jeremyzhou · ‎04-12-2020

Hi Kamran Farhangi ,

Sorry for the late reply.
I ran a test similar to yours (Fig 1), comparing the execution time of an mbedtls library function with the AHB Read Prefetch feature disabled and enabled; however, my result is the opposite of yours.
I've attached the testing demo, maybe you can give a try.

Fig 1

Have a great day,
TIC

-------------------------------------------------------------------------------
Note:
- If this post answers your question, please click the "Mark Correct" button. Thank you!

- We are following threads for 7 weeks after the last post, later replies are ignored
Please open a new thread and refer to the closed one, if you have a related question at a later point in time.
-------------------------------------------------------------------------------

k_farhangi · ‎04-19-2020

Thanks, then I guess something else is happening while I'm performing this test, like some thread switching in the background or so, because I run this example inside an RTOS and with a minimized number of background tasks. Now I know that the prefetcher does increase the performance in an example like this.

k_farhangi · ‎04-02-2020

Hi jeremyzhou‌,

Thank you for your reply. To test the code execution time, I am using a jpeg decoder and I have made 8 copies of it with different function names so that it is big enough to fill i-cache between iterations. Here is the code:

// Benchmark loop: repeatedly times a batch of eight JPEG decodes and prints
// the mean, minimum and maximum duration of each 1000-iteration run.
//
// Eight separately named copies of the decoder (dec .. dec8) are used so that
// the code executed between iterations is large relative to the instruction
// cache. Each timed section is bracketed by two reads of DWT->CYCCNT.
// GetNs() is defined elsewhere; it presumably converts the two cycle-counter
// readings into elapsed nanoseconds -- confirm against its definition.
while(true)
{
    uint32_t maxNs = 0;
    uint32_t minNs = UINT32_MAX;
    uint64_t sumNs = 0; // 64-bit: 1000 samples of up to UINT32_MAX ns each
    const uint32_t count = 1000;

    for(uint32_t i = 0; i < count; i++)
    {
        const uint32_t start = DWT->CYCCNT;

        // Decoder construction is deliberately inside the timed region,
        // exactly as in the original measurement.
        dec decoder;
        decoder.decode(pic, sizeof(pic));
        dec2 decoder2;
        decoder2.decode(pic, sizeof(pic));
        dec3 decoder3;
        decoder3.decode(pic, sizeof(pic));
        dec4 decoder4;
        decoder4.decode(pic, sizeof(pic));

        dec5 decoder5;
        decoder5.decode(pic, sizeof(pic));
        dec6 decoder6;
        decoder6.decode(pic, sizeof(pic));
        dec7 decoder7;
        decoder7.decode(pic, sizeof(pic));
        dec8 decoder8;
        decoder8.decode(pic, sizeof(pic));

        const uint32_t end = DWT->CYCCNT;
        const uint32_t ns = GetNs(start, end);

        if(ns > maxNs)
        {
            maxNs = ns;
        }
        if(ns < minNs)
        {
            minNs = ns;
        }
        sumNs += ns;
    }

    // The mean of uint32_t samples always fits in uint32_t; make the
    // narrowing from the 64-bit sum explicit.
    const uint32_t mean = static_cast<uint32_t>(sumNs / count);

    // "%u" requires an argument of type unsigned int. uint32_t is a different
    // type on some toolchains (e.g. unsigned long with arm-none-eabi/newlib),
    // so cast explicitly instead of relying on the two types coinciding.
    printf("Timings: Mean = %u us, Min = %u us, Max = %u us",
           static_cast<unsigned>(mean / 1000),
           static_cast<unsigned>(minNs / 1000),
           static_cast<unsigned>(maxNs / 1000));
}

When the prefetch bit is set, the mean execution time (the one that is printed out) is about 2372 microseconds. Then if I set a breakpoint somewhere outside the for loop and set the AHBCR->PREFETCHEN register to zero, the code execution time will be 2090 microseconds, about 88% of the time with prefetcher being on. This behavior is pretty consistent and if I set AHBCR->PREFETCHEN to one again, it will jump up to 2372 microseconds again.

jeremyzhou · ‎04-02-2020

Hi Kamran Farhangi ,

Thanks for your reply.
I was wondering if you could share the complete demo, as I'd like to test it on my side. In addition, I'd suggest running mbedtls_selftest, which exercises a variety of cryptographic algorithms, to double-check the behavior you described.

Have a great day,
TIC

-------------------------------------------------------------------------------
Note:
- If this post answers your question, please click the "Mark Correct" button. Thank you!

- We are following threads for 7 weeks after the last post, later replies are ignored
Please open a new thread and refer to the closed one, if you have a related question at a later point in time.
-------------------------------------------------------------------------------

jeremyzhou · ‎04-01-2020

Hi Kamran Farhangi ,

Thank you for your interest in NXP Semiconductor products and
for the opportunity to serve you.
Before answering your question, I'd like to get more information, so I was wondering if you could describe in detail the testing steps you performed.
Looking forward to your reply.

Have a great day,
TIC

-------------------------------------------------------------------------------
Note:
- If this post answers your question, please click the "Mark Correct" button. Thank you!

- We are following threads for 7 weeks after the last post, later replies are ignored
Please open a new thread and refer to the closed one, if you have a related question at a later point in time.
-------------------------------------------------------------------------------