> Can you please show me the exact location where you added the .align directive?
The code is in arch/arm/lib/delay.S.
Between kernel 3.5 and 3.6 this file was renamed to "delay-loop.S", so if you're using a newer kernel you'll have to look at that file instead, and make sure the build is actually using it rather than "delay.c", and is not replacing the loop with a timer-based delay.
The function "__udelay()" falls into "__delay()" so you can't directly align the __delay() function.
I added an ".align" directive as shown below. With this code, "__delay()" ends up on an ODD 4-byte boundary and runs slowly. Adding a single "nop" just before "__delay()" changes it to the "fast" behaviour.
/*
* linux/arch/arm/lib/delay.S
*
* Copyright (C) 1995, 1996 Russell King
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/param.h>
.text
/*
 * .align 5 requests 2^5 = 32-byte alignment for the literal words below.
 * Everything that follows is laid out sequentially, so this pins the
 * offset of each instruction, including the __delay() loop, relative to
 * a 32-byte boundary.
 */
.align 5 /* __delay() 2/3 speed on odd-32-bit alignment */
/* .LC0: address of loops_per_jiffy; dereferenced in __const_udelay. */
.LC0: .word loops_per_jiffy
/*
 * .LC1: scale factor that __udelay multiplies into r0.
 * 2199023 is approximately 2^41 / 1000000, so the value is roughly
 * 2^30 * HZ / 1000000.  The 2^30 is divided out again by the three
 * right shifts (14 + 10 + 6 bits) in __const_udelay.
 */
.LC1: .word (2199023*HZ)>>11
/*
 * __udelay: scale the requested delay in r0 (presumably microseconds,
 * going by the name and the 1000000 divisor below -- confirm with
 * callers) and fall through into __const_udelay.
 *
 * Limits assumed by the fixed-point arithmetic:
 */
/*
* r0 <= 2000
* lpj <= 0x01ffffff (max. 3355 bogomips)
* HZ <= 1000
*/
ENTRY(__udelay)
@ r0 = r0 * .LC1
ldr r2, .LC1
mul r0, r2, r0
@ No return here: execution falls through into __const_udelay.
@
@ __const_udelay: convert the pre-scaled value in r0 into a loop count.
@ Each "add ..., r1, lsr #32-N" / "lsr #N" pair below adds 2^N - 1 and then
@ shifts right by N, i.e. divides by 2^N rounding up.
ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06
@ r1 = 0xffffffff, the source of the 2^N - 1 rounding constants
mov r1, #-1
@ r2 = address of loops_per_jiffy, then r2 = its current value
ldr r2, .LC0
ldr r2, [r2] @ max = 0x01ffffff
@ r0 = ceil(r0 / 2^14)
add r0, r0, r1, lsr #32-14
mov r0, r0, lsr #14 @ max = 0x0001ffff
@ r2 = ceil(loops_per_jiffy / 2^10)
add r2, r2, r1, lsr #32-10
mov r2, r2, lsr #10 @ max = 0x00007fff
@ Both operands were reduced above so that the product fits in 32 bits.
mul r0, r2, r0 @ max = 2^32-1
@ r0 = ceil(r0 / 2^6); movs also sets Z when the result is zero
add r0, r0, r1, lsr #32-6
movs r0, r0, lsr #6
@ Zero loops requested: return immediately instead of entering the loop.
moveq pc, lr
/*
* loops = r0 * HZ * loops_per_jiffy / 1000000
*
* Oh, if only we had a cycle counter...
*/
/* Add one or more nops in here to change the code alignment of __delay() */
@ Delay routine
@ __delay: busy-wait loop; r0 = number of iterations.
@ Reached by fall-through from __const_udelay (with r0 != 0) or by a
@ direct call.  Uses only r0 and the condition flags.
ENTRY(__delay)
@ Decrement the count; the flags from this subs drive the bhi below.
subs r0, r0, #1
@ Disabled code: an unrolled variant that repeats the decrement with an
@ early return (movls) between steps.  It is not assembled while the
@ guard below is "if 0".
#if 0
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
movls pc, lr
subs r0, r0, #1
#endif
@ Loop while the last subs left a non-zero result without borrow
@ (HI = C set and Z clear); otherwise fall through and return.
bhi __delay
mov pc, lr
@ Close all three entry points; they share the single body above.
ENDPROC(__udelay)
ENDPROC(__const_udelay)
ENDPROC(__delay)
I got rid of the "#if 0" / "#endif" above and it got 1273 BogoMIPS at 800MHz. From the news posts mentioned previously, that unrolled code was apparently intended for old and slow ARM CPUs.
Tom