Add wait_ns API

This provides the ability to generate really small delays - it's often
the case that wait_us() takes multiple microseconds to set up, so
having an alternative suitable for <10us delays is useful.

There have been a few local implementations - it makes sense to
centralise them as they need retuning for each new ARM core.

Based on the local implementation inside the Atmel 802.15.4 driver.
pull/9812/head
Kevin Bracey 2019-01-31 10:40:26 +02:00
parent b08ddaad8b
commit 7215515880
2 changed files with 128 additions and 3 deletions

View File

@ -78,11 +78,43 @@ void wait_ms(int ms);
*
* @note
* This function always spins to get the exact number of microseconds.
* If RTOS is present, this will affect power (by preventing deep sleep) and
* multithread performance. Therefore, spinning for millisecond wait is not recommended.
* This will affect power and multithread performance. Therefore, spinning for
* millisecond wait is not recommended, and wait_ms() should
* be used instead.
*
* @note You may call this function from ISR context, but large delays may
* impact system stability - interrupt handlers should take less than
* 50us.
*/
void wait_us(int us);
/** Waits a number of nanoseconds.
*
* This function spins the CPU to produce a small delay. It should normally
* only be used for delays of 10us (10000ns) or less. As the delay is derived
* from the expected execution time of a software loop, it may well run
* slower than requested because of activity from other threads and interrupts.
* If greater precision is required, this can be called from inside a critical
* section.
*
* @param ns the number of nanoseconds to wait
*
* @note
* wait_us() will likely give more precise time than wait_ns() for large-enough
* delays, as it is based on a timer, but its set-up time may be excessive
* for the smallest microsecond counts, at which point wait_ns() is better.
*
* @note
* Any delay larger than a millisecond (1000000ns) is liable to cause
* overflow in the internal loop calculation. You shouldn't normally be
* using this for such large delays anyway in real code, but be aware if
* calibrating. Make repeated calls for longer test runs.
*
* @note
* The implementation is only provided for CPU cores whose loop timing has
* been identified; on other cores the function is left undefined unless a
* substitute is supplied.
*
* @note You may call this function from ISR context.
*
*/
void wait_ns(unsigned int ns);
#ifdef __cplusplus
}
#endif

View File

@ -15,11 +15,13 @@
* limitations under the License.
*/
#include "cmsis.h"
#include "platform/mbed_wait_api.h"
// This implementation of the wait functions will be compiled only
// if the RTOS is not present.
#ifndef MBED_CONF_RTOS_PRESENT
#include "platform/mbed_wait_api.h"
#include "hal/us_ticker_api.h"
void wait(float s)
@ -41,3 +43,94 @@ void wait_us(int us)
#endif // #ifndef MBED_CONF_RTOS_PRESENT
// This wait_ns is used by both RTOS and non-RTOS builds
// LOOP_SCALER is 1000 x the number of CPU cycles one delay-loop iteration takes
// on the target core. wait_ns() divides (core cycles per microsecond * ns) by it
// to obtain the loop count.
#ifdef __CORTEX_M
#if (__CORTEX_M == 0 && !defined __CM0PLUS_REV) || __CORTEX_M == 1
// Cortex-M0 and Cortex-M1 take 6 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
#define LOOP_SCALER 6000
#elif (__CORTEX_M == 0 && defined __CM0PLUS_REV) || __CORTEX_M == 3 || __CORTEX_M == 4 || \
__CORTEX_M == 23 || __CORTEX_M == 33
// Cortex-M0+, M3, M4, M23 and M33 take 5 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 2
// TODO - check M33
#define LOOP_SCALER 5000
#elif __CORTEX_M == 7
// Cortex-M7 manages to dual-issue for 2 cycles per iteration (SUB,NOP) = 1, (NOP,BCS) = 1
// (The NOPs were added to stabilise this - with just the SUB and BCS, it seems that the
// M7 sometimes takes 1 cycle, sometimes 2, possibly depending on alignment)
#define LOOP_SCALER 2000
#endif
#elif defined __CORTEX_A
#if __CORTEX_A == 9
// Cortex-A9 is dual-issue, so let's assume same performance as Cortex-M7.
// TODO - test.
#define LOOP_SCALER 2000
#endif
#endif
/* We only define the function if we've identified the CPU. If we haven't,
* we leave it undefined rather than faulting with an immediate #error.
* This leaves the door open to non-ARM builds, or to people providing
* substitutes for other CPUs, and means the missing function only becomes
* a problem if it is actually needed.
*/
#ifdef LOOP_SCALER
// *INDENT-OFF*
#ifdef __CC_ARM /* ARMC5 */
/* Busy-wait loop, ARM Compiler 5 embedded-assembler variant.
 *
 * Each pass decrements the first-argument register a1 (SUBS), executes two
 * NOPs, and branches back to local label 1 while the subtraction left the
 * carry flag set (BCS) - i.e. until the counter wraps below zero. The loop
 * body therefore runs count + 1 times before returning with BX lr.
 *
 * The instruction sequence is timing-critical: the LOOP_SCALER values are
 * quoted in cycles for exactly SUBS + 2xNOP + BCS, so keep it in step with
 * the other compiler variants of this function.
 */
__asm static void delay_loop(uint32_t count)
{
1
SUBS a1, a1, #1
NOP
NOP
BCS %BT1
BX lr
}
#elif defined (__ICCARM__)
/* Busy-wait loop, IAR inline-assembler variant.
 *
 * Each pass decrements count (SUBS), executes two NOPs, and branches back to
 * the label while the subtraction left the carry flag set (BCS.n) - i.e.
 * until the counter wraps below zero, so the loop body runs count + 1 times.
 * count is a read-write operand ("+r") and the condition flags are declared
 * clobbered ("cc").
 *
 * The instruction sequence is timing-critical: the LOOP_SCALER values are
 * quoted in cycles for exactly SUBS + 2xNOP + BCS.
 *
 * NOTE(review): the fixed label name "loop" could clash if this asm statement
 * were ever emitted more than once in a translation unit (e.g. through
 * inlining) - confirm against the IAR toolchain behaviour.
 */
static void delay_loop(uint32_t count)
{
__asm volatile(
"loop: \n"
" SUBS %0, %0, #1 \n"
" NOP\n"
" NOP\n"
" BCS.n loop\n"
: "+r" (count)
:
: "cc"
);
}
#else // GCC or ARMC6
/* Busy-wait loop, GCC / ARM Compiler 6 inline-assembler variant.
 *
 * Each pass decrements count, executes two NOPs, and branches back while the
 * subtraction left the carry flag set (BCS) - i.e. until the counter wraps
 * below zero, so the loop body runs count + 1 times.
 *
 * "%=" expands to a number unique to this asm statement, giving a numeric
 * local label that "%=b" refers back to. count is a read-write operand
 * constrained to a low register ("+l"), and the condition flags are declared
 * clobbered ("cc").
 *
 * The instruction sequence is timing-critical: the LOOP_SCALER values are
 * quoted in cycles for exactly SUBS + 2xNOP + BCS.
 */
static void delay_loop(uint32_t count)
{
__asm__ volatile (
"%=:\n\t"
/* Only GCC insists on non-UAL assembly for Thumb v1 */
#if !defined(__ARMCC_VERSION) && defined(__thumb__) && !defined(__thumb2__)
"SUB %0, #1\n\t"
#else
"SUBS %0, %0, #1\n\t"
#endif
"NOP\n\t"
"NOP\n\t"
"BCS %=b\n\t"
: "+l" (count)
:
: "cc"
);
}
#endif
// *INDENT-ON*
/* Spin for approximately ns nanoseconds using the calibrated delay loop.
 *
 * The iteration count is (core cycles per microsecond * ns) / LOOP_SCALER,
 * evaluated in 32-bit unsigned arithmetic, so very large ns values wrap.
 */
void wait_ns(unsigned int ns)
{
    // Whole core-clock cycles per microsecond; any fractional MHz is dropped.
    const uint32_t core_mhz = SystemCoreClock / 1000000;

    // This arithmetic plus the call overhead already costs several cycles
    // (quite possibly 100ns by itself), so the truncating division is
    // intentional: start-up time stands in for at least one loop iteration.
    delay_loop((core_mhz * ns) / LOOP_SCALER);
}
#endif // LOOP_SCALER