diff --git a/platform/mbed_wait_api_no_rtos.c b/platform/mbed_wait_api_no_rtos.c
index 7fca558709..6c9523361c 100644
--- a/platform/mbed_wait_api_no_rtos.c
+++ b/platform/mbed_wait_api_no_rtos.c
@@ -16,6 +16,7 @@
  */
 
 #include "cmsis.h"
+#include "platform/mbed_toolchain.h"
 #include "platform/mbed_wait_api.h"
 
 // This implementation of the wait functions will be compiled only
@@ -47,11 +48,11 @@ void wait_us(int us)
 
 #ifdef __CORTEX_M
 #if (__CORTEX_M == 0 && !defined __CM0PLUS_REV) || __CORTEX_M == 1
-// Cortex-M0 and Cortex-M1 take 7 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
+// Cortex-M0 and Cortex-M1 take 6 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
 #define LOOP_SCALER 6000
 #elif (__CORTEX_M == 0 && defined __CM0PLUS_REV) || __CORTEX_M == 3 || __CORTEX_M == 4 || \
       __CORTEX_M == 23 || __CORTEX_M == 33
-// Cortex-M0+, M3, M4, M23 and M33 take 6 cycles per iteration - SUBS = 1, 3xNOP = 2, BCS = 2
+// Cortex-M0+, M3, M4, M23 and M33 take 5 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 2
 // TODO - check M33
 #define LOOP_SCALER 5000
 #elif __CORTEX_M == 7
@@ -76,52 +77,22 @@ void wait_us(int us)
  */
 #ifdef LOOP_SCALER
 
-// *INDENT-OFF*
-#ifdef __CC_ARM /* ARMC5 */
-__asm static void delay_loop(uint32_t count)
-{
-1
-  SUBS a1, a1, #1
-  NOP
-  NOP
-  BCS  %BT1
-  BX   lr
-}
-#elif defined (__ICCARM__)
-static void delay_loop(uint32_t count)
-{
-  __asm volatile(
-    "loop: \n"
-    " SUBS %0, %0, #1 \n"
-    " NOP\n"
-    " NOP\n"
-    " BCS.n  loop\n"
-    : "+r" (count)
-    :
-    : "cc"
-  );
-}
-#else // GCC or ARMC6
-static void delay_loop(uint32_t count)
-{
-  __asm__ volatile (
-    "%=:\n\t"
-/* Only GCC insists on non-UAL assembly for Thumb v1 */
-#if !defined(__ARMCC_VERSION) && defined(__thumb__) && !defined(__thumb2__)
-    "SUB  %0, #1\n\t"
-#else
-    "SUBS %0, %0, #1\n\t"
-#endif
-    "NOP\n\t"
-    "NOP\n\t"
-    "BCS  %=b\n\t"
-    : "+l" (count)
-    :
-    : "cc"
-  );
-}
-#endif
-// *INDENT-ON*
+/* Timing seems to depend on alignment, and toolchains do not support aligning
+ * functions well. So sidestep that by hand-assembling the code. Also avoids
+ * the hassle of handling multiple toolchains with different assembler
+ * syntax.
+ */
+MBED_ALIGN(8)
+static const uint16_t delay_loop_code[] = {
+    0x1E40, // SUBS R0,R0,#1
+    0xBF00, // NOP
+    0xBF00, // NOP
+    0xD2FB, // BCS .-3        (0x00 would be .+2, so 0xFB = -5 = .-3)
+    0x4770  // BX LR
+};
+
+/* Take the address of the code, set LSB to indicate Thumb, and cast to void() function pointer */
+#define delay_loop ((void(*)()) ((uintptr_t) delay_loop_code | 1))
 
 void wait_ns(unsigned int ns)
 {