diff --git a/TESTS/mbed_platform/wait_ns/main.cpp b/TESTS/mbed_platform/wait_ns/main.cpp
new file mode 100644
index 0000000000..99daca770c
--- /dev/null
+++ b/TESTS/mbed_platform/wait_ns/main.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018, ARM Limited, All Rights Reserved
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "mbed.h"
+#include "greentea-client/test_env.h"
+#include "unity.h"
+#include "utest.h"
+#include "platform/mbed_wait_api.h"
+#include "hal/us_ticker_api.h"
+#include "hal/lp_ticker_api.h"
+
+using namespace utest::v1;
+
+/* This test is created based on the test for Timer class.
+ * Since low power timer is less accurate than regular
+ * timer we need to adjust delta.
+ */
+
+/*
+ * Define tolerance as follows:
+ * Timer might be +/-5% out; wait_ns is permitted 40% slow, but not fast.
+ * Therefore minimum measured time should be 95% of requested, maximum should
+ * be 145%. Unity doesn't let us specify an asymmetric error though.
+ *
+ * Would be nice to have tighter upper tolerance, but in practice we've seen
+ * a few devices unable to sustain theoretical throughput - flash wait states?
+ */
+#define TOLERANCE_MIN 0.95f
+#define TOLERANCE_MAX 1.45f
+#define MIDPOINT ((TOLERANCE_MIN+TOLERANCE_MAX)/2)
+#define DELTA (MIDPOINT-TOLERANCE_MIN)
+
+/* This test verifies if wait_ns's wait time
+ * is accurate, according to a timer.
+ *
+ * Given timer is created.
+ * When timer is used to measure delay.
+ * Then the results are valid (within acceptable range).
+ */
+template<int wait_val_ms, class CompareTimer>
+void test_wait_ns_time_measurement()
+{
+    /* Requested delay converted from milliseconds to seconds. */
+    const float requested_s = (float)wait_val_ms / 1000;
+
+    CompareTimer reference;
+    reference.start();
+
+    /* Spin for wait_val_ms milliseconds in 1 ms (1000000 ns) slices: the
+     * arithmetic inside wait_ns overflows when asked for too large a delay.
+     */
+    int slices_left = wait_val_ms;
+    while (slices_left > 0) {
+        wait_ns(1000000);
+        slices_left--;
+    }
+
+    reference.stop();
+
+    /* Check results - the reference timer must read MIDPOINT +/- DELTA times the request. */
+    TEST_ASSERT_FLOAT_WITHIN(DELTA * requested_s, MIDPOINT * requested_s, reference.read());
+}
+
+utest::v1::status_t test_setup(const size_t number_of_cases)
+{
+    GREENTEA_SETUP(15, "default_auto"); /* NOTE(review): 15 is presumably the test timeout in seconds - confirm */
+    return verbose_test_setup_handler(number_of_cases); /* the verbose handler's result is this handler's status */
+}
+
+Case cases[] = {
+#if DEVICE_LPTICKER /* the LowPowerTimer comparison is only built when DEVICE_LPTICKER is non-zero */
+    Case("Test: wait_ns - compare with lp_timer 1s", test_wait_ns_time_measurement<1000, LowPowerTimer>),
+#endif
+    Case("Test: wait_ns - compare with us_timer 1s", test_wait_ns_time_measurement<1000, Timer>)
+};
+
+Specification specification(test_setup, cases); /* ties test_setup and the case list together for Harness::run() */
+
+int main()
+{
+    return !Harness::run(specification); /* NOTE(review): presumably run() yields true on success, giving exit code 0 - confirm */
+}
diff --git a/platform/mbed_wait_api.h b/platform/mbed_wait_api.h
index 9402d6050a..d0463e5da8 100644
--- a/platform/mbed_wait_api.h
+++ b/platform/mbed_wait_api.h
@@ -78,11 +78,43 @@ void wait_ms(int ms);
  *
  *  @note
  *    This function always spins to get the exact number of microseconds.
- *    If RTOS is present, this will affect power (by preventing deep sleep) and
- *    multithread performance. Therefore, spinning for millisecond wait is not recommended.
+ *    This will affect power and multithread performance. Therefore, spinning for
+ *    millisecond wait is not recommended, and wait_ms() should
+ *    be used instead.
+ *
+ *  @note You may call this function from ISR context, but large delays may
+ *    impact system stability - interrupt handlers should take less than
+ *    50us.
  */
 void wait_us(int us);
 
+/** Waits a number of nanoseconds.
+ *
+ * This function spins the CPU to produce a small delay. It should normally
+ * only be used for delays of 10us (10000ns) or less. As it is calculated
+ * based on the expected execution time of a software loop, it may well run
+ * slower than requested based on activity from other threads and interrupts.
+ * If greater precision is required, this can be called from inside a critical
+ * section.
+ *
+ *  @param ns the number of nanoseconds to wait
+ *
+ *  @note
+ *    wait_us() will likely give more precise time than wait_ns for large-enough
+ *    delays, as it is based on a timer, but its set-up time may be excessive
+ *    for the smallest microsecond counts, at which point wait_ns() is better.
+ *
+ *  @note
+ *    Any delay larger than a millisecond (1000000ns) is liable to cause
+ *    overflow in the internal loop calculation. You shouldn't normally be
+ *    using this for such large delays anyway in real code, but be aware if
+ *    calibrating. Make repeated calls for longer test runs.
+ *
+ *  @note You may call this function from ISR context.
+ *
+ */
+void wait_ns(unsigned int ns);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/platform/mbed_wait_api_no_rtos.c b/platform/mbed_wait_api_no_rtos.c
index d03840e866..6c9523361c 100644
--- a/platform/mbed_wait_api_no_rtos.c
+++ b/platform/mbed_wait_api_no_rtos.c
@@ -15,11 +15,14 @@
  * limitations under the License.
  */
 
+#include "cmsis.h"
+#include "platform/mbed_toolchain.h"
+#include "platform/mbed_wait_api.h"
+
 // This implementation of the wait functions will be compiled only
 // if the RTOS is not present.
 #ifndef MBED_CONF_RTOS_PRESENT
 
-#include "platform/mbed_wait_api.h"
 #include "hal/us_ticker_api.h"
 
 void wait(float s)
@@ -41,3 +44,64 @@ void wait_us(int us)
 
 #endif // #ifndef MBED_CONF_RTOS_PRESENT
 
+// This wait_ns is used by both RTOS and non-RTOS builds
+
+#ifdef __CORTEX_M
+#if (__CORTEX_M == 0 && !defined __CM0PLUS_REV) || __CORTEX_M == 1
+// Cortex-M0 and Cortex-M1 take 6 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 3
+#define LOOP_SCALER 6000 // cycles per iteration x 1000
+#elif (__CORTEX_M == 0 && defined __CM0PLUS_REV) || __CORTEX_M == 3 || __CORTEX_M == 4 || \
+      __CORTEX_M == 23 || __CORTEX_M == 33
+// Cortex-M0+, M3, M4, M23 and M33 take 5 cycles per iteration - SUBS = 1, 2xNOP = 2, BCS = 2
+// TODO - check M33
+#define LOOP_SCALER 5000 // cycles per iteration x 1000
+#elif __CORTEX_M == 7
+// Cortex-M7 manages to dual-issue for 2 cycles per iteration (SUB,NOP) = 1, (NOP,BCS) = 1
+// (The NOPs were added to stabilise this - with just the SUB and BCS, it seems that the
+// M7 sometimes takes 1 cycle, sometimes 2, possibly depending on alignment)
+#define LOOP_SCALER 2000 // cycles per iteration x 1000
+#endif // __CORTEX_M core variants
+#elif defined __CORTEX_A
+#if __CORTEX_A == 9
+// Cortex-A9 is dual-issue, so let's assume same performance as Cortex-M7.
+// TODO - test.
+#define LOOP_SCALER 2000 // cycles per iteration x 1000
+#endif // __CORTEX_A == 9
+#endif // __CORTEX_M / __CORTEX_A
+
+/* We only define the function if we've identified the CPU. If we haven't,
+ * we leave it undefined rather than failing the build with an immediate
+ * #error, so an unsupported CPU only causes an error if wait_ns is actually
+ * used. This leaves the door open to non-ARM builds, and to people providing
+ * substitutes for other CPUs only if they are needed.
+ */
+#ifdef LOOP_SCALER
+
+/* Timing seems to depend on alignment, and toolchains do not support aligning
+ * functions well. So sidestep that by hand-assembling the code. Also avoids
+ * the hassle of handling multiple toolchains with different assembler
+ * syntax.
+ */
+MBED_ALIGN(8) // pin the alignment, since loop timing seems to depend on it
+static const uint16_t delay_loop_code[] = {
+    0x1E40, // SUBS R0,R0,#1  (count down the iteration count held in R0)
+    0xBF00, // NOP
+    0xBF00, // NOP
+    0xD2FB, // BCS .-3        (back to SUBS; in halfwords: 0x00 would be .+2, so 0xFB = -5 = .-3)
+    0x4770  // BX LR          (return once the subtraction borrows and carry is clear)
+};
+
+/* Take the address of the code, set LSB to indicate Thumb, and cast to a prototyped void(uint32_t) function pointer */
+#define delay_loop ((void (*)(uint32_t)) ((uintptr_t) delay_loop_code | 1))
+
+void wait_ns(unsigned int ns)
+{
+    // Whole core clock cycles per microsecond; any fractional MHz is dropped.
+    const uint32_t core_mhz = SystemCoreClock / 1000000;
+    // Integer division deliberately rounds the iteration count down: this
+    // calculation plus the call overhead costs multiple cycles (could well
+    // be 100ns on its own), so startup is worth at least one iteration.
+    const uint32_t iterations = (core_mhz * ns) / LOOP_SCALER;
+    delay_loop(iterations);
+}
+#endif // LOOP_SCALER
diff --git a/targets/TARGET_NUVOTON/TARGET_M2351/device/M2351.h b/targets/TARGET_NUVOTON/TARGET_M2351/device/M2351.h
index b53a38c018..b61abfd967 100644
--- a/targets/TARGET_NUVOTON/TARGET_M2351/device/M2351.h
+++ b/targets/TARGET_NUVOTON/TARGET_M2351/device/M2351.h
@@ -192,7 +192,7 @@ typedef enum IRQn
 /*@}*/ /* end of group CMSIS */
 
 
-#include "core_armv8mbl.h"                  /* Processor and core peripherals */
+#include "core_cm23.h"                      /* Processor and core peripherals */
 #include "system_M2351.h"                   /* System Header */
 
 /**