From 7dddd9e578fb0bd553f8ec9d615fdff109d06c5a Mon Sep 17 00:00:00 2001
From: Adam Green <adamgr@foo.bar>
Date: Sun, 25 Aug 2013 00:43:15 -0700
Subject: [PATCH] Asm versions of netstack memcpy() and lwip_standard_chksum()

For tests such as TCPEchoServer
(http://mbed.org/users/emilmont/notebook/networking-libraries-benchmark/)
this change showed a 28% improvement (14Mbps to 18Mbps) when the echo
test was modified to instead use 1K data buffers.

I targeted these two functions based on manual profiling samples which
showed that a great deal of time was being spent in these two functions
when the network stack was being slammed with UDP packets.
---
 libraries/net/lwip/lwip-sys/arch/cc.h       |  17 ++-
 libraries/net/lwip/lwip-sys/arch/checksum.c | 126 ++++++++++++++++++++
 libraries/net/lwip/lwip-sys/arch/memcpy.c   |  59 +++++++++
 3 files changed, 200 insertions(+), 2 deletions(-)
 create mode 100644 libraries/net/lwip/lwip-sys/arch/checksum.c
 create mode 100644 libraries/net/lwip/lwip-sys/arch/memcpy.c

diff --git a/libraries/net/lwip/lwip-sys/arch/cc.h b/libraries/net/lwip/lwip-sys/arch/cc.h
index 394635a612..a17082e2ba 100644
--- a/libraries/net/lwip/lwip-sys/arch/cc.h
+++ b/libraries/net/lwip/lwip-sys/arch/cc.h
@@ -82,8 +82,21 @@ typedef uintptr_t          mem_ptr_t;
     #define ALIGNED(n)  __attribute__((aligned (n)))
 #endif 
 
-/* Used with IP headers only */
-#define LWIP_CHKSUM_ALGORITHM 1
+/* Route lwIP's MEMCPY and checksum hooks to Thumb-2 assembly on GCC builds */
+#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
+    #define MEMCPY(dst,src,len)     thumb2_memcpy(dst,src,len)
+    #define LWIP_CHKSUM             thumb2_checksum
+    /* Set algorithm to 0 so that unused lwip_standard_chksum function
+       doesn't generate compiler warning */
+    #define LWIP_CHKSUM_ALGORITHM   0
+    /* Assembly routines defined in arch/memcpy.c and arch/checksum.c. */
+    void* thumb2_memcpy(void* pDest, const void* pSource, size_t length);
+    u16_t thumb2_checksum(const void* pData, int length);
+#else
+    /* Used with IP headers only */
+    #define LWIP_CHKSUM_ALGORITHM   1
+#endif
+
 
 #ifdef LWIP_DEBUG
 
diff --git a/libraries/net/lwip/lwip-sys/arch/checksum.c b/libraries/net/lwip/lwip-sys/arch/checksum.c
new file mode 100644
index 0000000000..e97aef39d0
--- /dev/null
+++ b/libraries/net/lwip/lwip-sys/arch/checksum.c
@@ -0,0 +1,126 @@
+/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
+
+
+/* Hand-written Thumb-2 assembly version of the algorithm 3 variant of
+   lwip_standard_chksum in lwIP's inet_chksum.c.  The main loop sums the data
+   32 bits at a time and is unrolled to perform two such adds per iteration.
+
+   pData:   first byte to sum; may start at any byte alignment.
+   length:  number of bytes to sum; compared as a signed value.
+
+   Returns (in r0): the 16-bit 1's complement sum, not inverted.
+
+   NOTE: The C return type is void even though the assembly leaves a uint16_t
+         in r0, so that GCC doesn't warn about a return it cannot see.
+*/
+__attribute__((naked)) void /*uint16_t*/ thumb2_checksum(const void* pData, int length)
+{
+    __asm (
+        ".syntax unified\n"
+        ".thumb\n"
+
+        // Save r4, the only callee-saved register used here.  lr is pushed too so
+        // the stack stays 8-byte aligned and one pop can both restore and return.
+        "    push        {r4, lr}\n"
+        // Initialize the running sum, r2, to 0.
+        "    movs    r2, #0\n"
+        // Record in r3 whether pData starts at an odd address.  If it does, the
+        // bytes are summed one position out of phase and the final result must
+        // be byte swapped to compensate.
+        "    ands    r3, r0, #1\n"
+        // Already 2-byte aligned (Z set by the ands above)?  Then skip ahead.
+        "    beq     1$\n"
+        // Odd start but nothing to sum: return the 0 already held in r2.
+        "    cbz     r1, 9$\n"
+
+        // 2-byte align.
+        // Load the first byte into bits 15:8 of the sum, since the result is
+        // byte swapped at the end.  Overwriting r2 is fine as it has only held 0
+        // so far.  The load post-increments r0; r1 is decremented to match.
+        "    ldrb    r2, [r0], #1\n"
+        "    lsls    r2, r2, #8\n"
+        "    subs    r1, r1, #1\n"
+
+        // Already 4-byte aligned?  Then skip ahead to the main loop.
+        "1$:\n"
+        "    ands    r4, r0, #3\n"
+        "    beq     2$\n"
+        // Fewer than 2 bytes left?  Then there is no half-word to align with, so
+        // skip ahead to the trailing byte handling.
+        "    cmp     r1, #2\n"
+        "    blt     7$\n"
+
+        // 4-byte align by summing one half-word; the sum cannot carry out yet.
+        "    ldrh    r4, [r0], #2\n"
+        "    adds    r2, r2, r4\n"
+        "    subs    r1, r1, #2\n"
+
+        // Main summing loop which sums up data 2 words at a time.
+        // Leave the loop once fewer than 8 bytes remain.
+        "2$:\n"
+        "    cmp     r1, #8\n"
+        "    blt     3$\n"
+        // Sum next two words.  Each adc adds the carry out of bit 31 back into
+        // the sum (end-around carry), as 1's complement addition requires.
+        "    ldr     r4, [r0], #4\n"
+        "    adds    r2, r4\n"
+        "    adc     r2, r2, #0\n"
+        "    ldr     r4, [r0], #4\n"
+        "    adds    r2, r4\n"
+        "    adc     r2, r2, #0\n"
+        "    subs    r1, r1, #8\n"
+        "    b       2$\n"
+
+        // Sum up any remaining half-words.
+        "3$:\n"
+        // Leave the loop once fewer than 2 bytes remain.
+        "    cmp     r1, #2\n"
+        "    blt     7$\n"
+        // Sum the next half-word, again adding any carry back in.
+        "    ldrh    r4, [r0], #2\n"
+        "    adds    r2, r4\n"
+        "    adc     r2, r2, #0\n"
+        "    subs    r1, r1, #2\n"
+        "    b       3$\n"
+
+        // Handle trailing byte, if any.  NOTE(review): a negative length also gets here.
+        "7$:\n"
+        "    cbz     r1, 8$\n"
+        "    ldrb    r4, [r0]\n"
+        "    adds    r2, r4\n"
+        "    adc     r2, r2, #0\n"
+
+        // Fold the 32-bit sum to 16 bits; the second pass absorbs the first's carry.
+        "8$:\n"
+        "    ubfx    r4, r2, #16, #16\n"
+        "    ubfx    r2, r2, #0, #16\n"
+        "    adds    r2, r4\n"
+        "    ubfx    r4, r2, #16, #16\n"
+        "    ubfx    r2, r2, #0, #16\n"
+        "    adds    r2, r4\n"
+
+        // Swap the two result bytes if pData started at an odd address (r3 != 0).
+        "    cbz     r3, 9$\n"
+        "    rev16   r2, r2\n"
+
+        // Return the final sum in r0 and restore r4; popping pc returns.
+        "9$: mov     r0, r2\n"
+        "    pop     {r4, pc}\n"
+    );
+}
+
+#endif
diff --git a/libraries/net/lwip/lwip-sys/arch/memcpy.c b/libraries/net/lwip/lwip-sys/arch/memcpy.c
new file mode 100644
index 0000000000..fefbcda3c2
--- /dev/null
+++ b/libraries/net/lwip/lwip-sys/arch/memcpy.c
@@ -0,0 +1,59 @@
+/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
+
+#include <stdio.h>
+
+
+/* Thumb-2 assembly memcpy() used by lwIP via MEMCPY in arch/cc.h.  Copies 16
+   bytes per loop iteration (four word load/store pairs, issued without any
+   alignment check), then the remaining 0-15 bytes singly.  pDest is returned
+   in r0 to match the void* prototype in arch/cc.h; the C return type stays
+   void because the naked body returns straight from assembly. */
+__attribute__((naked)) void /*void* */ thumb2_memcpy(void* pDest, const void* pSource, size_t length)
+{
+    __asm (
+        ".syntax unified\n"
+        ".thumb\n"
+
+        // r3 = count of whole 16-byte chunks; r2 (length) is left unmodified.
+        "    lsrs    r3, r2, #4\n"
+        "    beq.n   2$\n"
+        "1$: ldr     r12, [r1], #4\n"
+        "    str     r12, [r0], #4\n"
+        "    ldr     r12, [r1], #4\n"
+        "    str     r12, [r0], #4\n"
+        "    ldr     r12, [r1], #4\n"
+        "    str     r12, [r0], #4\n"
+        "    ldr     r12, [r1], #4\n"
+        "    str     r12, [r0], #4\n"
+        "    subs    r3, #1\n"
+        "    bne     1$\n"
+
+        // Copy the remaining 0-15 bytes one at a time.
+        "2$:\n"
+        "    ands    r3, r2, #0xf\n"
+        "    beq.n   4$\n"
+        "3$: ldrb    r12, [r1], #1\n"
+        "    strb    r12, [r0], #1\n"
+        "    subs    r3, #1\n"
+        "    bne     3$\n"
+        // r0 advanced by exactly length (r2); rewind it so pDest is returned.
+        "4$: subs    r0, r0, r2\n"
+        "    bx      lr\n"
+    );
+}
+
+#endif