mirror of https://github.com/ARMmbed/mbed-os.git
Merge pull request #51 from adamgreen/netMorePerformanceWork
Asm versions of netstack memcpy() and lwip_standard_chksum(). [Note] I'm generally a bit reluctant to include optimizations like this (from an architectural standpoint), because they tend to be too specific (for example, this one works only with lwIP + GCC + Cortex-M3 or M4). For now this looks like the right place for them, although the optimized memcpy should ideally live in libc (or, even better, be replaced with a DMA transfer in this particular case). Still, this will be both a nice optimization and a reminder of what we need to implement/change in the future. pull/48/merge
commit
42e27e70b9
|
@ -82,8 +82,21 @@ typedef uintptr_t mem_ptr_t;
|
||||||
#define ALIGNED(n) __attribute__((aligned (n)))
|
#define ALIGNED(n) __attribute__((aligned (n)))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Used with IP headers only */
|
/* Provide Thumb-2 routines for GCC to improve performance */
|
||||||
#define LWIP_CHKSUM_ALGORITHM 1
|
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
    /* Point the MEMCPY and LWIP_CHKSUM macros at the hand-written Thumb-2
       routines. */
    #define MEMCPY(dst,src,len)     thumb2_memcpy(dst,src,len)
    #define LWIP_CHKSUM             thumb2_checksum
    /* Set algorithm to 0 so that unused lwip_standard_chksum function
       doesn't generate compiler warning */
    #define LWIP_CHKSUM_ALGORITHM   0

    /* Copies length bytes from pSource to pDest. */
    void* thumb2_memcpy(void* pDest, const void* pSource, size_t length);
    /* Returns the 16-bit 1's complement sum (not inverted) of length bytes
       at pData. The data is only read, so the pointer is const-qualified to
       match the definition. */
    u16_t thumb2_checksum(const void* pData, int length);
#else
    /* Used with IP headers only */
    #define LWIP_CHKSUM_ALGORITHM   1
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef LWIP_DEBUG
|
#ifdef LWIP_DEBUG
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,126 @@
|
||||||
|
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
|
||||||
|
|
||||||
|
|
||||||
|
/* This is a hand written Thumb-2 assembly language version of the
|
||||||
|
algorithm 3 version of lwip_standard_chksum in lwIP's inet_chksum.c. It
|
||||||
|
performs the checksumming 32-bits at a time and even unrolls the loop to
|
||||||
|
perform two of these 32-bit adds per loop iteration.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
16-bit 1's complement summation (not inversed).
|
||||||
|
|
||||||
|
NOTE: This function does return a uint16_t from the assembly language code
|
||||||
|
but is marked as void so that GCC doesn't issue warning because it
|
||||||
|
doesn't know about this low level return.
|
||||||
|
*/
|
||||||
|
__attribute__((naked)) void /*uint16_t*/ thumb2_checksum(const void* pData, int length)
|
||||||
|
{
|
||||||
|
__asm (
|
||||||
|
".syntax unified\n"
|
||||||
|
".thumb\n"
|
||||||
|
|
||||||
|
// Push non-volatile registers we use on stack. Push link register too to
|
||||||
|
// keep stack 8-byte aligned and allow single pop to restore and return.
|
||||||
|
" push {r4, lr}\n"
|
||||||
|
// Initialize sum, r2, to 0.
|
||||||
|
" movs r2, #0\n"
|
||||||
|
// Remember whether pData was at odd address in r3. This is used later to
|
||||||
|
// know if it needs to swap the result since the summation will be done at
|
||||||
|
// an offset of 1, rather than 0.
|
||||||
|
" ands r3, r0, #1\n"
|
||||||
|
// Need to 2-byte align? If not skip ahead.
|
||||||
|
" beq 1$\n"
|
||||||
|
// We can return if there are no bytes to sum.
|
||||||
|
" cbz r1, 9$\n"
|
||||||
|
|
||||||
|
// 2-byte align.
|
||||||
|
// Place the first data byte in odd summation location since it needs to be
|
||||||
|
// swapped later. It's ok to overwrite r2 here as it only had a value of 0
|
||||||
|
// up until now. Advance r0 pointer and decrement r1 length as we go.
|
||||||
|
" ldrb r2, [r0], #1\n"
|
||||||
|
" lsls r2, r2, #8\n"
|
||||||
|
" subs r1, r1, #1\n"
|
||||||
|
|
||||||
|
// Need to 4-byte align? If not skip ahead.
|
||||||
|
"1$:\n"
|
||||||
|
" ands r4, r0, #3\n"
|
||||||
|
" beq 2$\n"
|
||||||
|
// Have more than 1 byte left to align? If not skip ahead to take care of
|
||||||
|
// trailing byte.
|
||||||
|
" cmp r1, #2\n"
|
||||||
|
" blt 7$\n"
|
||||||
|
|
||||||
|
// 4-byte align.
|
||||||
|
" ldrh r4, [r0], #2\n"
|
||||||
|
" adds r2, r2, r4\n"
|
||||||
|
" subs r1, r1, #2\n"
|
||||||
|
|
||||||
|
// Main summing loop which sums up data 2 words at a time.
|
||||||
|
// Make sure that we have more than 7 bytes left to sum.
|
||||||
|
"2$:\n"
|
||||||
|
" cmp r1, #8\n"
|
||||||
|
" blt 3$\n"
|
||||||
|
// Sum next two words. Applying previous upper 16-bit carry to
|
||||||
|
// lower 16-bits.
|
||||||
|
" ldr r4, [r0], #4\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
" adc r2, r2, #0\n"
|
||||||
|
" ldr r4, [r0], #4\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
" adc r2, r2, #0\n"
|
||||||
|
" subs r1, r1, #8\n"
|
||||||
|
" b 2$\n"
|
||||||
|
|
||||||
|
// Sum up any remaining half-words.
|
||||||
|
"3$:\n"
|
||||||
|
// Make sure that we have more than 1 byte left to sum.
|
||||||
|
" cmp r1, #2\n"
|
||||||
|
" blt 7$\n"
|
||||||
|
// Sum up next half word, continue to apply carry.
|
||||||
|
" ldrh r4, [r0], #2\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
" adc r2, r2, #0\n"
|
||||||
|
" subs r1, r1, #2\n"
|
||||||
|
" b 3$\n"
|
||||||
|
|
||||||
|
// Handle trailing byte, if it exists
|
||||||
|
"7$:\n"
|
||||||
|
" cbz r1, 8$\n"
|
||||||
|
" ldrb r4, [r0]\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
" adc r2, r2, #0\n"
|
||||||
|
|
||||||
|
// Fold 32-bit checksum into 16-bit checksum.
|
||||||
|
"8$:\n"
|
||||||
|
" ubfx r4, r2, #16, #16\n"
|
||||||
|
" ubfx r2, r2, #0, #16\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
" ubfx r4, r2, #16, #16\n"
|
||||||
|
" ubfx r2, r2, #0, #16\n"
|
||||||
|
" adds r2, r4\n"
|
||||||
|
|
||||||
|
// Swap bytes if started at odd address
|
||||||
|
" cbz r3, 9$\n"
|
||||||
|
" rev16 r2, r2\n"
|
||||||
|
|
||||||
|
// Return final sum.
|
||||||
|
"9$: mov r0, r2\n"
|
||||||
|
" pop {r4, pc}\n"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,59 @@
|
||||||
|
/* Copyright (C) 2013 - Adam Green (https://github.com/adamgreen)
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
*/
|
||||||
|
#if defined(TOOLCHAIN_GCC) && defined(__thumb2__)
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
|
||||||
|
/* This is a hand written Thumb-2 assembly language version of the
|
||||||
|
standard C memcpy() function that can be used by the lwIP networking
|
||||||
|
stack to improve its performance. It copies 4 bytes at a time and
|
||||||
|
unrolls the loop to perform 4 of these copies per loop iteration.
|
||||||
|
*/
|
||||||
|
__attribute__((naked)) void thumb2_memcpy(void* pDest, const void* pSource, size_t length)
|
||||||
|
{
|
||||||
|
__asm (
|
||||||
|
".syntax unified\n"
|
||||||
|
".thumb\n"
|
||||||
|
|
||||||
|
// Copy 16 bytes at a time first.
|
||||||
|
" lsrs r3, r2, #4\n"
|
||||||
|
" beq.n 2$\n"
|
||||||
|
"1$: ldr r12, [r1], #4\n"
|
||||||
|
" str r12, [r0], #4\n"
|
||||||
|
" ldr r12, [r1], #4\n"
|
||||||
|
" str r12, [r0], #4\n"
|
||||||
|
" ldr r12, [r1], #4\n"
|
||||||
|
" str r12, [r0], #4\n"
|
||||||
|
" ldr r12, [r1], #4\n"
|
||||||
|
" str r12, [r0], #4\n"
|
||||||
|
" subs r3, #1\n"
|
||||||
|
" bne 1$\n"
|
||||||
|
|
||||||
|
// Copy byte by byte for what is left.
|
||||||
|
"2$:\n"
|
||||||
|
" ands r3, r2, #0xf\n"
|
||||||
|
" beq.n 4$\n"
|
||||||
|
"3$: ldrb r12, [r1], #1\n"
|
||||||
|
" strb r12, [r0], #1\n"
|
||||||
|
" subs r3, #1\n"
|
||||||
|
" bne 3$\n"
|
||||||
|
|
||||||
|
// Return to caller.
|
||||||
|
"4$: bx lr\n"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
Loading…
Reference in New Issue