Teensy 3.1 DMA memcpy(): 1092 mbs (megabits per second) using 16-byte-aligned DMA transfers

Dependencies:   USBDevice mbed

Teensy 3.1 DMA memcpy, proof of concept

2048 bytes aligned 16
loop set 215.58 mbs 76 us
loop copy 910.22 mbs 18 us
memset 1365.33 mbs 12 us
memcpy 910.22 mbs 18 us
memcpy128 1092.27 mbs 15 us    DMA
errs 0

Obviously, the ARMCC memcpy() is quite fast (unrolled assembler I presume).

You could add an IRQ handler if you wanted asynchronous operation.

Committer:
manitou
Date:
Sat Oct 03 15:15:58 2015 +0000
Revision:
0:d374f051a3ac
teensy 3.1 DMA memcpy

Who changed what in which revision?

User | Revision | Line number | New contents of line
manitou 0:d374f051a3ac 1 // teensy 3.1 mbed memcpy using DMA
manitou 0:d374f051a3ac 2 // could add IRQ handler for asynch operation
manitou 0:d374f051a3ac 3 #include "mbed.h"
manitou 0:d374f051a3ac 4 #include "USBSerial.h"
manitou 0:d374f051a3ac 5
manitou 0:d374f051a3ac 6 #define PRREG(x) pc.printf(#x" 0x%0x\n",x)
manitou 0:d374f051a3ac 7
manitou 0:d374f051a3ac 8 USBSerial pc; // Virtual serial port over USB
manitou 0:d374f051a3ac 9 Timer tmr;
manitou 0:d374f051a3ac 10
manitou 0:d374f051a3ac 11 #define CHNL 1
manitou 0:d374f051a3ac 12 void dma_init() {
manitou 0:d374f051a3ac 13 SIM->SCGC7 |= SIM_SCGC7_DMA_MASK; // DMA clock
manitou 0:d374f051a3ac 14 // SIM->SCGC6 |= SIM_SCGC6_DMAMUX_MASK; // Enable clock to DMA mux
manitou 0:d374f051a3ac 15 // DMAMUX->CHCFG[CHNL] = 0; // IO to DMA map
manitou 0:d374f051a3ac 16 }
manitou 0:d374f051a3ac 17
manitou 0:d374f051a3ac 18 void memcpy32(void *dest, void *src, unsigned int bytes)
manitou 0:d374f051a3ac 19 {
manitou 0:d374f051a3ac 20 DMA0->TCD[CHNL].SADDR = (uint32_t)src;
manitou 0:d374f051a3ac 21 DMA0->TCD[CHNL].SOFF = 4;
manitou 0:d374f051a3ac 22 DMA0->TCD[CHNL].ATTR = DMA_ATTR_SSIZE(2) | DMA_ATTR_DSIZE(2); //32-bit
manitou 0:d374f051a3ac 23 DMA0->TCD[CHNL].NBYTES_MLNO = bytes;
manitou 0:d374f051a3ac 24 DMA0->TCD[CHNL].SLAST = 0;
manitou 0:d374f051a3ac 25 DMA0->TCD[CHNL].DADDR = (uint32_t)dest;
manitou 0:d374f051a3ac 26 DMA0->TCD[CHNL].DOFF = 4;
manitou 0:d374f051a3ac 27 DMA0->TCD[CHNL].CITER_ELINKNO = 1;
manitou 0:d374f051a3ac 28 DMA0->TCD[CHNL].DLAST_SGA = 0;
manitou 0:d374f051a3ac 29 DMA0->TCD[CHNL].BITER_ELINKNO = 1;
manitou 0:d374f051a3ac 30 DMA0->TCD[CHNL].CSR = DMA_CSR_START_MASK;
manitou 0:d374f051a3ac 31
manitou 0:d374f051a3ac 32 while (!(DMA0->TCD[CHNL].CSR & DMA_CSR_DONE_MASK)) /* wait */ ;
manitou 0:d374f051a3ac 33 }
manitou 0:d374f051a3ac 34
manitou 0:d374f051a3ac 35 void memcpy128(void *dest, void *src, unsigned int bytes)
manitou 0:d374f051a3ac 36 {
manitou 0:d374f051a3ac 37 DMA0->TCD[CHNL].SADDR = (uint32_t)src;
manitou 0:d374f051a3ac 38 DMA0->TCD[CHNL].SOFF = 16;
manitou 0:d374f051a3ac 39 DMA0->TCD[CHNL].ATTR = DMA_ATTR_SSIZE(4) | DMA_ATTR_DSIZE(4);
manitou 0:d374f051a3ac 40 DMA0->TCD[CHNL].NBYTES_MLNO = bytes;
manitou 0:d374f051a3ac 41 DMA0->TCD[CHNL].SLAST = 0;
manitou 0:d374f051a3ac 42 DMA0->TCD[CHNL].DADDR = (uint32_t)dest;
manitou 0:d374f051a3ac 43 DMA0->TCD[CHNL].DOFF = 16;
manitou 0:d374f051a3ac 44 DMA0->TCD[CHNL].CITER_ELINKNO = 1;
manitou 0:d374f051a3ac 45 DMA0->TCD[CHNL].DLAST_SGA = 0;
manitou 0:d374f051a3ac 46 DMA0->TCD[CHNL].BITER_ELINKNO = 1;
manitou 0:d374f051a3ac 47 DMA0->TCD[CHNL].CSR = DMA_CSR_START_MASK;
manitou 0:d374f051a3ac 48
manitou 0:d374f051a3ac 49 while (!(DMA0->TCD[CHNL].CSR & DMA_CSR_DONE_MASK)) /* wait */ ;
manitou 0:d374f051a3ac 50 }
manitou 0:d374f051a3ac 51
// Size in bytes of each benchmark buffer.
#define BYTES 2048

// Source and destination buffers for the copy benchmarks. Both are aligned
// to 16 bytes, matching the 16-byte transfer size programmed by memcpy128().
uint8_t src[BYTES] __attribute__((aligned(16)));
uint8_t dst[BYTES] __attribute__((aligned(16)));
manitou 0:d374f051a3ac 56
manitou 0:d374f051a3ac 57
manitou 0:d374f051a3ac 58 void memperf(){
manitou 0:d374f051a3ac 59 int i;
manitou 0:d374f051a3ac 60 uint32_t us;
manitou 0:d374f051a3ac 61
manitou 0:d374f051a3ac 62 pc.printf("\n%d bytes aligned 16\n",BYTES);
manitou 0:d374f051a3ac 63 us = tmr.read_us();
manitou 0:d374f051a3ac 64 for (i=0;i<BYTES;i++) src[i] = i;
manitou 0:d374f051a3ac 65 us = tmr.read_us() - us;
manitou 0:d374f051a3ac 66 pc.printf("loop set %.2f mbs %d us\n",8*BYTES/(float)us,us);
manitou 0:d374f051a3ac 67 us = tmr.read_us();
manitou 0:d374f051a3ac 68 for (i=0;i<BYTES;i++) dst[i] = src[i];
manitou 0:d374f051a3ac 69 us = tmr.read_us() - us;
manitou 0:d374f051a3ac 70 pc.printf("loop copy %.2f mbs %d us\n",8*BYTES/(float)us,us);
manitou 0:d374f051a3ac 71 us = tmr.read_us();
manitou 0:d374f051a3ac 72 memset(dst,0,BYTES);
manitou 0:d374f051a3ac 73 us = tmr.read_us() - us;
manitou 0:d374f051a3ac 74 pc.printf("memset %.2f mbs %d us\n",8*BYTES/(float)us,us);
manitou 0:d374f051a3ac 75 us = tmr.read_us();
manitou 0:d374f051a3ac 76 memcpy(dst,src,BYTES);
manitou 0:d374f051a3ac 77 us = tmr.read_us() - us;
manitou 0:d374f051a3ac 78 pc.printf("memcpy %.2f mbs %d us\n",8*BYTES/(float)us,us);
manitou 0:d374f051a3ac 79
manitou 0:d374f051a3ac 80 memset(dst,0,BYTES); // for validation
manitou 0:d374f051a3ac 81 us = tmr.read_us();
manitou 0:d374f051a3ac 82 memcpy128(dst,src,BYTES);
manitou 0:d374f051a3ac 83 us = tmr.read_us() - us;
manitou 0:d374f051a3ac 84 pc.printf("memcpy128 %.2f mbs %d us\n",8*BYTES/(float)us,us);
manitou 0:d374f051a3ac 85 int errs=0;
manitou 0:d374f051a3ac 86 for ( i=0;i<BYTES;i++) if (src[i] != dst[i]) errs++;
manitou 0:d374f051a3ac 87 pc.printf("errs %d\n",errs);
manitou 0:d374f051a3ac 88 }
manitou 0:d374f051a3ac 89
manitou 0:d374f051a3ac 90 int main() {
manitou 0:d374f051a3ac 91 wait(2.0);
manitou 0:d374f051a3ac 92 pc.printf("SystemCoreClock %d %s %s\n",SystemCoreClock,__TIME__,__DATE__);
manitou 0:d374f051a3ac 93 tmr.start();
manitou 0:d374f051a3ac 94 dma_init();
manitou 0:d374f051a3ac 95 while(1) {
manitou 0:d374f051a3ac 96 memperf();
manitou 0:d374f051a3ac 97 wait(3.0);
manitou 0:d374f051a3ac 98 }
manitou 0:d374f051a3ac 99
manitou 0:d374f051a3ac 100 }