Teensy 3.1 DMA memcpy(): 1092 Mbit/s using 16-byte-aligned DMA transfers

Dependencies:   USBDevice mbed

Teensy 3.1 DMA memcpy, proof of concept

2048 bytes aligned 16
loop set 215.58 mbs 76 us
loop copy 910.22 mbs 18 us
memset 1365.33 mbs 12 us
memcpy 910.22 mbs 18 us
memcpy128 1092.27 mbs 15 us    DMA
errs 0

Obviously, the ARMCC memcpy() is quite fast (unrolled assembler I presume).

You could add an IRQ handler if you wanted asynchronous operation.

main.cpp

Committer:
manitou
Date:
2015-10-03
Revision:
0:d374f051a3ac

File content as of revision 0:d374f051a3ac:

// teensy 3.1 mbed memcpy using DMA
//  could add IRQ handler for asynch operation
#include "mbed.h"
#include "USBSerial.h"

// Debug helper: print an expression's source text (via #x) followed by its
// value in hex on the USB serial port.
#define PRREG(x) pc.printf(#x" 0x%0x\n",x)

USBSerial  pc;          // Virtual serial port over USB
Timer tmr;              // timer read in microseconds by memperf(); started in main()

// Index of the DMA0 TCD (channel) programmed by memcpy32() and memcpy128().
#define CHNL 1
// Enable the DMA module by setting its clock-gate bit in SIM_SCGC7.
// The DMAMUX is deliberately left untouched: memcpy32()/memcpy128() start
// their transfers in software through the TCD START bit, so no peripheral
// request has to be routed to the channel.
void dma_init() {
    SIM->SCGC7 = SIM->SCGC7 | SIM_SCGC7_DMA_MASK;
}

// Copy `bytes` bytes from src to dest with DMA channel CHNL using 32-bit
// transfers, and block until the channel reports DONE.
//
// The whole copy is programmed as one minor loop (NBYTES = bytes) with a
// single major iteration, and is started in software via the START bit.
//
//   dest, src  non-overlapping buffers
//   bytes      number of bytes to copy
//
// The DMA path needs both addresses and the length to be multiples of 4.
// Otherwise the eDMA flags a configuration error instead of running the
// transfer, DONE is never set, and the wait loop below would spin forever.
// Such requests are therefore served by the library memcpy() instead.
void memcpy32(void *dest,  void *src, unsigned int bytes)
{
  if (bytes == 0) {
    // Nothing to do. This must not reach the hardware: per the K20 reference
    // manual an NBYTES value of 0 is interpreted as a 4 GB transfer.
    return;
  }
  if ((((uint32_t)dest | (uint32_t)src | bytes) & 3u) != 0) {
    memcpy(dest, src, bytes);   // unaligned or odd-sized: CPU fallback
    return;
  }

  DMA0->TCD[CHNL].SADDR = (uint32_t)src;
  DMA0->TCD[CHNL].SOFF = 4;                 // advance source 4 bytes per read
  DMA0->TCD[CHNL].ATTR = DMA_ATTR_SSIZE(2) | DMA_ATTR_DSIZE(2);  //32-bit
  DMA0->TCD[CHNL].NBYTES_MLNO = bytes;      // entire copy in one minor loop
  DMA0->TCD[CHNL].SLAST = 0;                // no source adjustment afterwards
  DMA0->TCD[CHNL].DADDR = (uint32_t)dest;
  DMA0->TCD[CHNL].DOFF = 4;                 // advance destination 4 bytes per write
  DMA0->TCD[CHNL].CITER_ELINKNO = 1;        // one major iteration, no linking
  DMA0->TCD[CHNL].DLAST_SGA = 0;            // no destination adjustment afterwards
  DMA0->TCD[CHNL].BITER_ELINKNO = 1;        // must equal CITER when starting
  DMA0->TCD[CHNL].CSR = DMA_CSR_START_MASK; // software start

  while (!(DMA0->TCD[CHNL].CSR & DMA_CSR_DONE_MASK)) {
    /* busy-wait for completion */
  }
}

// Copy `bytes` bytes from src to dest with DMA channel CHNL using 16-byte
// (128-bit) burst transfers, and block until the channel reports DONE.
//
// The whole copy is programmed as one minor loop (NBYTES = bytes) with a
// single major iteration, and is started in software via the START bit.
//
//   dest, src  non-overlapping buffers
//   bytes      number of bytes to copy
//
// The DMA path needs both addresses and the length to be multiples of 16.
// Otherwise the eDMA flags a configuration error instead of running the
// transfer, DONE is never set, and the wait loop below would spin forever.
// Such requests are therefore served by the library memcpy() instead.
void memcpy128(void *dest,  void *src, unsigned int bytes)
{
  if (bytes == 0) {
    // Nothing to do. This must not reach the hardware: per the K20 reference
    // manual an NBYTES value of 0 is interpreted as a 4 GB transfer.
    return;
  }
  if ((((uint32_t)dest | (uint32_t)src | bytes) & 15u) != 0) {
    memcpy(dest, src, bytes);   // unaligned or odd-sized: CPU fallback
    return;
  }

  DMA0->TCD[CHNL].SADDR = (uint32_t)src;
  DMA0->TCD[CHNL].SOFF = 16;                // advance source 16 bytes per read
  DMA0->TCD[CHNL].ATTR = DMA_ATTR_SSIZE(4) | DMA_ATTR_DSIZE(4);  // 16-byte burst
  DMA0->TCD[CHNL].NBYTES_MLNO = bytes;      // entire copy in one minor loop
  DMA0->TCD[CHNL].SLAST = 0;                // no source adjustment afterwards
  DMA0->TCD[CHNL].DADDR = (uint32_t)dest;
  DMA0->TCD[CHNL].DOFF = 16;                // advance destination 16 bytes per write
  DMA0->TCD[CHNL].CITER_ELINKNO = 1;        // one major iteration, no linking
  DMA0->TCD[CHNL].DLAST_SGA = 0;            // no destination adjustment afterwards
  DMA0->TCD[CHNL].BITER_ELINKNO = 1;        // must equal CITER when starting
  DMA0->TCD[CHNL].CSR = DMA_CSR_START_MASK; // software start

  while (!(DMA0->TCD[CHNL].CSR & DMA_CSR_DONE_MASK)) {
    /* busy-wait for completion */
  }
}

// Size in bytes of each benchmark buffer.
#define BYTES 2048

// Source and destination buffers for memperf(). Both are 16-byte aligned;
// memperf() passes them to memcpy128(), which moves data in 16-byte steps.
uint8_t src[BYTES] __attribute__ ((aligned (16)));
uint8_t dst[BYTES] __attribute__ ((aligned (16)));


void memperf(){
    int i;
    uint32_t us;
    
    pc.printf("\n%d bytes aligned 16\n",BYTES);
    us = tmr.read_us();
    for (i=0;i<BYTES;i++) src[i] = i;
    us = tmr.read_us() - us;
    pc.printf("loop set %.2f mbs %d us\n",8*BYTES/(float)us,us);
    us = tmr.read_us();
    for (i=0;i<BYTES;i++) dst[i] = src[i];
    us = tmr.read_us() - us;
    pc.printf("loop copy %.2f mbs %d us\n",8*BYTES/(float)us,us);
    us = tmr.read_us();
    memset(dst,0,BYTES);
    us = tmr.read_us() - us;
    pc.printf("memset %.2f mbs %d us\n",8*BYTES/(float)us,us);
    us = tmr.read_us();
    memcpy(dst,src,BYTES);
    us = tmr.read_us() - us;
    pc.printf("memcpy %.2f mbs %d us\n",8*BYTES/(float)us,us);

    memset(dst,0,BYTES);  // for validation
    us = tmr.read_us();
    memcpy128(dst,src,BYTES);
    us = tmr.read_us() - us;
    pc.printf("memcpy128 %.2f mbs %d us\n",8*BYTES/(float)us,us);
    int errs=0;
    for ( i=0;i<BYTES;i++) if (src[i] != dst[i]) errs++;
    pc.printf("errs %d\n",errs);
}

// Program entry: print a banner with the core clock and build time/date,
// start the timer, enable the DMA clock, then repeat the benchmark
// every three seconds forever.
int main() {
    wait(2.0);  // initial pause before the first output
    pc.printf("SystemCoreClock %d  %s %s\n",SystemCoreClock,__TIME__,__DATE__);

    tmr.start();
    dma_init();

    for (;;) {
        memperf();
        wait(3.0);
    }
}