AM335x裸机开发: 读DDR为什么比写慢很多?
开发硬件: AM3352,SPI0.SPIFLASH,DDR3@400MHz/800MTS;
开发软件: CCS; 参考AM335X_StarterWare,建立CCS裸机工程;SPL工程加载APP工程到DDR里运行;
已经使能MMU,打开CACHE(包含L2开启)
测试结果: 读DDR比写DDR要慢很多;
测试流程: 16M字节DDR,连续测试10次; 分正序(cache命中高)和倒序(cache命中低)测试;
测试结果: 发现读DDR比写DDR要慢很多?
在E2E上也搜到了类似的问题: https://e2echina.ti.com/question_answer/dsp_arm/sitara_arm/f/25/p/83049/209518 ,
但该帖没有给出结论. 麻烦TI FAE分析或测试下;
32bit Write Test:
32bit 正序Write 160 MB,use 126 Ms; Spd = 1269 MPS. SUM=0xFEC00000
32bit 倒序Write 160 MB,use 164 Ms; Spd = 975 MPS. SUM=0x1400000
32bit read Test:
32bit 正序read 160 MB,use 231 Ms; Spd = 692 MPS. SUM=0xFEC00000
32bit 倒序read 160 MB,use 933 Ms; Spd = 171 MPS. SUM=0x1400000
8bit Write Test:
8bit 倒序Write 160 MB,use 785 Ms; Spd = 203 MPS. SUM=0x5000000
8bit 正序Write 160 MB,use 811 Ms; Spd = 197 MPS. SUM=0xFB000000
8bit Read Test:
8bit 倒序Read 160 MB,use 1792 Ms; Spd = 89 MPS. SUM=0xFB000000
8bit 正序Read 160 MB,use 1089 Ms; Spd = 146 MPS. SUM=0xFB000000
谢谢!
测试代码:
/*
 * Convert "megaBytes transferred in elapsedMs milliseconds" into MB per second.
 *
 * Returns 0 when elapsedMs is 0 (the whole test finished between two updates
 * of the millisecond counter) instead of dividing by zero.
 */
static unsigned int Ddr3SpdMBps(unsigned int megaBytes, unsigned int elapsedMs)
{
    if (elapsedMs == 0u)
    {
        return 0u;
    }
    return (megaBytes * 1000u) / elapsedMs;
}

/*
 * Memory throughput test over the region [iStartAddr, iStartAddr + iSizeBytes).
 *
 * Runs eight measurements, each repeated 10 times over the region:
 *   32-bit write (ascending, descending), 32-bit read (ascending, descending),
 *   8-bit write (descending, ascending),  8-bit read (descending, ascending).
 * For every measurement it prints the amount of data moved, the elapsed time
 * taken from gGlobalT12, the resulting speed and a checksum through UartPrintf.
 *
 * iStartAddr  - start address of the region, used directly as a pointer.
 *               NOTE(review): presumably must be 4-byte aligned for the
 *               32-bit passes - confirm against the caller.
 * iSizeBytes  - size of the region in bytes. The 32-bit passes cover
 *               iSizeBytes / 4 words, so up to 3 trailing bytes are skipped
 *               by them when the size is not a multiple of 4.
 *
 * The region is overwritten. Ascending and descending passes touch exactly
 * the same elements (index 0 .. count-1), so their checksums are comparable.
 */
void Ddr3SpdTst(unsigned int iStartAddr, unsigned int iSizeBytes )
{
    unsigned int i, j, tSize, tBgnMs, tEndMs, tUsedMs, tMB, tSum;
    unsigned int *p32Dst;
    unsigned char *p8Dst;

    /* ---------------------------------------------------------------- */
    /* 32-bit accesses                                                   */
    /* ---------------------------------------------------------------- */
    p32Dst = (unsigned int *)iStartAddr;
    tSize = iSizeBytes / 4;                 /* number of 32-bit words */

    UartPrintf("\n32bit Write Test:\n");

    /* ---- 32-bit write, ascending ---- */
    tBgnMs = gGlobalT12;                    /* start timestamp, in ms */
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)                /* repeat the pass 10 times */
    {
        for (i = 0; i < tSize; i++)
        {
            p32Dst[i] = i;
            tSum += i;
            tMB += 4;                       /* byte count, kept inside the loop as in the original measurement */
        }
    }
    tEndMs = gGlobalT12;                    /* end timestamp, in ms */
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);            /* bytes -> MB */
    UartPrintf("32bit 正序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    /* ---- 32-bit write, descending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        /* i runs tSize-1 .. 0: the test happens before the decrement, so the
         * body never sees index tSize (one past the end) and does see index 0. */
        for (i = tSize; i-- != 0u; )
        {
            p32Dst[i] = i;
            tSum += i;
            tMB += 4;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("32bit 倒序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    UartPrintf("\n32bit read Test:\n");

    /* ---- 32-bit read, ascending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = 0; i < tSize; i++)
        {
            tSum += p32Dst[i];
            tMB += 4;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("32bit 正序read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    /* ---- 32-bit read, descending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = tSize; i-- != 0u; )        /* tSize-1 .. 0, stays inside the region */
        {
            tSum += p32Dst[i];
            tMB += 4;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("32bit 倒序read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    /* ---------------------------------------------------------------- */
    /* 8-bit accesses                                                    */
    /* ---------------------------------------------------------------- */
    p8Dst = (unsigned char *)iStartAddr;
    tSize = iSizeBytes;                     /* number of bytes */

    UartPrintf("\n8bit Write Test:\n");

    /* ---- 8-bit write, descending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = tSize; i-- != 0u; )        /* tSize-1 .. 0, stays inside the region */
        {
            p8Dst[i] = (unsigned char)i;    /* low 8 bits of the index */
            tSum += i;
            tMB += 1;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("8bit 倒序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    /* ---- 8-bit write, ascending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = 0; i < tSize; i++)
        {
            p8Dst[i] = (unsigned char)i;
            tSum += i;
            tMB += 1;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("8bit 正序Write %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    UartPrintf("\n8bit Read Test:\n");

    /* ---- 8-bit read, descending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = tSize; i-- != 0u; )        /* tSize-1 .. 0, stays inside the region */
        {
            tSum += p8Dst[i];
            tMB += 1;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("8bit 倒序Read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);

    /* ---- 8-bit read, ascending ---- */
    tBgnMs = gGlobalT12;
    tSum = 0;
    tMB = 0;
    for (j = 0; j < 10; j++)
    {
        for (i = 0; i < tSize; i++)
        {
            tSum += p8Dst[i];
            tMB += 1;
        }
    }
    tEndMs = gGlobalT12;
    tUsedMs = tEndMs - tBgnMs;
    tMB = tMB / (1024U * 1024U);
    UartPrintf("8bit 正序Read %d MB,use %d Ms; Spd = %d MPS. SUM=0x%X\n", tMB, tUsedMs, Ddr3SpdMBps(tMB, tUsedMs), tSum);
}
Shine:
请问DDR也cache使能了么?
processors.wiki.ti.com/…/Common_Issue_Resulting_in_Slow_External_Memory_Performance
user4467014:
回复 Shine:
谢谢回复, 您说的是MMU配置里的这个吗?
REGION regionDdr = {MMU_PGTYPE_SECTION, START_ADDR_DDR, NUM_SECTIONS_DDR,MMU_MEMTYPE_NORMAL_NON_SHAREABLE(MMU_CACHE_WT_NOWA,MMU_CACHE_WB_WA),MMU_REGION_NON_SECURE, MMU_AP_PRV_RW_USR_RW,(unsigned int*)pageTable};
Shine:
回复 user4467014:
是这里配。
Shine:
回复 Shine:
请关注下面的帖子。
e2e.ti.com/…/798714
user4467014:
回复 Shine:
谢谢!
user4467014:
回复 user4467014:
英文E2E https://e2e.ti.com/support/processors/f/791/t/798714 有回复:
修改为: MMU_MEMTYPE_NORMAL_NON_SHAREABLE(MMU_CACHE_WB_WA, MMU_CACHE_WB_WA)后,测试结果几乎没有变化;
修改为:MMU_MEMTYPE_NORMAL_NON_SHAREABLE(MMU_CACHE_WT_NOWA, MMU_CACHE_WT_NOWA),测试结果更差;
DMA方式不适合,因为不是读取固定位置的数据;
谢谢!
yongqing wang:
回复 user4467014:
有没有在最新的Linux SDK上试试?
user4467014:
回复 yongqing wang:
感谢回复!
工程是基于StarterWare的裸机工程,没有跑linux;
谢谢!