循环展开优化的实现(例5)表明,只需少许努力,就可大幅提升性能:每次调用所需周期数减少至2711,整体系统性能提升10倍。如果在使用32-MAC扩展指令的同时结合上述技巧,函数可在687个周期内执行完毕,相对于最初直接用C代码编写的版本,性能提升超过39倍(见表1)。
例5:
/* Header that declares the application-specific extension instructions. */
#include "fir8.h"

#define ST_DECR 1 /* stream direction indicator: decrement */
#define ST_INCR 0 /* stream direction indicator: increment */

/*
 * FIR: compute the first output sample into y1.
 *
 * Loads the eight coefficient registers h1..h8 from stream 0 and, interleaved
 * with that, alternately loads x1/x2 from stream 1 while accumulating into y1
 * (FIR_MUL starts the sum, FIR_MAC adds to it).  X is advanced by one and
 * stream 1 is re-initialised at the new X before the last accumulate.
 *
 * NOTE(review): the exact semantics of WRGET0I/WRGET1I/FIR_MUL/FIR_MAC are
 * defined in fir8.h / the extension-instruction source and are not visible
 * here; the statement order is significant and must not be changed.
 * All arguments must be plain lvalues: X is modified, and the others are
 * used several times.
 */
#define FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X) \
    do {                                                    \
        WRGET0I( &(h1), 8 * sizeof(short) );                \
        WRGET1I( &(x1), 16 );                               \
        (X)++;                                              \
        WRGET0I( &(h2), 16 );                               \
        WRGET1I( &(x2), 16 );                               \
        FIR_MUL( (x1), (h1), &(y1) );                       \
                                                            \
        WRGET0I( &(h3), 16 );                               \
        WRGET1I( &(x1), 16 );                               \
        FIR_MAC( (x2), (h2), &(y1) );                       \
        WRGET0I( &(h4), 16 );                               \
        WRGET1I( &(x2), 16 );                               \
        FIR_MAC( (x1), (h3), &(y1) );                       \
        WRGET0I( &(h5), 16 );                               \
        WRGET1I( &(x1), 16 );                               \
        FIR_MAC( (x2), (h4), &(y1) );                       \
        WRGET0I( &(h6), 16 );                               \
        WRGET1I( &(x2), 16 );                               \
        FIR_MAC( (x1), (h5), &(y1) );                       \
        WRGET0I( &(h7), 16 );                               \
        WRGET1I( &(x1), 16 );                               \
        FIR_MAC( (x2), (h6), &(y1) );                       \
        WRGET0I( &(h8), 16 );                               \
        WRGET1I( &(x2), 16 );                               \
        FIR_MAC( (x1), (h7), &(y1) );                       \
        WRGET1INIT(ST_DECR, X);                             \
        FIR_MAC( (x2), (h8), &(y1) );                       \
    } while (0)

/*
 * FIR1: compute the current output sample into y2 while writing out the
 * previous result held in y1 (WRPUTI(y1, 2)).
 *
 * The coefficients h1..h8 are reused as already loaded; only the input
 * stream (stream 1) is read.  X is advanced by one and stream 1 is
 * re-initialised at the new X via WRGET1INIT0/WRGET1INIT1 near the end.
 * Statement order is significant; see the note on FIR.
 */
#define FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) \
    do {                                                         \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MUL( (x1), (h1), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h2), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h3), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h4), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        (X)++;                                                   \
        FIR_MAC( (x1), (h5), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        WRGET1I( &(x2), 16 );                                    \
        FIR_MAC( (x1), (h6), &(y2) );                            \
        WRGET1I( &(x1), 16 );                                    \
        WRGET1INIT0(ST_DECR, X);                                 \
        FIR_MAC( (x2), (h7), &(y2) );                            \
        WRGET1INIT1();                                           \
        WRPUTI(y1, 2);                                           \
        FIR_MAC( (x1), (h8), &(y2) );                            \
    } while (0)

/*
 * FIR2: mirror image of FIR1 -- compute the current output sample into y1
 * while writing out the previous result held in y2 (WRPUTI(y2, 2)).
 * Statement order is significant; see the note on FIR.
 */
#define FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X) \
    do {                                                         \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MUL( (x1), (h1), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h2), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h3), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        FIR_MAC( (x1), (h4), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        (X)++;                                                   \
        FIR_MAC( (x1), (h5), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        WRGET1I( &(x2), 16 );                                    \
        FIR_MAC( (x1), (h6), &(y1) );                            \
        WRGET1I( &(x1), 16 );                                    \
        WRGET1INIT0(ST_DECR, X);                                 \
        FIR_MAC( (x2), (h7), &(y1) );                            \
        WRGET1INIT1();                                           \
        WRPUTI(y2, 2);                                           \
        FIR_MAC( (x1), (h8), &(y1) );                            \
    } while (0)

/*
 * FIR filter using the 8-way multiply extension instructions in the ISEF,
 * loop-optimised and hand-unrolled.
 *
 * X: input samples (read as a decrementing stream; the pointer is advanced
 *    by one before each output sample is computed).
 * H: filter coefficients (read once as an incrementing stream).
 * Y: output samples (written as an incrementing stream).
 * N: number of output samples.  The code below produces Y[0], then
 *    (N/2 - 1) pairs, then one more sample, so it assumes N is even and
 *    N >= 2 -- TODO confirm with callers.
 * T: not used by this implementation.  NOTE(review): presumably the tap
 *    count, which the macros effectively fix -- confirm against fir8.h.
 */
void fir(short *X, short *H, short *Y, short N, short T)
{
    int n;
    WR h1, h2, h3, h4, h5, h6, h7, h8;
    WR x1, x2;
    WR y1;
    WR y2;
    /*
     * (these alternative "register" declarations make no difference:)
     *   register WR y1 SE_REG("wra1") ;
     *   register WR y2 SE_REG("wra2") ;
     */

    WRPUTINIT(ST_INCR, Y);  /* start the output stream */
    WRGET0INIT(ST_INCR, H); /* start the coefficient stream */
    X++;
    WRGET1INIT(ST_DECR, X); /* start the input stream */

    /* compute Y[0] in y1 */
    FIR(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, X);

    /* loop ((N/2)-1) times */
    for (n = 0; n < ((N >> 1) - 1); n++) {
        /* FIR1 outputs the previous result (y1) and computes the current one (y2) */
        FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X);
        /* FIR2 outputs the previous result (y2) and computes the current one (y1) */
        FIR2(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X);
    }

    /* compute Y[N-1] in y2, and output Y[N-2] from y1 */
    FIR1(h1, h2, h3, h4, h5, h6, h7, h8, x1, x2, y1, y2, X);

    WRPUTI(y2, 2);  /* output Y[N-1] */
    WRPUTFLUSH0();  /* flush the output stream */
    WRPUTFLUSH1();  /* flush the output stream */
}
 表1: | 通过在应用型处理器和流水线中集成可编程逻辑,这种以软件方式配置的架构只需少许手工优化,就能在运算密集的算法中提供实质上的硬件加速效果。正是因为可把"硬件设计成软件",开发者现在能够避免因在硬件与软件之间划分算法而带来的复杂性;同时,由于编译器处理了实际硬件实现上的复杂性,开发者能够快速、轻松地设计更复杂的算法,评估各种实现的效率,从而最大化地提升性能并控制成本。
|