@@ -59,6 +59,82 @@ double fdotp_v64b(const double *a, const double *b, unsigned int avl) {
5959 return red ;
6060}
6161
62+ // 64-bit dot-product: a * b
63+ // m8 allows only for partial register re-allocation with factor-2 unrolling
64+ double fdotp_v64b_m8_unrl (const double * a , const double * b , unsigned int avl ) {
65+ const unsigned int orig_avl = avl ;
66+ unsigned int vl ;
67+
68+ double red ;
69+
70+ // Stripmine and accumulate a partial reduced vector
71+ do {
72+ // Set the vl
73+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
74+
75+ // Load chunk a and b
76+ asm volatile ("vle64.v v8, (%0)" ::"r" (a ));
77+ asm volatile ("vle64.v v16, (%0)" ::"r" (b ));
78+
79+ // Multiply and accumulate
80+ if (avl == orig_avl ) {
81+ asm volatile ("vfmul.vv v24, v8, v16" );
82+ } else {
83+ asm volatile ("vfmacc.vv v24, v8, v16" );
84+ }
85+
86+ // Bump pointers
87+ a += vl ;
88+ b += vl ;
89+ avl -= vl ;
90+
91+ if (avl <= 0 )
92+ break ;
93+
94+ // Set the vl
95+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
96+
97+ // Load chunk a and b
98+ asm volatile ("vle64.v v0, (%0)" ::"r" (a ));
99+ asm volatile ("vle64.v v8, (%0)" ::"r" (b ));
100+
101+ // Multiply and accumulate
102+ asm volatile ("vfmacc.vv v24, v0, v8" );
103+
104+ // Bump pointers
105+ a += vl ;
106+ b += vl ;
107+ avl -= vl ;
108+
109+ if (avl <= 0 )
110+ break ;
111+
112+ // Set the vl
113+ asm volatile ("vsetvli %0, %1, e64, m8, ta, ma" : "=r" (vl ) : "r" (avl ));
114+
115+ // Load chunk a and b
116+ asm volatile ("vle64.v v16, (%0)" ::"r" (a ));
117+ asm volatile ("vle64.v v0, (%0)" ::"r" (b ));
118+
119+ // Multiply and accumulate
120+ asm volatile ("vfmacc.vv v24, v0, v16" );
121+
122+ // Bump pointers
123+ a += vl ;
124+ b += vl ;
125+ avl -= vl ;
126+ } while (avl > 0 );
127+
128+ // Clean the accumulator
129+ asm volatile ("vmv.s.x v0, zero" );
130+
131+ // Reduce and return
132+ asm volatile ("vfredusum.vs v0, v24, v0" );
133+ asm volatile ("vfmv.f.s %0, v0" : "=f" (red ));
134+
135+ return red ;
136+ }
137+
62138// 32-bit dot-product: a * b
63139float fdotp_v32b (const float * a , const float * b , unsigned int avl ) {
64140 const unsigned int orig_avl = avl ;
0 commit comments