Skip to content

Commit d1de282

Browse files
authored
Improve the precision of S/CNRM2 by summing in double precision
1 parent e5aebea commit d1de282

File tree

1 file changed

+10
-37
lines changed

1 file changed

+10
-37
lines changed

kernel/arm64/nrm2.S

Lines changed: 10 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -35,16 +35,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
#define I x3
3636

3737
#if !defined(DOUBLE)
38-
#define SSQ s0
39-
#define SCALE s1
40-
#define REGZERO s5
41-
#define REGONE s6
42-
#else
38+
#define SSQF s0
39+
#endif
40+
4341
#define SSQ d0
4442
#define SCALE d1
4543
#define REGZERO d5
4644
#define REGONE d6
47-
#endif
4845

4946
/*******************************************************************************
5047
* Macro definitions
@@ -53,22 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5350
.macro KERNEL_F1
5451
#if !defined(DOUBLE)
5552
ldr s4, [X], #4
56-
fcmp s4, REGZERO
57-
beq 2f /* KERNEL_F1_NEXT_\@ */
58-
fabs s4, s4
59-
fcmp SCALE, s4
60-
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
61-
fdiv s2, SCALE, s4
62-
fmul s2, s2, s2
63-
fmul s3, SSQ, s2
64-
fadd SSQ, REGONE, s3
65-
fmov SCALE, s4
66-
b 2f /* KERNEL_F1_NEXT_\@ */
67-
1: /* KERNEL_F1_SCALE_GE_X_\@: */
68-
fdiv s2, s4, SCALE
69-
fmla SSQ, s2, v2.s[0]
53+
fcvt d4, s4
7054
#else
7155
ldr d4, [X], #8
56+
#endif
7257
fcmp d4, REGZERO
7358
beq 2f /* KERNEL_F1_NEXT_\@ */
7459
fabs d4, d4
@@ -83,29 +68,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8368
1: /* KERNEL_F1_SCALE_GE_X_\@: */
8469
fdiv d2, d4, SCALE
8570
fmla SSQ, d2, v2.d[0]
86-
#endif
8771
2: /* KERNEL_F1_NEXT_\@: */
8872
.endm
8973

9074
.macro KERNEL_S1
9175
#if !defined(DOUBLE)
9276
ldr s4, [X]
93-
fcmp s4, REGZERO
94-
beq KERNEL_S1_NEXT
95-
fabs s4, s4
96-
fcmp SCALE, s4
97-
bge KERNEL_S1_SCALE_GE_X
98-
fdiv s2, SCALE, s4
99-
fmul s2, s2, s2
100-
fmul s3, SSQ, s2
101-
fadd SSQ, REGONE, s3
102-
fmov SCALE, s4
103-
b KERNEL_S1_NEXT
104-
KERNEL_S1_SCALE_GE_X:
105-
fdiv s2, s4, SCALE
106-
fmla SSQ, s2, v2.s[0]
77+
fcvt d4, s4
10778
#else
10879
ldr d4, [X]
80+
#endif
10981
fcmp d4, REGZERO
11082
beq KERNEL_S1_NEXT
11183
fabs d4, d4
@@ -120,7 +92,6 @@ KERNEL_S1_SCALE_GE_X:
12092
KERNEL_S1_SCALE_GE_X:
12193
fdiv d2, d4, SCALE
12294
fmla SSQ, d2, v2.d[0]
123-
#endif
12495
KERNEL_S1_NEXT:
12596
add X, X, INC_X
12697
.endm
@@ -218,7 +189,9 @@ KERNEL_S1_NEXT:
218189
.Lnrm2_kernel_L999:
219190
fsqrt SSQ, SSQ
220191
fmul SSQ, SCALE, SSQ
221-
192+
#if !defined(DOUBLE)
193+
fcvt SSQF, SSQ
194+
#endif
222195
ret
223196

224197
EPILOGUE

0 commit comments

Comments
 (0)