Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions raid/pq_gen_avx2.asm
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,17 @@ default rel
[bits 64]
section .text

align 16
align 32
mk_global pq_gen_avx2, function
func(pq_gen_avx2)
FUNC_SAVE
sub vec, 3 ;Keep as offset to last source
jng return_fail ;Must have at least 2 sources
cmp len, 0
test len, len
je return_pass
test len, (32-1) ;Check alignment of length
test BYTE(len), (32-1) ;Check alignment of length
jnz return_fail
mov pos, 0
xor DWORD(pos), DWORD(pos)
vmovdqa xpoly, [poly]
vpxor xzero, xzero, xzero
cmp len, 96
Expand All @@ -146,7 +146,7 @@ len_aligned_32bytes:

loop96:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
XLDR xs2, [ptr+pos+32] ;Preload last vector (source)
XLDR xs3, [ptr+pos+64] ;Preload last vector (source)
Expand All @@ -158,7 +158,6 @@ loop96:
vpxor xq3, xq3, xq3 ;q3 = 0

next_vect:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxor xq1, xq1, xs1 ; q1 ^= s1
vpxor xq2, xq2, xs2 ; q2 ^= s2
Expand All @@ -178,7 +177,8 @@ next_vect:
vpxor xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
vpxor xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
vpxor xq3, xq3, xtmp3 ; q3 = q3<<1 ^ poly_masked
jg next_vect ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand Down Expand Up @@ -206,21 +206,21 @@ next_vect:

loop32:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
vpxor xp1, xp1, xp1 ;p = 0
vpxor xq1, xq1, xq1 ;q = 0

next_vect32:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxor xq1, xq1, xs1 ; q1 ^= s1
vpblendvb xtmp1, xzero, xpoly, xq1 ; xtmp1 = poly or 0x00
vpxor xp1, xp1, xs1 ; p ^= s
vpaddb xq1, xq1, xq1 ; q = q<<1
vpxor xq1, xq1, xtmp1 ; q = q<<1 ^ poly_masked
XLDR xs1, [ptr+pos] ; Get next vector (source data)
jg next_vect32 ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect32 ; Loop for each vect; jge (not jg) so the tmp == 0 pass, which loads source 0, still runs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be jge like in some other places.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, missed that, thanks. Fixed


mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand All @@ -234,7 +234,7 @@ next_vect32:


return_pass:
mov return, 0
xor DWORD(return), DWORD(return)
FUNC_RESTORE
ret

Expand Down
20 changes: 10 additions & 10 deletions raid/pq_gen_avx2_gfni.asm
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,15 @@ db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
[bits 64]
section .text

align 16
align 32
mk_global pq_gen_avx2_gfni, function
func(pq_gen_avx2_gfni)
FUNC_SAVE
sub vec, 3 ;Keep as offset to last source
jng return_fail ;Must have at least 2 sources
cmp len, 0
test len, len
je return_pass
test len, (32-1) ;Check alignment of length
test BYTE(len), (32-1) ;Check alignment of length
jnz return_fail

vmovdqa gfmatrix, [rel gf_matrix]
Expand All @@ -149,7 +149,7 @@ len_aligned_32bytes:

loop64:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
XLDR xs2, [ptr+pos+32] ;Preload last vector (source)
vpxor xp1, xp1, xp1 ;p1 = 0
Expand All @@ -158,7 +158,6 @@ loop64:
vpxor xq2, xq2, xq2 ;q2 = 0

next_vect:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxor xq1, xq1, xs1 ; q1 ^= s1
vpxor xq2, xq2, xs2 ; q2 ^= s2
Expand All @@ -168,7 +167,8 @@ next_vect:
XLDR xs2, [ptr+pos+32] ; Get next vector (source data2)
vgf2p8affineqb xq1, xq1, gfmatrix, 0x00
vgf2p8affineqb xq2, xq2, gfmatrix, 0x00
jg next_vect ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand All @@ -192,19 +192,19 @@ next_vect:

loop32:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
vpxor xp1, xp1, xp1 ;p = 0
vpxor xq1, xq1, xq1 ;q = 0

next_vect32:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxor xq1, xq1, xs1 ; q1 ^= s1
vgf2p8affineqb xq1, xq1, gfmatrix, 0x00
vpxor xp1, xp1, xs1 ; p ^= s
XLDR xs1, [ptr+pos] ; Get next vector (source data)
jg next_vect32 ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect32 ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand All @@ -218,7 +218,7 @@ next_vect32:


return_pass:
mov return, 0
xor DWORD(return), DWORD(return) ;return = 0 (pass); a reg-to-itself mov would not clear it
FUNC_RESTORE
ret

Expand Down
46 changes: 23 additions & 23 deletions raid/pq_gen_avx512.asm
Original file line number Diff line number Diff line change
Expand Up @@ -120,29 +120,29 @@ default rel
[bits 64]
section .text

align 16
align 32
mk_global pq_gen_avx512, function
func(pq_gen_avx512)
FUNC_SAVE
sub vec, 3 ;Keep as offset to last source
jng return_fail ;Must have at least 2 sources
cmp len, 0
test len, len
je return_pass
test len, (32-1) ;Check alignment of length
test BYTE(len), (32-1) ;Check alignment of length
jnz return_fail
mov pos, 0
mov tmp, 0x1d
vpbroadcastb xpoly, tmp
vpxorq xzero, xzero, xzero
cmp len, 128
jl loop32
mov DWORD(pos), 0x1d1d1d1d
vpbroadcastd xpoly, DWORD(pos)
xor DWORD(pos), DWORD(pos)
vpxor xzeroy, xzeroy, xzeroy
cmp len, 127
jle loop32

len_aligned_32bytes:
sub len, 2*64 ;Len points to last block

loop128:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
XLDR xs2, [ptr+pos+64] ;Preload last vector (source)
vpxorq xp1, xp1, xp1 ;p1 = 0
Expand All @@ -151,7 +151,6 @@ loop128:
vpxorq xq2, xq2, xq2 ;q2 = 0

next_vect:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxorq xq1, xq1, xs1 ; q1 ^= s1
vpxorq xq2, xq2, xs2 ; q2 ^= s2
Expand All @@ -167,7 +166,8 @@ next_vect:
vpaddb xq2, xq2, xq2 ; q2 = q2<<1
vpxorq xq1, xq1, xtmp1 ; q1 = q1<<1 ^ poly_masked
vpxorq xq2, xq2, xtmp2 ; q2 = q2<<1 ^ poly_masked
jg next_vect ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand All @@ -191,26 +191,26 @@ next_vect:

loop32:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1y, [ptr+pos] ;Preload last vector (source)
vpxorq xp1y, xp1y, xp1y ;p = 0
vpxorq xq1y, xq1y, xq1y ;q = 0
vpxor xp1y, xp1y, xp1y ;p = 0
vpxor xq1y, xq1y, xq1y ;q = 0

next_vect32:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxorq xq1y, xq1y, xs1y ; q1 ^= s1
vpxor xq1y, xq1y, xs1y ; q1 ^= s1
vpblendvb xtmp1y, xzeroy, xpolyy, xq1y ; xtmp1 = poly or 0x00
vpxorq xp1y, xp1y, xs1y ; p ^= s
vpxor xp1y, xp1y, xs1y ; p ^= s
vpaddb xq1y, xq1y, xq1y ; q = q<<1
vpxorq xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked
vpxor xq1y, xq1y, xtmp1y ; q = q<<1 ^ poly_masked
XLDR xs1y, [ptr+pos] ; Get next vector (source data)
jg next_vect32 ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect32 ; Loop for each vect; jge (not jg) so the tmp == 0 pass, which loads source 0, still runs

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0]
vpxor xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
vpxor xq1y, xq1y, xs1y ;q ^= 1 * s[0]
XSTR [ptr+pos], xp1y ;Write parity P vector
XSTR [tmp+pos], xq1y ;Write parity Q vector
add pos, 32
Expand All @@ -219,7 +219,7 @@ next_vect32:


return_pass:
mov return, 0
xor DWORD(return), DWORD(return)
FUNC_RESTORE
ret

Expand Down
48 changes: 22 additions & 26 deletions raid/pq_gen_avx512_gfni.asm
Original file line number Diff line number Diff line change
Expand Up @@ -116,53 +116,48 @@
; 0 0 0 0 0 0 0 1
; 1 0 0 0 0 0 0 0
default rel
section .data
align 64
gf_matrix:
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80


[bits 64]
section .text

align 16
align 32
mk_global pq_gen_avx512_gfni, function
func(pq_gen_avx512_gfni)
FUNC_SAVE
sub vec, 3 ;Keep as offset to last source
jng return_fail ;Must have at least 2 sources
cmp len, 0
test len, len
je return_pass
test len, (32-1) ;Check alignment of length
test BYTE(len), (32-1) ;Check alignment of length
jnz return_fail

vmovdqa64 gfmatrix, [rel gf_matrix]
vpbroadcastq gfmatrix, [rel gf_matrix]

xor pos, pos
cmp len, 128
jl loop32
cmp len, 127
jle loop32

len_aligned_32bytes:
sub len, 2*64 ;Len points to last block

align 16
loop128:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1, [ptr+pos] ;Preload last vector (source)
XLDR xs2, [ptr+pos+64] ;Preload last vector (source)
vpxorq xp1, xp1, xp1 ;p1 = 0
vpxorq xp2, xp2, xp2 ;p2 = 0
vpxorq xq1, xq1, xq1 ;q1 = 0
vpxorq xq2, xq2, xq2 ;q2 = 0

align 16
next_vect:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxorq xq1, xq1, xs1 ; q1 ^= s1
vpxorq xq2, xq2, xs2 ; q2 ^= s2
Expand All @@ -172,7 +167,8 @@ next_vect:
XLDR xs2, [ptr+pos+64] ; Get next vector (source data2)
vgf2p8affineqb xq1, xq1, gfmatrix, 0x00
vgf2p8affineqb xq2, xq2, gfmatrix, 0x00
jg next_vect ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
Expand All @@ -196,24 +192,24 @@ next_vect:

loop32:
mov ptr, [arg2+vec*8] ;Fetch last source pointer
mov tmp, vec ;Set tmp to point back to last vector
lea tmp, [vec-1] ;Set tmp to point back to last vector
XLDR xs1y, [ptr+pos] ;Preload last vector (source)
vpxorq xp1y, xp1y, xp1y ;p = 0
vpxorq xq1y, xq1y, xq1y ;q = 0
vpxor xp1y, xp1y, xp1y ;p = 0
vpxor xq1y, xq1y, xq1y ;q = 0

next_vect32:
sub tmp, 1 ;Inner loop for each source vector
mov ptr, [arg2+tmp*8] ; get pointer to next vect
vpxorq xq1y, xq1y, xs1y ; q1 ^= s1
vpxor xq1y, xq1y, xs1y ; q1 ^= s1
vgf2p8affineqb xq1y, xq1y, gfmatrixy, 0x00
vpxorq xp1y, xp1y, xs1y ; p ^= s
vpxor xp1y, xp1y, xs1y ; p ^= s
XLDR xs1y, [ptr+pos] ; Get next vector (source data)
jg next_vect32 ; Loop for each vect except 0
sub tmp, 1 ;Inner loop for each source vector
jge next_vect32 ; Loop for each vect

mov ptr, [arg2+8+vec*8] ;Get address of P parity vector
mov tmp, [arg2+(2*8)+vec*8] ;Get address of Q parity vector
vpxorq xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
vpxorq xq1y, xq1y, xs1y ;q ^= 1 * s[0]
vpxor xp1y, xp1y, xs1y ;p ^= s[0] - last source is already loaded
vpxor xq1y, xq1y, xs1y ;q ^= 1 * s[0]
XSTR [ptr+pos], xp1y ;Write parity P vector
XSTR [tmp+pos], xq1y ;Write parity Q vector
add pos, 32
Expand All @@ -222,7 +218,7 @@ next_vect32:


return_pass:
mov return, 0
xor DWORD(return), DWORD(return)
FUNC_RESTORE
ret

Expand Down
Loading