Discussion:
x86: AVX-512 mbtree_fix8_pack and mbtree_fix8_unpack
(too old to reply)
Henrik Gramner
2017-12-25 19:39:56 UTC
Permalink
x264 | branch: master | Henrik Gramner <***@gramner.com> | Sat Oct 7 12:06:51 2017 +0200| [5b62ab59be01579ab37033cc86527df922efb843] | committer: Anton Mitrofanov

x86: AVX-512 mbtree_fix8_pack and mbtree_fix8_unpack

Takes advantage of opmasks to handle the tail with a partially masked final loop iteration instead of falling back to scalar code.

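Also make some slight improvements to the checkasm test: instead of the five consecutive counts 256..260, the pack and unpack tests now run over a fixed table of counts (5, 384, 392, 400, 415).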
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5b62ab59be01579ab37033cc86527df922efb843
---

common/x86/mc-a2.asm | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++-
common/x86/mc-c.c | 6 +++++
tools/checkasm.c | 10 +++++---
3 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index c437f5de..a4e11616 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -64,7 +64,8 @@ hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
-mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
+; bits 0-3: pshufb, bits 4-7: AVX-512 vpermq
+mbtree_fix8_pack_shuf: db 0x01,0x20,0x43,0x62,0x15,0x34,0x57,0x76,0x09,0x08,0x0b,0x0a,0x0d,0x0c,0x0f,0x0e

pf_256: times 4 dd 256.0
pf_inv16777216: times 4 dd 0x1p-24
@@ -2641,3 +2642,69 @@ INIT_XMM ssse3
MBTREE_FIX8
INIT_YMM avx2
MBTREE_FIX8
+
+%macro MBTREE_FIX8_AVX512_END 0
+ add r2, mmsize/2
+ jle .loop
+ cmp r2d, mmsize/2
+ jl .tail
+ RET
+.tail:
+ ; Do the final loop iteration with partial masking to handle the remaining elements.
+ shrx r3d, r3d, r2d ; (1 << count) - 1
+ kmovd k1, r3d
+ kshiftrd k2, k1, 16
+ jmp .loop
+%endmacro
+
+INIT_ZMM avx512
+cglobal mbtree_fix8_pack, 3,4
+ vbroadcastf32x4 m2, [pf_256]
+ vbroadcasti32x4 m3, [mbtree_fix8_pack_shuf]
+ psrld xm4, xm3, 4
+ pmovzxbq m4, xm4
+ sub r2d, mmsize/2
+ mov r3d, -1
+ movsxdifnidn r2, r2d
+ lea r1, [r1+4*r2]
+ lea r0, [r0+2*r2]
+ neg r2
+ jg .tail
+ kmovd k1, r3d
+ kmovw k2, k1
+.loop:
+ vmulps m0 {k1}{z}, m2, [r1+4*r2]
+ vmulps m1 {k2}{z}, m2, [r1+4*r2+mmsize]
+ cvttps2dq m0, m0
+ cvttps2dq m1, m1
+ packssdw m0, m1
+ pshufb m0, m3
+ vpermq m0, m4, m0
+ vmovdqu16 [r0+2*r2] {k1}, m0
+ MBTREE_FIX8_AVX512_END
+
+cglobal mbtree_fix8_unpack, 3,4
+ vbroadcasti32x8 m3, [mbtree_fix8_unpack_shuf]
+ vbroadcastf32x4 m2, [pf_inv16777216]
+ sub r2d, mmsize/2
+ mov r3d, -1
+ movsxdifnidn r2, r2d
+ lea r1, [r1+2*r2]
+ lea r0, [r0+4*r2]
+ neg r2
+ jg .tail
+ kmovw k1, r3d
+ kmovw k2, k1
+.loop:
+ mova m1, [r1+2*r2]
+ vshufi32x4 m0, m1, m1, q1100
+ vshufi32x4 m1, m1, m1, q3322
+ pshufb m0, m3
+ pshufb m1, m3
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ mulps m0, m2
+ mulps m1, m2
+ vmovaps [r0+4*r2] {k1}, m0
+ vmovaps [r0+4*r2+mmsize] {k2}, m1
+ MBTREE_FIX8_AVX512_END
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index e020fd50..51764811 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -308,10 +308,14 @@ void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, ui
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_pack_avx2 x264_template(mbtree_fix8_pack_avx2)
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
+#define x264_mbtree_fix8_pack_avx512 x264_template(mbtree_fix8_pack_avx512)
+void x264_mbtree_fix8_pack_avx512( uint16_t *dst, float *src, int count );
#define x264_mbtree_fix8_unpack_ssse3 x264_template(mbtree_fix8_unpack_ssse3)
void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
#define x264_mbtree_fix8_unpack_avx2 x264_template(mbtree_fix8_unpack_avx2)
void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
+#define x264_mbtree_fix8_unpack_avx512 x264_template(mbtree_fix8_unpack_avx512)
+void x264_mbtree_fix8_unpack_avx512( float *dst, uint16_t *src, int count );

#define x264_mc_chroma_avx x264_template(mc_chroma_avx)
#define x264_mc_chroma_avx2 x264_template(mc_chroma_avx2)
@@ -1107,4 +1111,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
#if ARCH_X86_64
pf->mbtree_propagate_list = mbtree_propagate_list_avx512;
#endif
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx512;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx512;
}
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 5b1d6f7e..0fc4248b 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1799,6 +1799,8 @@ static int check_mc( int cpu_ref, int cpu_new )
}
}

+ static const uint16_t mbtree_fix8_counts[] = { 5, 384, 392, 400, 415 };
+
if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
{
set_func_name( "mbtree_fix8_pack" );
@@ -1806,9 +1808,9 @@ static int check_mc( int cpu_ref, int cpu_new )
float *fix8_src = (float*)(buf3 + 0x800);
uint16_t *dstc = (uint16_t*)buf3;
uint16_t *dsta = (uint16_t*)buf4;
- for( int i = 0; i < 5; i++ )
+ for( int i = 0; i < ARRAY_SIZE(mbtree_fix8_counts); i++ )
{
- int count = 256 + i;
+ int count = mbtree_fix8_counts[i];

for( int j = 0; j < count; j++ )
fix8_src[j] = (int16_t)(rand()) / 256.0f;
@@ -1833,9 +1835,9 @@ static int check_mc( int cpu_ref, int cpu_new )
uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
float *dstc = (float*)buf3;
float *dsta = (float*)buf4;
- for( int i = 0; i < 5; i++ )
+ for( int i = 0; i < ARRAY_SIZE(mbtree_fix8_counts); i++ )
{
- int count = 256 + i;
+ int count = mbtree_fix8_counts[i];

for( int j = 0; j < count; j++ )
fix8_src[j] = rand();

Loading...