Discussion:
[x264-devel] x86: AVX-512 add8x8_idct
Henrik Gramner
2017-06-26 19:58:59 UTC
Permalink
x264 | branch: master | Henrik Gramner <***@gramner.com> | Thu Jun 1 22:13:19 2017 +0200| [0af1c6d0d0cc54ba4f888db39247774edcf19b44] | committer: Henrik Gramner

x86: AVX-512 add8x8_idct
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=0af1c6d0d0cc54ba4f888db39247774edcf19b44
---

common/dct.c | 1 +
common/x86/dct-a.asm | 51 +++++++++++++++++++++++++++++++++++++++++++++++----
common/x86/dct.h | 1 +
3 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 0d7f96de..1b2a2ea6 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -717,6 +717,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
+ dctf->add8x8_idct = x264_add8x8_idct_avx512;
}
#endif //HAVE_MMX

diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 42af7c63..dd8e357d 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -47,10 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
-dct_avx512: dd 0x00000000, 0x00021104, 0x0006314c, 0x00042048 ; bits 0-4: dct8x8_fenc
- dd 0x00008a10, 0x00029b14, 0x0006bb5c, 0x0004aa58 ; bits 5-9: dct8x8_fdec
- dd 0x00004421, 0x00025525, 0x0006756d, 0x00046469 ; bits 10-13: dct16x16_fenc
- dd 0x0000ce31, 0x0002df35, 0x0006ff7d, 0x0004ee79 ; bits 14-18: dct16x16_fdec
+dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec
+ dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec
+ dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2
+ dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
@@ -724,6 +724,49 @@ cglobal sub16x16_dct
SUB4x16_DCT_AVX512 4, 2
SUB4x16_DCT_AVX512 5, 3
RET
+
+%macro SARSUMSUB 3 ; a, b, tmp: a = (a, k1 lanes >>1) - b; b = (b, k1 lanes >>1) + original a. Caller must set k1.
+ mova m%3, m%1 ; keep an unshifted copy of a for the sum below
+ vpsraw m%1 {k1}, 1 ; arithmetic >>1 only on the word lanes selected by k1, other lanes unchanged
+ psubw m%1, m%2 ; 0-2 1>>1-3
+ vpsraw m%2 {k1}, 1 ; same masked halving applied to b (after it was used unshifted above)
+ paddw m%2, m%3 ; 0+2 1+3>>1
+%endmacro
+
+cglobal add8x8_idct, 2,2 ; per the dct.h prototype: r0 = p_dst, r1 = dct[4][16] (int16_t); inverse-transforms dct and adds it to 8x8 pixels at p_dst
+ mova m1, [r1] ; first 64 bytes of coefficients
+ mova m2, [r1+64] ; second 64 bytes of coefficients
+ mova m3, [dct_avx512] ; bit-packed index table; fields are extracted with psrlq below
+ vbroadcasti32x4 m4, [pw_32] ; replicate the 128-bit pw_32 constant to every lane (rounding term added before >>6)
+ mov r1d, 0xf0f0f0f0 ; coefficient pointer is no longer needed, reuse r1 for the mask immediate
+ kxnorb k2, k2, k2 ; k2 = 0xff: enable all 8 qword elements of the gather
+ kmovd k1, r1d ; k1 = word mask consumed by SARSUMSUB (words 4-7 of each group of 8)
+ kmovb k3, k2 ; separate copy for the scatter, since the gather clears its own mask register
+ vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
+ vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
+ psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
+ vpgatherqq m6 {k2}, [r0+m5] ; load 8 qwords (8 bytes each) from r0 + the offsets in m5
+ SARSUMSUB 0, 1, 2 ; first pass, stage 1 on m0/m1 (m2 is scratch)
+ SBUTTERFLY wd, 1, 0, 2 ; SBUTTERFLY is defined outside this excerpt; presumably interleaves words of m1/m0 - TODO confirm
+ psrlq m7, m3, 28 ; low bits of each qword now hold table bits(e) 28-31 (idct8x8_idct2), used by vpermt2q below
+ SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
+ vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
+ SBUTTERFLY dq, 0, 1, 2 ; presumably interleaves dwords of m0/m1 ahead of the second pass - TODO confirm
+ psrlq m3, 24 ; low bits of each qword now hold table bits(e) 24-27 (idct8x8_idct1)
+ SARSUMSUB 0, 1, 2 ; second pass, stage 1
+ vpermi2q m3, m1, m0 ; m3 = qwords picked from m1/m0 by the indices in m3 (indices are overwritten)
+ vpermt2q m1, m7, m0 ; m1 = qwords picked from m1/m0 by the indices in m7
+ paddw m3, m4 ; += 32
+ SUMSUB_BA w, 1, 3, 0 ; second pass, stage 2 (m0 is scratch here)
+ psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
+ psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
+ pxor xm0, xm0 ; zero m0 to serve as the high bytes when widening pixels
+ SBUTTERFLY bw, 6, 0, 2 ; presumably unpacks gathered pixel bytes with zero into words in m6/m0 - TODO confirm
+ paddsw m1, m6 ; residual + pixels, signed-saturating word add
+ paddsw m3, m0 ; same for the other half
+ packuswb m1, m3 ; narrow words to bytes with unsigned saturation
+ vpscatterqq [r0+m5] {k3}, m1 ; store 8 qwords back to the same addresses they were gathered from
+ RET
%endif ; HIGH_BIT_DEPTH

INIT_MMX
diff --git a/common/x86/dct.h b/common/x86/dct.h
index c30b0daa..e173c1fd 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -62,6 +62,7 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
+void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
Loading...