Discussion:
[x264-devel] x86: AVX-512 pixel_avg_weight_w8
Henrik Gramner
2017-06-26 19:59:14 UTC
Permalink
x264 | branch: master | Henrik Gramner <***@gramner.com> | Sat Jun 24 15:12:57 2017 +0200| [ba24899b0bf23345921da022f7a51e0c57dbe73d] | committer: Henrik Gramner

x86: AVX-512 pixel_avg_weight_w8
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=ba24899b0bf23345921da022f7a51e0c57dbe73d
---

common/x86/mc-a.asm | 35 +++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 3 +++
2 files changed, 38 insertions(+)

diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 2dbdee5d..3c1d2145 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -276,6 +276,38 @@ cglobal pixel_avg_weight_w16
vextracti128 [t0+t1], m0, 1
AVG_END

+INIT_YMM avx512
+cglobal pixel_avg_weight_w8 ; weighted blend of two sources, 8 bytes per row, 4 rows per loop iteration
+ BIWEIGHT_START ; macro defined outside this hunk; presumably loads the weights (m3) and scale (m4) used below - confirm
+ kxnorb k1, k1, k1 ; k1 = 0xff (xnor of a register with itself sets all 8 mask bits)
+ kaddb k1, k1, k1 ; k1 = 0xff + 0xff = 0xfe in 8 bits: qword 0 masked off, qwords 1-3 enabled
+ AVG_START 5 ; macro defined outside this hunk; meaning of the argument is not visible here
+.height_loop:
+ movq xm0, [t2] ; low qword of m0 = row 0 of the source at t2
+ movq xm2, [t4] ; low qword of m2 = row 0 of the source at t4
+ movq xm1, [t2+t3] ; low qword of m1 = row 1 of the source at t2 (t3 = its stride)
+ movq xm5, [t4+t5] ; low qword of m5 = row 1 of the source at t4 (t5 = its stride)
+ lea t2, [t2+t3*2] ; advance t2 by two rows
+ lea t4, [t4+t5*2] ; advance t4 by two rows
+ vpbroadcastq m0 {k1}, [t2] ; merge-masked: qwords 1-3 = row 2, qword 0 keeps row 0
+ vpbroadcastq m2 {k1}, [t4] ; same for the source at t4
+ vpbroadcastq m1 {k1}, [t2+t3] ; qwords 1-3 = row 3, qword 0 keeps row 1
+ vpbroadcastq m5 {k1}, [t4+t5] ; same for the source at t4
+ punpcklbw m0, m2 ; interleave bytes of both sources per 128-bit lane: lane 0 = row 0, lane 1 = row 2
+ punpcklbw m1, m5 ; lane 0 = row 1, lane 1 = row 3
+ pmaddubsw m0, m3 ; multiply each byte pair by the byte pair in m3 and sum into 16-bit words
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4 ; rounded scaling by m4: (x*m4 + 0x4000) >> 15
+ pmulhrsw m1, m4
+ packuswb m0, m1 ; saturate to unsigned bytes: lane 0 = rows 0|1, lane 1 = rows 2|3
+ vextracti128 xmm1, m0, 1 ; xmm1 = upper lane (rows 2|3)
+ movq [t0], xm0 ; store row 0
+ movhps [t0+t1], xm0 ; store row 1 (t1 = destination stride)
+ lea t0, [t0+t1*2] ; advance destination by two rows
+ movq [t0], xmm1 ; store row 2
+ movhps [t0+t1], xmm1 ; store row 3
+ AVG_END 4 ; macro defined outside this hunk; presumably advances pointers/height and loops to .height_loop - confirm
+
INIT_ZMM avx512
cglobal pixel_avg_weight_w16
BIWEIGHT_START
@@ -776,6 +808,9 @@ AVGH 16, 8
INIT_XMM avx512
AVGH 16, 16
AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4

%endif ;HIGH_BIT_DEPTH

diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 0a7e414c..c06691c9 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -871,6 +871,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
}
#endif // HIGH_BIT_DEPTH

Loading...