Discussion:
x86: AVX-512 load_deinterleave_chroma_fenc
(too old to reply)
Henrik Gramner
2017-12-25 19:39:57 UTC
Permalink
Raw Message
x264 | branch: master | Henrik Gramner <***@gramner.com> | Sun Oct 8 21:23:12 2017 +0200| [d93851ec282eb069f91a6eddab3284f7766cd5bd] | committer: Anton Mitrofanov

x86: AVX-512 load_deinterleave_chroma_fenc
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d93851ec282eb069f91a6eddab3284f7766cd5bd
---

common/x86/mc-a2.asm | 30 +++++++++++++++++++++---------
common/x86/mc-c.c | 6 +++++-
2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index a4e11616..69ed4cd4 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1264,17 +1264,27 @@ cglobal load_deinterleave_chroma_fenc, 4,5
vbroadcasti128 m0, [deinterleave_shuf]
lea r4, [r2*3]
.loop:
- mova xm1, [r1]
- vinserti128 m1, m1, [r1+r2], 1
- mova xm2, [r1+r2*2]
- vinserti128 m2, m2, [r1+r4], 1
+ mova xm1, [r1] ; 0
+ vinserti128 ym1, [r1+r2], 1 ; 1
+%if mmsize == 64
+ mova xm2, [r1+r2*4] ; 4
+ vinserti32x4 m1, [r1+r2*2], 2 ; 2
+ vinserti32x4 m2, [r1+r4*2], 2 ; 6
+ vinserti32x4 m1, [r1+r4], 3 ; 3
+ lea r1, [r1+r2*4]
+ vinserti32x4 m2, [r1+r2], 1 ; 5
+ vinserti32x4 m2, [r1+r4], 3 ; 7
+%else
+ mova xm2, [r1+r2*2] ; 2
+ vinserti128 m2, [r1+r4], 1 ; 3
+%endif
+ lea r1, [r1+r2*4]
pshufb m1, m0
pshufb m2, m0
- mova [r0+0*FENC_STRIDE], m1
- mova [r0+2*FENC_STRIDE], m2
- lea r1, [r1+r2*4]
- add r0, 4*FENC_STRIDE
- sub r3d, 4
+ mova [r0], m1
+ mova [r0+mmsize], m2
+ add r0, 2*mmsize
+ sub r3d, mmsize/8
jg .loop
RET
%endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
@@ -1499,6 +1509,8 @@ PLANE_DEINTERLEAVE_RGB
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
+INIT_ZMM avx512
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
%endif

; These functions are not general-use; not only do they require aligned input, but memcpy
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 51764811..0deb1387 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -245,6 +245,8 @@ void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intpt
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2)
void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512)
+void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2)
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3)
@@ -909,6 +911,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma_avx2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}

@@ -1068,6 +1071,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4h = x264_integral_init4h_avx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
}

if( cpu&X264_CPU_AVX512 )
@@ -1077,6 +1081,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
}
#endif // HIGH_BIT_DEPTH

@@ -1096,7 +1101,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_swap = plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2;
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = mbtree_propagate_list_avx2;

Loading...