Discussion:
Merge zero buffers
(too old to reply)
Henrik Gramner
2018-01-18 20:22:57 UTC
Permalink
Raw Message
x264 | branch: master | Henrik Gramner <***@gramner.com> | Tue Jan 16 17:43:24 2018 +0100| [b019515ef4ad77022b849283c62612157e8458a7] | committer: Henrik Gramner

Merge zero buffers

Improves cache efficiency.
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b019515ef4ad77022b849283c62612157e8458a7
---

common/mc.c | 2 +-
common/mc.h | 3 +--
common/tables.c | 3 +++
common/tables.h | 2 ++
encoder/analyse.c | 6 ++----
encoder/me.c | 3 +--
encoder/rdo.c | 10 ++++------
7 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/common/mc.c b/common/mc.c
index 38875b9a..d9e4e233 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -155,7 +155,7 @@ static weight_fn_t mc_weight_wtab[6] =
mc_weight_w16,
mc_weight_w20,
};
-const x264_weight_t x264_weight_none[3] = { {{0}} };
+
static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
diff --git a/common/mc.h b/common/mc.h
index 8a33929c..8ec00750 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -244,8 +244,7 @@ typedef struct x264_weight_t
weight_fn_t *weightfn;
} ALIGNED_16( x264_weight_t );

-#define x264_weight_none x264_template(weight_none)
-extern const x264_weight_t x264_weight_none[3];
+#define x264_weight_none ((const x264_weight_t*)x264_zero)

#define SET_WEIGHT( w, b, s, d, o )\
{\
diff --git a/common/tables.c b/common/tables.c
index 79d99517..0edcfcc1 100644
--- a/common/tables.c
+++ b/common/tables.c
@@ -2534,3 +2534,6 @@ const vlc_t x264_run_before_init[7][16] =
{ 0x1, 11 }, /* str=00000000001 */
},
};
+
+/* psy_trellis_init() has the largest size requirement of 16*FDEC_STRIDE*sizeof(pixel) */
+ALIGNED_64( uint8_t x264_zero[1024] ) = { 0 };
diff --git a/common/tables.h b/common/tables.h
index 6ca74eae..88248ae8 100644
--- a/common/tables.h
+++ b/common/tables.h
@@ -94,4 +94,6 @@ extern const vlc_t x264_total_zeros_2x2_dc[3][4];
extern const vlc_t x264_total_zeros_2x4_dc[7][8];
extern const vlc_t x264_run_before_init[7][16];

+extern uint8_t x264_zero[1024];
+
#endif
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 3577e5bc..ebae8a81 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -558,12 +558,10 @@ static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra,
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline psy_trellis_init( x264_t *h, int do_both_dct )
{
- ALIGNED_64( static pixel zero[16*FDEC_STRIDE] ) = {0};
-
if( do_both_dct || h->mb.b_transform_8x8 )
- h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
+ h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
if( do_both_dct || !h->mb.b_transform_8x8 )
- h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
+ h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
}

/* Reset fenc satd scores cache for psy RD */
diff --git a/encoder/me.c b/encoder/me.c
index fbb694b6..79bd4062 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -633,7 +633,6 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
uint16_t *sums_base = m->integral;
- ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
@@ -641,7 +640,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
int xn;
uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);

- h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
+ h->pixf.sad_x4[sad_size]( (pixel*)x264_zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
FENC_STRIDE, enc_dc );
if( delta == 4 )
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 41a5dddb..d884ee63 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -96,7 +96,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
static const uint8_t satd_shift_x[3] = {3, 2, 2};
static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
static const uint8_t satd_offset[3] = {0, 8, 16};
- ALIGNED_16( static pixel zero[16] ) = {0};
int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
+ satd_offset[size - PIXEL_8x4];
int res = h->mb.pic.fenc_satd_cache[cache_index];
@@ -105,8 +104,8 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
else
{
pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
- int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
- res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+ int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+ res = h->pixf.satd[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) - dc;
h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
return res;
}
@@ -123,7 +122,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )

static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
{
- ALIGNED_16( static pixel zero[16] ) = {0};
int satd = 0;
pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@@ -140,8 +138,8 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
}
else
{
- int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
- satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
+ int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+ satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) - dc - cached_satd( h, size, x, y ));
}
satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
}

Loading...