Discussion:
Make ref and i4x4_mode costs global instead of static
(too old to reply)
Anton Mitrofanov
2017-12-25 19:39:53 UTC
Permalink
x264 | branch: master | Anton Mitrofanov <***@narod.ru> | Fri Sep 22 17:18:55 2017 +0300| [bdf27e783a8eb4a5bcae0cd0a950d6dc3d995bfe] | committer: Anton Mitrofanov

Make ref and i4x4_mode costs global instead of static

Addresses some thread-safety concerns and makes the code cleaner.
Downside: slightly higher memory usage when running multiple encoders in the same application.
http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=bdf27e783a8eb4a5bcae0cd0a950d6dc3d995bfe
---

common/common.h | 7 ++++++-
encoder/analyse.c | 16 +++++-----------
encoder/encoder.c | 2 ++
3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/common/common.h b/common/common.h
index 162cfb4d..fe2b1c7f 100644
--- a/common/common.h
+++ b/common/common.h
@@ -343,9 +343,14 @@ struct x264_t
udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */
udctcoef (*nr_offset_emergency)[4][64];

- /* mv/ref cost arrays. */
+ /* mv/ref/mode cost arrays. */
uint16_t *cost_mv[QP_MAX+1];
uint16_t *cost_mv_fpel[QP_MAX+1][4];
+ struct
+ {
+ uint16_t ref[QP_MAX+1][3][33];
+ ALIGNED_64( uint16_t i4x4_mode[QP_MAX+1][32] );
+ } *cost_table;

const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */

diff --git a/encoder/analyse.c b/encoder/analyse.c
index b8b29d9b..a289b242 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -140,10 +140,6 @@ static const uint8_t i_sub_mb_p_cost_table[4] =

static void analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );

-static uint16_t x264_cost_ref[QP_MAX+1][3][33];
-static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
-static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
-
static int init_costs( x264_t *h, float *logs, int qp )
{
if( h->cost_mv[qp] )
@@ -159,11 +155,9 @@ static int init_costs( x264_t *h, float *logs, int qp )
h->cost_mv[qp][-i] =
h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
}
- x264_pthread_mutex_lock( &cost_ref_mutex );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 33; j++ )
- x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
- x264_pthread_mutex_unlock( &cost_ref_mutex );
+ h->cost_table->ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
{
for( int j = 0; j < 4; j++ )
@@ -174,7 +168,7 @@ static int init_costs( x264_t *h, float *logs, int qp )
h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
}
}
- uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + qp*32;
+ uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[qp];
for( int i = 0; i < 17; i++ )
cost_i4x4_mode[i] = 3*lambda*(i!=8);
return 0;
@@ -252,8 +246,8 @@ void x264_analyse_weight_frame( x264_t *h, int end )
static void mb_analyse_load_costs( x264_t *h, x264_mb_analysis_t *a )
{
a->p_cost_mv = h->cost_mv[a->i_qp];
- a->p_cost_ref[0] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
- a->p_cost_ref[1] = x264_cost_ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
+ a->p_cost_ref[0] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l0_active-1,0,2)];
+ a->p_cost_ref[1] = h->cost_table->ref[a->i_qp][x264_clip3(h->sh.i_num_ref_idx_l1_active-1,0,2)];
}

static void mb_analyse_init_qp( x264_t *h, x264_mb_analysis_t *a, int qp )
@@ -749,7 +743,7 @@ static void mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter
return;
}

- uint16_t *cost_i4x4_mode = (uint16_t*)ALIGN((intptr_t)x264_cost_i4x4_mode,64) + a->i_qp*32 + 8;
+ uint16_t *cost_i4x4_mode = h->cost_table->i4x4_mode[a->i_qp] + 8;
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 088f5411..ff18054d 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1527,6 +1527,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
h->frames.i_largest_pts = h->frames.i_second_largest_pts = -1;
h->frames.i_poc_last_open_gop = -1;

+ CHECKED_MALLOCZERO( h->cost_table, sizeof(*h->cost_table) );
CHECKED_MALLOCZERO( h->frames.unused[0], (h->frames.i_delay + 3) * sizeof(x264_frame_t *) );
/* Allocate room for max refs plus a few extra just in case. */
CHECKED_MALLOCZERO( h->frames.unused[1], (h->i_thread_frames + X264_REF_MAX + 4) * sizeof(x264_frame_t *) );
@@ -4364,6 +4365,7 @@ void x264_encoder_close ( x264_t *h )
x264_free( h->nal_buffer );
x264_free( h->reconfig_h );
x264_analyse_free_costs( h );
+ x264_free( h->cost_table );

if( h->i_thread_frames > 1 )
h = h->thread[h->i_thread_phase];

Loading...