Discussion:
[x264-devel] [PATCH 1/3] ppc: Use xxpermdi to halve the computation in sad_x4_8x8
Luca Barbato
2018-08-19 15:27:53 UTC
Permalink
About 20% faster.
---
common/ppc/pixel.c | 47 ++++++++++++++++++++++++-----------------------
1 file changed, 24 insertions(+), 23 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index abb0f59b..a1b52ca3 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1073,56 +1073,57 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,

for( int y = 0; y < 4; y++ )
{
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vH = vec_vsx_ld(0, pix0);
pix0 += i_stride;

- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vH = vec_vsx_ld(0, pix1);
pix1 += i_stride;

- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvH = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;

- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vH = vec_vsx_ld(0, pix2);
pix2 += i_stride;

- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vH = vec_vsx_ld(0, pix3);
pix3 += i_stride;

- sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
- sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
- sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
- sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
-
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vL = vec_vsx_ld(0, pix0);
pix0 += i_stride;

- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vL = vec_vsx_ld(0, pix1);
pix1 += i_stride;

- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvL = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;

- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vL = vec_vsx_ld(0, pix2);
pix2 += i_stride;

- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vL = vec_vsx_ld(0, pix3);
pix3 += i_stride;

+ fencv = xxpermdi(fencvH, fencvL, 0);
+ pix0v = xxpermdi(pix0vH, pix0vL, 0);
+ pix1v = xxpermdi(pix1vH, pix1vL, 0);
+ pix2v = xxpermdi(pix2vH, pix2vL, 0);
+ pix3v = xxpermdi(pix3vH, pix3vL, 0);
+
sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
}

- sum0v = vec_sum2s( sum0v, zero_s32v );
- sum1v = vec_sum2s( sum1v, zero_s32v );
- sum2v = vec_sum2s( sum2v, zero_s32v );
- sum3v = vec_sum2s( sum3v, zero_s32v );
+ sum0v = vec_sums( sum0v, zero_s32v );
+ sum1v = vec_sums( sum1v, zero_s32v );
+ sum2v = vec_sums( sum2v, zero_s32v );
+ sum3v = vec_sums( sum3v, zero_s32v );

- sum0v = vec_splat( sum0v, 1 );
- sum1v = vec_splat( sum1v, 1 );
- sum2v = vec_splat( sum2v, 1 );
- sum3v = vec_splat( sum3v, 1 );
+ sum0v = vec_splat( sum0v, 3 );
+ sum1v = vec_splat( sum1v, 3 );
+ sum2v = vec_splat( sum2v, 3 );
+ sum3v = vec_splat( sum3v, 3 );

vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
--
2.12.2
Luca Barbato
2018-08-19 15:27:54 UTC
Permalink
Yet another use of xxpermdi, for another 10% gain.
---
common/ppc/pixel.c | 17 ++++-------------
1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index a1b52ca3..0e733a0d 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1120,20 +1120,11 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
sum2v = vec_sums( sum2v, zero_s32v );
sum3v = vec_sums( sum3v, zero_s32v );

- sum0v = vec_splat( sum0v, 3 );
- sum1v = vec_splat( sum1v, 3 );
- sum2v = vec_splat( sum2v, 3 );
- sum3v = vec_splat( sum3v, 3 );
+ vec_s32_t s01 = vec_mergel(sum0v, sum1v);
+ vec_s32_t s23 = vec_mergel(sum2v, sum3v);
+ vec_s32_t s = xxpermdi(s01, s23, 3);

- vec_ste( sum0v, 0, &sum0);
- vec_ste( sum1v, 0, &sum1);
- vec_ste( sum2v, 0, &sum2);
- vec_ste( sum3v, 0, &sum3);
-
- scores[0] = sum0;
- scores[1] = sum1;
- scores[2] = sum2;
- scores[3] = sum3;
+ vec_vsx_st(s, 0, scores);
}

static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
--
2.12.2
Luca Barbato
2018-08-19 15:27:55 UTC
Permalink
Roughly a 2% speedup in overall encoding with --slow.
---
common/ppc/mc.c | 3 ---
common/ppc/ppccommon.h | 12 ++----------
common/ppc/predict.c | 2 --
3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 2faddfd9..3ceb1ac8 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -51,7 +51,6 @@ static inline void pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src2, int i_height )
{
vec_u8_t src1v, src2v;
- PREP_STORE8;

for( int y = 0; y < i_height; y++ )
{
@@ -525,7 +524,6 @@ static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
srcp = &src[i_src_stride];

LOAD_ZERO;
- PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
vec_u8_t dstuv, dstvv;
@@ -1098,7 +1096,6 @@ static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, in
const x264_weight_t *weight, int i_height )
{
LOAD_ZERO;
- PREP_STORE8;
vec_u8_t srcv;
vec_s16_t weightv;
vec_s16_t scalev, offsetv, denomv, roundv;
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index fd9d6a7d..311e12a2 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -146,18 +146,10 @@ typedef union {
#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )

/***********************************************************************
- * PREP_STORE##n: declares required vectors to store n bytes to a
- * potentially unaligned address
* VEC_STORE##n: stores n bytes from vector v to address p
**********************************************************************/
-#define PREP_STORE8 \
- vec_u8_t _tmp3v; \
- vec_u8_t mask = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
- 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F } \
-
-#define VEC_STORE8( v, p ) \
- _tmp3v = vec_vsx_ld( 0, p ); \
- v = vec_perm( v, _tmp3v, mask ); \
+#define VEC_STORE8( v, p ) \
+ v = vec_xxpermdi( v, vec_vsx_ld( 0, p ), 1 ); \
vec_vsx_st( v, 0, p )

/***********************************************************************
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 324b4c75..0b6bae42 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -58,8 +58,6 @@ static void predict_8x8c_p_altivec( uint8_t *src )
vec_s16_t induc_v = (vec_s16_t) CV(0, 1, 2, 3, 4, 5, 6, 7);
vec_s16_t add_i0_b_0v = vec_mladd(induc_v, b_v, i00_v);

- PREP_STORE8;
-
for( int i = 0; i < 8; ++i )
{
vec_s16_t shift_0_v = vec_sra(add_i0_b_0v, val5_v);
--
2.12.2
Loading...