Luca Barbato
2018-08-19 15:27:53 UTC
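About 20% faster (the change is in pixel_sad_x4_8x8_altivec, per the hunk header).
Each loop iteration now loads two rows per block, merges them into a single
vector with xxpermdi, and runs one vec_absd/vec_sum4s pass instead of two; the
final reduction switches from vec_sum2s (element 1) to vec_sums (element 3).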
---
common/ppc/pixel.c | 47 ++++++++++++++++++++++++-----------------------
1 file changed, 24 insertions(+), 23 deletions(-)
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index abb0f59b..a1b52ca3 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1073,56 +1073,57 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
for( int y = 0; y < 4; y++ )
{
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vH = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vH = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvH = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vH = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vH = vec_vsx_ld(0, pix3);
pix3 += i_stride;
- sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
- sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
- sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
- sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
-
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vL = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vL = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvL = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vL = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vL = vec_vsx_ld(0, pix3);
pix3 += i_stride;
+ fencv = xxpermdi(fencvH, fencvL, 0);
+ pix0v = xxpermdi(pix0vH, pix0vL, 0);
+ pix1v = xxpermdi(pix1vH, pix1vL, 0);
+ pix2v = xxpermdi(pix2vH, pix2vL, 0);
+ pix3v = xxpermdi(pix3vH, pix3vL, 0);
+
sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
}
- sum0v = vec_sum2s( sum0v, zero_s32v );
- sum1v = vec_sum2s( sum1v, zero_s32v );
- sum2v = vec_sum2s( sum2v, zero_s32v );
- sum3v = vec_sum2s( sum3v, zero_s32v );
+ sum0v = vec_sums( sum0v, zero_s32v );
+ sum1v = vec_sums( sum1v, zero_s32v );
+ sum2v = vec_sums( sum2v, zero_s32v );
+ sum3v = vec_sums( sum3v, zero_s32v );
- sum0v = vec_splat( sum0v, 1 );
- sum1v = vec_splat( sum1v, 1 );
- sum2v = vec_splat( sum2v, 1 );
- sum3v = vec_splat( sum3v, 1 );
+ sum0v = vec_splat( sum0v, 3 );
+ sum1v = vec_splat( sum1v, 3 );
+ sum2v = vec_splat( sum2v, 3 );
+ sum3v = vec_splat( sum3v, 3 );
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
---
common/ppc/pixel.c | 47 ++++++++++++++++++++++++-----------------------
1 file changed, 24 insertions(+), 23 deletions(-)
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index abb0f59b..a1b52ca3 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1073,56 +1073,57 @@ static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
for( int y = 0; y < 4; y++ )
{
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vH = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vH = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvH = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vH = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vH = vec_vsx_ld(0, pix3);
pix3 += i_stride;
- sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
- sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
- sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
- sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
-
- pix0v = vec_vsx_ld(0, pix0);
+ vec_u8_t pix0vL = vec_vsx_ld(0, pix0);
pix0 += i_stride;
- pix1v = vec_vsx_ld(0, pix1);
+ vec_u8_t pix1vL = vec_vsx_ld(0, pix1);
pix1 += i_stride;
- fencv = vec_vsx_ld(0, fenc);
+ vec_u8_t fencvL = vec_vsx_ld(0, fenc);
fenc += FENC_STRIDE;
- pix2v = vec_vsx_ld(0, pix2);
+ vec_u8_t pix2vL = vec_vsx_ld(0, pix2);
pix2 += i_stride;
- pix3v = vec_vsx_ld(0, pix3);
+ vec_u8_t pix3vL = vec_vsx_ld(0, pix3);
pix3 += i_stride;
+ fencv = xxpermdi(fencvH, fencvL, 0);
+ pix0v = xxpermdi(pix0vH, pix0vL, 0);
+ pix1v = xxpermdi(pix1vH, pix1vL, 0);
+ pix2v = xxpermdi(pix2vH, pix2vL, 0);
+ pix3v = xxpermdi(pix3vH, pix3vL, 0);
+
sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
}
- sum0v = vec_sum2s( sum0v, zero_s32v );
- sum1v = vec_sum2s( sum1v, zero_s32v );
- sum2v = vec_sum2s( sum2v, zero_s32v );
- sum3v = vec_sum2s( sum3v, zero_s32v );
+ sum0v = vec_sums( sum0v, zero_s32v );
+ sum1v = vec_sums( sum1v, zero_s32v );
+ sum2v = vec_sums( sum2v, zero_s32v );
+ sum3v = vec_sums( sum3v, zero_s32v );
- sum0v = vec_splat( sum0v, 1 );
- sum1v = vec_splat( sum1v, 1 );
- sum2v = vec_splat( sum2v, 1 );
- sum3v = vec_splat( sum3v, 1 );
+ sum0v = vec_splat( sum0v, 3 );
+ sum1v = vec_splat( sum1v, 3 );
+ sum2v = vec_splat( sum2v, 3 );
+ sum3v = vec_splat( sum3v, 3 );
vec_ste( sum0v, 0, &sum0);
vec_ste( sum1v, 0, &sum1);
--
2.12.2
2.12.2