[x264-devel] [PATCH 3/3] PPC: Improve SSD functions by using vec_extract

Michail Alvanos malvanos at gmail.com
Sun Apr 14 12:45:11 CEST 2019


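Improve the AltiVec SSD functions (pixel_ssd_16x16_altivec and
pixel_ssd_8x8_altivec) by reading the final sum directly with vec_extract
instead of broadcasting it with vec_splat and storing it to an aligned
stack temporary with vec_ste.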

Benchmark results (before --> after, lower is better):

Power9:
ssd_8x8_altivec: 105 --> ssd_8x8_altivec: 95
ssd_16x16_altivec: 158 --> ssd_16x16_altivec: 146

Power8:
ssd_8x8_altivec: 109 --> ssd_8x8_altivec: 88
ssd_16x16_altivec: 225 --> ssd_16x16_altivec: 196

---
 common/ppc/pixel.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index d1a14ca9..995aca3a 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -940,8 +940,6 @@ PIXEL_SAD_X4_ALTIVEC( pixel_sad_x4_8x16_altivec, 8 )
 static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                     uint8_t *pix2, intptr_t i_stride_pix2 )
 {
-    ALIGNED_16( int sum );
-
     LOAD_ZERO;
     vec_u8_t  pix1vA, pix2vA, pix1vB, pix2vB;
     vec_u32_t sumv;
@@ -986,16 +984,12 @@ static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
     sumv = vec_msum(diffB, diffB, sumv);
 
     sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
-    sumv = vec_splat(sumv, 3);
-    vec_ste((vec_s32_t) sumv, 0, &sum);
-    return sum;
+    return vec_extract(sumv,3);
 }
 
 static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                   uint8_t *pix2, intptr_t i_stride_pix2 )
 {
-    ALIGNED_16( int sum );
-
     LOAD_ZERO;
     vec_u8_t  pix1v, pix2v;
     vec_u32_t sumv;
@@ -1020,10 +1014,8 @@ static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
     sumv = vec_sel( zero_u32v, sumv, sel );
 
     sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
-    sumv = vec_splat(sumv, 3);
-    vec_ste((vec_s32_t) sumv, 0, &sum);
 
-    return sum;
+    return vec_extract(sumv,3);
 }
 
 
-- 
2.17.1



More information about the x264-devel mailing list