[x264-devel] [PATCH 2/3] PPC: Improve SAD_x3 and SAD_x4 by using vec_extract

Michail Alvanos malvanos at gmail.com
Sun Apr 14 12:45:10 CEST 2019


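Improve the sad_x3/x4 functions by reading the final sum directly with
vec_extract, instead of splatting it with vec_splat, storing it to an
aligned stack temporary with vec_ste and then copying it into scores[].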

Power9:
sad_x3_8x8_altivec: 142 --> sad_x3_8x8_altivec: 125
sad_x3_8x16_altivec: 214 --> sad_x3_8x16_altivec: 198
sad_x3_16x8_altivec: 158 --> sad_x3_16x8_altivec: 141
sad_x3_16x16_altivec: 269 --> sad_x3_16x16_altivec: 240
sad_x4_8x8_altivec: 148 --> sad_x4_8x8_altivec: 147
sad_x4_8x16_altivec: 233 --> sad_x4_8x16_altivec: 233
sad_x4_16x8_altivec: 183 --> sad_x4_16x8_altivec: 164
sad_x4_16x16_altivec: 300 --> sad_x4_16x16_altivec: 272

Power8:
sad_x3_8x8_altivec: 234 --> sad_x3_8x8_altivec: 179
sad_x3_8x16_altivec: 331 --> sad_x3_8x16_altivec: 277
sad_x3_16x8_altivec: 252 --> sad_x3_16x8_altivec: 203
sad_x3_16x16_altivec: 405 --> sad_x3_16x16_altivec: 328
sad_x4_8x8_altivec: 230 --> sad_x4_8x8_altivec: 239
sad_x4_8x16_altivec: 345 --> sad_x4_8x16_altivec: 344
sad_x4_16x8_altivec: 320 --> sad_x4_16x8_altivec: 272
sad_x4_16x16_altivec: 515 --> sad_x4_16x16_altivec: 437

---
 common/ppc/pixel.c | 100 ++++++++-------------------------------------
 1 file changed, 17 insertions(+), 83 deletions(-)

diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 32968b42..d1a14ca9 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -583,11 +583,6 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                         uint8_t *pix2, uint8_t *pix3,
                                         intptr_t i_stride, int scores[4] )
 {
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
     LOAD_ZERO;
     vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
     vec_s32_t sum0v, sum1v, sum2v, sum3v;
@@ -646,30 +641,16 @@ static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
     sum2v = vec_sums( sum2v, zero_s32v );
     sum3v = vec_sums( sum3v, zero_s32v );
 
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-    sum3v = vec_splat( sum3v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
+    scores[0] = vec_extract(sum0v,3) ;
+    scores[1] = vec_extract(sum1v,3) ;
+    scores[2] = vec_extract(sum2v,3) ;
+    scores[3] = vec_extract(sum3v,3) ;
 }
 
 static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                         uint8_t *pix1, uint8_t *pix2,
                                         intptr_t i_stride, int scores[3] )
 {
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
     LOAD_ZERO;
     vec_u8_t fencv, pix0v, pix1v, pix2v;
     vec_s32_t sum0v, sum1v, sum2v;
@@ -717,27 +698,14 @@ static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
     sum1v = vec_sums( sum1v, zero_s32v );
     sum2v = vec_sums( sum2v, zero_s32v );
 
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
+    scores[0] = vec_extract(sum0v,3) ;
+    scores[1] = vec_extract(sum1v,3) ;
+    scores[2] = vec_extract(sum2v,3) ;
 }
 
 static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
                                        uint8_t *pix3, intptr_t i_stride, int scores[4] )
 {
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-    ALIGNED_16( int sum3 );
-
     LOAD_ZERO;
     vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
     vec_s32_t sum0v, sum1v, sum2v, sum3v;
@@ -795,20 +763,10 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi
     sum2v = vec_sums( sum2v, zero_s32v );
     sum3v = vec_sums( sum3v, zero_s32v );
 
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-    sum3v = vec_splat( sum3v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-    vec_ste( sum3v, 0, &sum3);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
-    scores[3] = sum3;
+    scores[0] = vec_extract(sum0v,3) ;
+    scores[1] = vec_extract(sum1v,3) ;
+    scores[2] = vec_extract(sum2v,3) ;
+    scores[3] = vec_extract(sum3v,3) ;
 }
 
 #define PROCESS_PIXS                                                                  \
@@ -850,10 +808,6 @@ static int name( uint8_t *fenc, uint8_t *pix0,      \
                  uint8_t *pix1, uint8_t *pix2,      \
                  intptr_t i_stride, int scores[3] ) \
 {                                                   \
-    ALIGNED_16( int sum0 );                         \
-    ALIGNED_16( int sum1 );                         \
-    ALIGNED_16( int sum2 );                         \
-                                                    \
     LOAD_ZERO;                                      \
     vec_u8_t fencv, pix0v, pix1v, pix2v;            \
     vec_s32_t sum0v, sum1v, sum2v;                  \
@@ -871,17 +825,9 @@ static int name( uint8_t *fenc, uint8_t *pix0,      \
     sum1v = vec_sums( sum1v, zero_s32v );           \
     sum2v = vec_sums( sum2v, zero_s32v );           \
                                                     \
-    sum0v = vec_splat( sum0v, 3 );                  \
-    sum1v = vec_splat( sum1v, 3 );                  \
-    sum2v = vec_splat( sum2v, 3 );                  \
-                                                    \
-    vec_ste( sum0v, 0, &sum0 );                     \
-    vec_ste( sum1v, 0, &sum1 );                     \
-    vec_ste( sum2v, 0, &sum2 );                     \
-                                                    \
-    scores[0] = sum0;                               \
-    scores[1] = sum1;                               \
-    scores[2] = sum2;                               \
+    scores[0] = vec_extract(sum0v,3);               \
+    scores[1] = vec_extract(sum1v,3);               \
+    scores[2] = vec_extract(sum2v,3);               \
 }
 
 PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x8_altivec, 4 )
@@ -891,10 +837,6 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        intptr_t i_stride, int scores[3] )
 {
-    ALIGNED_16( int sum0 );
-    ALIGNED_16( int sum1 );
-    ALIGNED_16( int sum2 );
-
     LOAD_ZERO;
     vec_u8_t fencv, pix0v, pix1v, pix2v;
     vec_s32_t sum0v, sum1v, sum2v;
@@ -942,17 +884,9 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
     sum1v = vec_sums( sum1v, zero_s32v );
     sum2v = vec_sums( sum2v, zero_s32v );
 
-    sum0v = vec_splat( sum0v, 3 );
-    sum1v = vec_splat( sum1v, 3 );
-    sum2v = vec_splat( sum2v, 3 );
-
-    vec_ste( sum0v, 0, &sum0);
-    vec_ste( sum1v, 0, &sum1);
-    vec_ste( sum2v, 0, &sum2);
-
-    scores[0] = sum0;
-    scores[1] = sum1;
-    scores[2] = sum2;
+    scores[0] = vec_extract(sum0v,3);
+    scores[1] = vec_extract(sum1v,3);
+    scores[2] = vec_extract(sum2v,3);
 }
 
 #define PIXEL_SAD_X4_ALTIVEC( name, ly )                                              \
-- 
2.17.1



More information about the x264-devel mailing list