[x264-devel] [PATCH] PPC: Improve SATD by using vec_extract.
Michail Alvanos
malvanos at gmail.com
Wed Jul 10 21:01:04 CEST 2019
Speed up the intra_satd_x3 and satd_* functions by around 1-10% by
using vec_extract instead of vec_splat and vec_ste.
Microbenchmark results:
Power 8:
satd_4x4_altivec: 148 ==> satd_4x4_altivec: 87
satd_4x8_altivec: 186 ==> satd_4x8_altivec: 128
satd_8x4_altivec: 177 ==> satd_8x4_altivec: 114
satd_8x8_altivec: 188 ==> satd_8x8_altivec: 136
satd_8x16_altivec: 300 ==> satd_8x16_altivec: 262
satd_16x8_altivec: 269 ==> satd_16x8_altivec: 228
satd_16x16_altivec: 517 ==> satd_16x16_altivec: 485
intra_satd_x3_4x4_altivec: 528 ==> intra_satd_x3_4x4_altivec: 444
intra_satd_x3_8x8c_altivec: 679 ==> intra_satd_x3_8x8c_altivec: 593
intra_satd_x3_16x16_altivec: 1815 ==> intra_satd_x3_16x16_altivec: 1724
Power 9:
satd_4x4_altivec: 131 ==> satd_4x4_altivec: 113
satd_4x8_altivec: 175 ==> satd_4x8_altivec: 155
satd_8x4_altivec: 150 ==> satd_8x4_altivec: 135
satd_8x8_altivec: 174 ==> satd_8x8_altivec: 161
satd_8x16_altivec: 290 ==> satd_8x16_altivec: 277
satd_16x8_altivec: 272 ==> satd_16x8_altivec: 270
satd_16x16_altivec: 563 ==> satd_16x16_altivec: 566
intra_satd_x3_4x4_altivec: 424 ==> intra_satd_x3_4x4_altivec: 400
intra_satd_x3_8x8c_altivec: 687 ==> intra_satd_x3_8x8c_altivec: 616
intra_satd_x3_16x16_altivec: 2047 ==> intra_satd_x3_16x16_altivec: 2062
---
common/ppc/pixel.c | 27 ++++++++++-----------------
1 file changed, 10 insertions(+), 17 deletions(-)
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index f94029e3..59a74afc 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -161,9 +161,7 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );
- satdv = vec_sum2s( satdv, zero_s32v );
- satdv = vec_splat( satdv, 1 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
return i_satd >> 1;
}
@@ -207,9 +205,7 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
satdv = vec_add( satdv, add_abs_4( temp0v, temp1v, temp2v, temp3v ) );
- satdv = vec_sum2s( satdv, zero_s32v );
- satdv = vec_splat( satdv, 1 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
return i_satd >> 1;
}
@@ -280,9 +276,7 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v );
- satdv = vec_sum2s( satdv, zero_s32v );
- satdv = vec_splat( satdv, 1 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,0) + vec_extract(satdv,1);
return i_satd >> 1;
}
@@ -330,8 +324,8 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -400,8 +394,8 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -470,8 +464,8 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
@@ -575,8 +569,7 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
temp4v, temp5v, temp6v, temp7v ) );
satdv = vec_sums( satdv, zero_s32v );
- satdv = vec_splat( satdv, 3 );
- vec_ste( satdv, 0, &i_satd );
+ i_satd = vec_extract(satdv,3) ;
return i_satd >> 1;
}
--
2.17.1
More information about the x264-devel
mailing list