[x264-devel] [PATCH 1/6] ppc: Manually unroll the horizontal prediction loop
Luca Barbato
lu_zero at gentoo.org
Tue Nov 1 23:16:13 CET 2016
Doubles the speedup of the function (from being slower than C to being
over twice as fast as C).
---
common/ppc/predict.c | 67 +++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 59 insertions(+), 8 deletions(-)
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index f35e180..f25f741 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -176,14 +176,65 @@ static void predict_16x16_dc_128_altivec( uint8_t *src )
static void predict_16x16_h_altivec( uint8_t *src )
{
- for( int i = 0; i < 16; i++ )
- {
- vec_u8_t v = vec_ld(-1, src);
- vec_u8_t v_v = vec_splat(v, 15);
- vec_st(v_v, 0, src);
-
- src += FDEC_STRIDE;
- }
+ vec_u8_t v1 = vec_ld(-1, src);
+ vec_u8_t v2 = vec_ld(-1, src + FDEC_STRIDE);
+ vec_u8_t v3 = vec_ld(-1, src + FDEC_STRIDE * 2);
+ vec_u8_t v4 = vec_ld(-1, src + FDEC_STRIDE * 3);
+
+ vec_u8_t v5 = vec_ld(-1, src + FDEC_STRIDE * 4);
+ vec_u8_t v6 = vec_ld(-1, src + FDEC_STRIDE * 5);
+ vec_u8_t v7 = vec_ld(-1, src + FDEC_STRIDE * 6);
+ vec_u8_t v8 = vec_ld(-1, src + FDEC_STRIDE * 7);
+
+ vec_u8_t v9 = vec_ld(-1, src + FDEC_STRIDE * 8);
+ vec_u8_t vA = vec_ld(-1, src + FDEC_STRIDE * 9);
+ vec_u8_t vB = vec_ld(-1, src + FDEC_STRIDE * 10);
+ vec_u8_t vC = vec_ld(-1, src + FDEC_STRIDE * 11);
+
+ vec_u8_t vD = vec_ld(-1, src + FDEC_STRIDE * 12);
+ vec_u8_t vE = vec_ld(-1, src + FDEC_STRIDE * 13);
+ vec_u8_t vF = vec_ld(-1, src + FDEC_STRIDE * 14);
+ vec_u8_t vG = vec_ld(-1, src + FDEC_STRIDE * 15);
+
+ vec_u8_t v_v1 = vec_splat(v1, 15);
+ vec_u8_t v_v2 = vec_splat(v2, 15);
+ vec_u8_t v_v3 = vec_splat(v3, 15);
+ vec_u8_t v_v4 = vec_splat(v4, 15);
+
+ vec_u8_t v_v5 = vec_splat(v5, 15);
+ vec_u8_t v_v6 = vec_splat(v6, 15);
+ vec_u8_t v_v7 = vec_splat(v7, 15);
+ vec_u8_t v_v8 = vec_splat(v8, 15);
+
+ vec_u8_t v_v9 = vec_splat(v9, 15);
+ vec_u8_t v_vA = vec_splat(vA, 15);
+ vec_u8_t v_vB = vec_splat(vB, 15);
+ vec_u8_t v_vC = vec_splat(vC, 15);
+
+ vec_u8_t v_vD = vec_splat(vD, 15);
+ vec_u8_t v_vE = vec_splat(vE, 15);
+ vec_u8_t v_vF = vec_splat(vF, 15);
+ vec_u8_t v_vG = vec_splat(vG, 15);
+
+ vec_st(v_v1, 0, src);
+ vec_st(v_v2, 0, src + FDEC_STRIDE);
+ vec_st(v_v3, 0, src + FDEC_STRIDE * 2);
+ vec_st(v_v4, 0, src + FDEC_STRIDE * 3);
+
+ vec_st(v_v5, 0, src + FDEC_STRIDE * 4);
+ vec_st(v_v6, 0, src + FDEC_STRIDE * 5);
+ vec_st(v_v7, 0, src + FDEC_STRIDE * 6);
+ vec_st(v_v8, 0, src + FDEC_STRIDE * 7);
+
+ vec_st(v_v9, 0, src + FDEC_STRIDE * 8);
+ vec_st(v_vA, 0, src + FDEC_STRIDE * 9);
+ vec_st(v_vB, 0, src + FDEC_STRIDE * 10);
+ vec_st(v_vC, 0, src + FDEC_STRIDE * 11);
+
+ vec_st(v_vD, 0, src + FDEC_STRIDE * 12);
+ vec_st(v_vE, 0, src + FDEC_STRIDE * 13);
+ vec_st(v_vF, 0, src + FDEC_STRIDE * 14);
+ vec_st(v_vG, 0, src + FDEC_STRIDE * 15);
}
static void predict_16x16_v_altivec( uint8_t *src )
--
2.9.2
More information about the x264-devel
mailing list