[x265] [PATCH 2 of 2] asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN

Min Chen chenm003 at 163.com
Wed May 18 18:42:35 CEST 2016


# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1463589741 18000
# Node ID 1fbcfda38731342670911c738342d6e57f75467c
# Parent  46c45f236ab0b25ec92a892f12315024eae2a11d
asm_arm: redesign algorithm and rewrite interp_8tap_vert_pp_4xN
Origin (columns: speedup, optimized, C reference):
luma_vpp[  4x4]		1.87x 	 45.23    	 84.41
luma_vpp[  4x8]		2.10x 	 70.36    	 147.78
luma_vpp[ 4x16]		2.25x 	 121.24   	 272.18

New:
luma_vpp[  4x4]		3.10x 	 27.47    	 85.05
luma_vpp[  4x8]		4.59x 	 32.21    	 147.76
luma_vpp[ 4x16]		6.38x 	 42.73    	 272.48
---
 source/common/arm/ipfilter8.S |  157 +++++++++++++++++++++-------------------
 1 files changed, 82 insertions(+), 75 deletions(-)
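
For reference, the primitive being rewritten is a plain 8-tap vertical FIR: each output pixel is the weighted sum of eight source rows in the same column, rounded by 32, shifted right by 6 and clipped back to the 8-bit pixel range, which is why the assembly rewinds src by 3 * srcStride before the loop and narrows the accumulator with vqrshrun.s16 #6. A minimal scalar sketch of that operation follows (the coefficient rows copy the g_luma_s16 table added below; the function name ref_vert_pp_4xN and its exact signature are illustrative, not x265's actual C reference):

#include <stdint.h>

static const int16_t luma_coeff[4][8] = {
    {  0, 0,   0, 64,  0,   0, 0,  0 },
    { -1, 4, -10, 58, 17,  -5, 1,  0 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

static void ref_vert_pp_4xN(const uint8_t* src, intptr_t srcStride,
                            uint8_t* dst, intptr_t dstStride,
                            int coeffIdx, int height)
{
    const int16_t* c = luma_coeff[coeffIdx];
    src -= 3 * srcStride;                  /* start 3 rows above the target row */

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < 4; x++)
        {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += c[k] * src[k * srcStride + x];

            sum = (sum + 32) >> 6;         /* same rounding as vqrshrun.s16 #6 */
            dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
        }
        src += srcStride;
        dst += dstStride;
    }
}
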

diff -r 46c45f236ab0 -r 1fbcfda38731 source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S	Wed May 18 11:42:18 2016 -0500
+++ b/source/common/arm/ipfilter8.S	Wed May 18 11:42:21 2016 -0500
@@ -3,6 +3,7 @@
  *
  * Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
  *          Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
+ *          Min Chen <min.chen at multicorewareinc.com>
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -42,6 +43,7 @@
 .word -2, -2, 16, 16, 54, 54, -4 ,-4
 .word -2, -2, 10, 10, 58, 58, -2, -2
 
+
 .text
 
 // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
@@ -709,85 +711,90 @@
 endfunc
 
 //**************luma_vpp************
+.align 8
+// TODO: S16 is not ideal here, but VMUL by scalar does not support (U8 x U8)
+g_luma_s16:
+.hword   0, 0,   0, 64,  0,   0, 0,  0
+.hword  -1, 4, -10, 58, 17,  -5, 1,  0
+.hword  -1, 4, -11, 40, 40, -11, 4, -1
+.hword   0, 1,  -5, 17, 58, -10, 4, -1
+
 .macro LUMA_VPP_4xN h
 function x265_interp_8tap_vert_pp_4x\h\()_neon
-    push           {r4, r5, lr}
-    ldr             r4, [sp, #4 * 3]
-    mov             r5, r4, lsl #6
-    mov             r4, r1, lsl #2
-    sub             r4, r1
-    sub             r0, r4
+    ldr         r12, [sp]
+    push        {lr}
+    adr         lr, g_luma_s16
+    sub         r0, r1
+    sub         r0, r0, r1, lsl #1          // src -= 3 * srcStride
+    add         lr, lr, r12, lsl #4
+    vld1.16     {q0}, [lr, :64]             // q0 = luma interpolation coefficients
+    vdup.s16    d24, d0[0]
+    vdup.s16    d25, d0[1]
+    vdup.s16    d26, d0[2]
+    vdup.s16    d27, d0[3]
+    vdup.s16    d28, d1[0]
+    vdup.s16    d29, d1[1]
+    vdup.s16    d30, d1[2]
+    vdup.s16    d31, d1[3]
 
-    mov             r4, #32
-    vdup.32         q8, r4
-    mov             r4, #\h
+    mov         r12, #\h
+
+    // prepare to load 8 lines
+    vld1.u32    {d0[0]}, [r0], r1
+    vld1.u32    {d0[1]}, [r0], r1
+    vld1.u32    {d2[0]}, [r0], r1
+    vld1.u32    {d2[1]}, [r0], r1
+    vld1.u32    {d4[0]}, [r0], r1
+    vld1.u32    {d4[1]}, [r0], r1
+    vld1.u32    {d6[0]}, [r0], r1
+    vld1.u32    {d6[1]}, [r0], r1
+    vmovl.u8    q0, d0
+    vmovl.u8    q1, d2
+    vmovl.u8    q2, d4
+    vmovl.u8    q3, d6
 
 .loop_4x\h:
-    movrel          r12, g_lumaFilter
-    add             r12, r5
-    mov             lr, r0
+    // TODO: reading 1 extra row here would be a speed optimization, but it may cause a crash on the OS X platform!
+    vld1.u32    {d16[0]}, [r0], r1
+    vld1.u32    {d16[1]}, [r0], r1
+    vmovl.u8    q8, d16
 
-    vld1.u32        d0[0], [lr], r1
-    vld1.u32        d0[1], [lr], r1
-    vld1.u32        d1[0], [lr], r1
-    vld1.u32        d1[1], [lr], r1
-    vld1.u32        d2[0], [lr], r1
-    vld1.u32        d2[1], [lr], r1
-    vld1.u32        d3[0], [lr], r1
-    vld1.u32        d3[1], [lr], r1
+    // row[0-1]
+    vmul.s16    q9, q0, q12
+    vext.64     q11, q0, q1, 1
+    vmul.s16    q10, q11, q12
+    vmov        q0, q1
 
-    veor.u8         q9, q9
+    // row[2-3]
+    vmla.s16    q9, q1, q13
+    vext.64     q11, q1, q2, 1
+    vmla.s16    q10, q11, q13
+    vmov        q1, q2
 
-    vmovl.u8        q11, d0
-    vmovl.u16       q12, d22
-    vmovl.u16       q13, d23
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q12, q10
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q13, q10
+    // row[4-5]
+    vmla.s16    q9, q2, q14
+    vext.64     q11, q2, q3, 1
+    vmla.s16    q10, q11, q14
+    vmov        q2, q3
 
-    vmovl.u8        q11, d1
-    vmovl.u16       q12, d22
-    vmovl.u16       q13, d23
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q12, q10
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q13, q10
+    // row[6-7]
+    vmla.s16    q9, q3, q15
+    vext.64     q11, q3, q8, 1
+    vmla.s16    q10, q11, q15
+    vmov        q3, q8
 
-    vmovl.u8        q11, d2
-    vmovl.u16       q12, d22
-    vmovl.u16       q13, d23
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q12, q10
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q13, q10
+    // sum row[0-7]
+    vadd.s16    d18, d18, d19
+    vadd.s16    d19, d20, d21
 
-    vmovl.u8        q11, d3
-    vmovl.u16       q12, d22
-    vmovl.u16       q13, d23
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q12, q10
-    vld1.s32        d20, [r12]!
-    vmov.s32        d21, d20
-    vmla.s32        q9, q13, q10
+    vqrshrun.s16 d18, q9, #6
+    vst1.u32    {d18[0]}, [r2], r3
+    vst1.u32    {d18[1]}, [r2], r3
 
-    vadd.s32        q9, q8
-    vqshrun.s32     d0, q9, #6
-    vqmovn.u16      d0, q0
-    vst1.u32        d0[0], [r2], r3
+    subs        r12, #2
+    bne         .loop_4x\h
 
-    add             r0, r1
-    subs            r4, #1
-    bne             .loop_4x\h
-
-    pop             {r4, r5, pc}
+    pop         {pc}
     .ltorg
 endfunc
 .endm
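
The gain in the numbers above comes mostly from restructuring the loop rather than from any single instruction: the old code re-loaded g_lumaFilter and widened every sample to 32 bits on every row, while the new code broadcasts the eight taps once into q12-q15, keeps the eight most recent rows widened to 16 bits in q0-q3 (two rows per q register), and each iteration loads only two fresh rows (q8) while emitting two output rows. A rough scalar model of that data flow, for readers less fluent in NEON (an illustration under the assumptions just stated; the names below are made up, this is not x265 code):

#include <stdint.h>
#include <string.h>

static void vert_pp_4xN_model(const uint8_t* src, intptr_t srcStride,
                              uint8_t* dst, intptr_t dstStride,
                              const int16_t coeff[8], int height)
{
    /* rows 0..7: the sliding window (q0-q3), rows 8..9: the newly loaded pair (q8) */
    int16_t win[10][4];

    src -= 3 * srcStride;
    for (int r = 0; r < 8; r++)                 /* prologue: widen the first 8 rows */
        for (int x = 0; x < 4; x++)
            win[r][x] = src[r * srcStride + x];
    src += 8 * srcStride;

    for (int y = 0; y < height; y += 2)
    {
        for (int x = 0; x < 4; x++)             /* only 2 new rows loaded per iteration */
        {
            win[8][x] = src[x];
            win[9][x] = src[srcStride + x];
        }
        src += 2 * srcStride;

        for (int row = 0; row < 2; row++)       /* 2 output rows per iteration */
            for (int x = 0; x < 4; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)     /* the vmul/vmla.s16 chain against q12-q15 */
                    sum += coeff[k] * win[row + k][x];
                sum = (sum + 32) >> 6;          /* vqrshrun.s16 #6 */
                dst[row * dstStride + x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
            }

        memmove(win[0], win[2], 8 * sizeof(win[0]));   /* slide the window: vmov q0<-q1 ... q3<-q8 */
        dst += 2 * dstStride;
    }
}
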


