[x265] [PATCH 1 of 2] asm_arm: improve interp_8tap_vert_pp_4xN by (1) removing unnecessary cache prefetch (pld) instructions and (2) replacing register r6 with lr

Min Chen chenm003 at 163.com
Wed May 18 18:41:09 CEST 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1463589653 18000
# Node ID d6990d957a9958a0b128b8a6d5c6a4954af99bbd
# Parent  28cf9adfc82e3816189b26aaeb907393b2a82ed8
asm_arm: improve interp_8tap_vert_pp_4xN by (1) removing unnecessary cache prefetch (pld) instructions and (2) replacing register r6 with lr
Original (columns: speedup = reference time / asm time, asm time, reference time):
luma_vpp[  4x4]		1.87x 	 45.23    	 84.41
luma_vpp[  4x8]		2.10x 	 70.36    	 147.78
luma_vpp[ 4x16]		2.25x 	 121.24   	 272.18

Optimized:
luma_vpp[  4x4]		1.98x 	 42.42    	 84.02
luma_vpp[  4x8]		2.32x 	 63.70    	 147.49
luma_vpp[ 4x16]		2.51x 	 108.39   	 272.18
---
 source/common/arm/ipfilter8.S |   31 +++++++++++--------------------
 1 files changed, 11 insertions(+), 20 deletions(-)

diff -r 28cf9adfc82e -r d6990d957a99 source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S	Wed May 18 02:01:34 2016 +0000
+++ b/source/common/arm/ipfilter8.S	Wed May 18 11:40:53 2016 -0500
@@ -711,7 +711,7 @@
 //**************luma_vpp************
 .macro LUMA_VPP_4xN h
 function x265_interp_8tap_vert_pp_4x\h\()_neon
-    push           {r4, r5, r6}
+    push           {r4, r5, lr}
     ldr             r4, [sp, #4 * 3]
     mov             r5, r4, lsl #6
     mov             r4, r1, lsl #2
@@ -725,24 +725,16 @@
 .loop_4x\h:
     movrel          r12, g_lumaFilter
     add             r12, r5
-    mov             r6, r0
+    mov             lr, r0
 
-    pld [r6]
-    vld1.u32        d0[0], [r6], r1
-    pld [r6]
-    vld1.u32        d0[1], [r6], r1
-    pld [r6]
-    vld1.u32        d1[0], [r6], r1
-    pld [r6]
-    vld1.u32        d1[1], [r6], r1
-    pld [r6]
-    vld1.u32        d2[0], [r6], r1
-    pld [r6]
-    vld1.u32        d2[1], [r6], r1
-    pld [r6]
-    vld1.u32        d3[0], [r6], r1
-    pld [r6]
-    vld1.u32        d3[1], [r6], r1
+    vld1.u32        d0[0], [lr], r1
+    vld1.u32        d0[1], [lr], r1
+    vld1.u32        d1[0], [lr], r1
+    vld1.u32        d1[1], [lr], r1
+    vld1.u32        d2[0], [lr], r1
+    vld1.u32        d2[1], [lr], r1
+    vld1.u32        d3[0], [lr], r1
+    vld1.u32        d3[1], [lr], r1
 
     veor.u8         q9, q9
 
@@ -795,8 +787,7 @@
     subs            r4, #1
     bne             .loop_4x\h
 
-    pop             {r4, r5, r6}
-    bx              lr
+    pop             {r4, r5, pc}
     .ltorg
 endfunc
 .endm



More information about the x265-devel mailing list