[x265] [PATCH] arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon

chen chenm003 at 163.com
Mon Mar 14 13:13:18 CET 2016




At 2016-03-14 19:09:19,ramya at multicorewareinc.com wrote:
># HG changeset patch
># User Ramya Sriraman<ramya at multicorewareinc.com>
># Date 1457681682 -19800
>#      Fri Mar 11 13:04:42 2016 +0530
># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
># Parent  0af38750a71aab5fe790993365aaaa3e209a7d5c
>arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
>
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp	Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp	Fri Mar 11 13:04:42 2016 +0530
>@@ -290,6 +290,14 @@
>         // planecopy
>         p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
> 
>+        // interpolation filters
>+        p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
>+        p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
>+        p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
>+        p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
>+        p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
>+        p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
>+        p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
>     }
>     if (cpuMask & X265_CPU_ARMV6)
>     {
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
>--- a/source/common/arm/ipfilter8.S	Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/ipfilter8.S	Fri Mar 11 13:04:42 2016 +0530
>@@ -27,6 +27,8 @@
> 
> .align 4
> 
>+g_lumaFilter:
>+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1 
> .text
> 
> // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
>@@ -692,3 +694,207 @@
>     bgt         .loop_filterP2S_48x64
>     bx          lr
> endfunc
>+
>+.macro LUMA_VPP_4xN h
>+function x265_interp_8tap_vert_pp_4x\h\()_neon
>+    push            {r4, r5, r6}
>+    ldr             r4, [sp, #4 * 3]
>+    mov             r5, r4, lsl #6
>+    mov             r4, #3
>+    mul             r4, r1, r4
How about the following instead (it computes r4 = r1 * 3 with a shift and a subtract, avoiding the multiply):
mov r4, r1, lsl #2
sub r4, r1

The code is correct; this is only a performance concern. We could use ffmpeg's hevcdsp_qpel_neon.S as a reference.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160314/70f4568b/attachment.html>


More information about the x265-devel mailing list