[x265] [PATCH] arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
chen
chenm003 at 163.com
Mon Mar 14 13:13:18 CET 2016
At 2016-03-14 19:09:19,ramya at multicorewareinc.com wrote:
># HG changeset patch
># User Ramya Sriraman<ramya at multicorewareinc.com>
># Date 1457681682 -19800
># Fri Mar 11 13:04:42 2016 +0530
># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
># Parent 0af38750a71aab5fe790993365aaaa3e209a7d5c
>arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
>
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Fri Mar 11 13:04:42 2016 +0530
>@@ -290,6 +290,14 @@
> // planecopy
> p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
>
>+ // interpolation filters
>+ p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
>+ p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
>+ p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
>+ p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
>+ p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
>+ p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
>+ p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
> }
> if (cpuMask & X265_CPU_ARMV6)
> {
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
>--- a/source/common/arm/ipfilter8.S Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/ipfilter8.S Fri Mar 11 13:04:42 2016 +0530
>@@ -27,6 +27,8 @@
>
> .align 4
>
>+g_lumaFilter:
>+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
> .text
>
> // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
>@@ -692,3 +694,207 @@
> bgt .loop_filterP2S_48x64
> bx lr
> endfunc
>+
>+.macro LUMA_VPP_4xN h
>+function x265_interp_8tap_vert_pp_4x\h\()_neon
>+ push {r4, r5, r6}
>+ ldr r4, [sp, #4 * 3]
>+ mov r5, r4, lsl #6
>+ mov r4, #3
>+ mul r4, r1, r4
How about the following instead?
mov r4, r1, lsl #2
sub r4, r1
The code is correct; this is only a performance issue. We could use ffmpeg's hevcdsp_qpel_neon.S as a reference.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160314/70f4568b/attachment.html>
More information about the x265-devel
mailing list