[x265] [PATCH] arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
Ramya Sriraman
ramya at multicorewareinc.com
Tue Mar 15 14:05:34 CET 2016
Thanks Min,
I referred to ffmpeg's NEON version of the filter and tried a similar
design, starting with 8x8. Right now, the filter works for coefficient
indices 0 and 1 only. Is this what you had in mind for better
performance?
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
// qpel_filter_1: 8-tap vertical filter with taps {-1, 4, -10, 58, 17, -5, 1, 0}.
//
// In:    d0..d6 = rows a..g, eight unsigned bytes each (d7 / row h is not read,
//        its tap is 0).
// Out:   q9  = filtered sums for pixels 0..3 (s32, not yet rounded or shifted)
//        q10 = filtered sums for pixels 4..7 (s32, not yet rounded or shifted)
// Clobb: q1 (d2, d3), q2 (d4, d5), q8, q11-q15. The input rows in d2..d5 are
//        destroyed, so the caller must reload them before reusing them.
//
// In the comments below "x.lo" / "x.hi" are pixels 0..3 / 4..7 of row x.
.macro qpel_filter_1
    vmov.i16        d16, #58
    vmovl.u8        q11, d3
    vmull.s16       q9, d22, d16            // 58 * d.lo
    vmull.s16       q10, d23, d16           // 58 * d.hi
    vmov.i16        d17, #10
    vmovl.u8        q13, d2
    vmull.s16       q11, d26, d17           // 10 * c.lo
    vmull.s16       q12, d27, d17           // 10 * c.hi
    vmov.i16        d16, #17
    vmovl.u8        q15, d4
    vmull.s16       q13, d30, d16           // 17 * e.lo
    vmull.s16       q14, d31, d16           // 17 * e.hi
    vmov.i16        d17, #5
    vmovl.u8        q1, d5                  // rows c and d are consumed; reuse q1 as scratch
    vmull.s16       q15, d2, d17            // 5 * f.lo
    vmull.s16       q8, d3, d17             // 5 * f.hi
    vsub.s32        q9, q11                 // 58d - 10c            (lo)
    vsub.s32        q10, q12                // 58d - 10c            (hi)
    vmovl.u8        q1, d1
    vshll.s16       q11, d2, #2             // 4 * b.lo
    vshll.s16       q12, d3, #2             // 4 * b.hi
    vadd.s32        q9, q13                 // 58d - 10c + 17e      (lo)
    vadd.s32        q10, q14                // 58d - 10c + 17e      (hi)
    vmovl.u8        q1, d0
    vmovl.u8        q2, d6                  // rows e and f are consumed; reuse q2 as scratch
    vsubl.s16       q13, d4, d2             // g.lo - a.lo
    vsubl.s16       q14, d5, d3             // g.hi - a.hi
    vadd.s32        q9, q11                 // 58d - 10c + 17e + 4b (lo)
    vadd.s32        q10, q12                // 58d - 10c + 17e + 4b (hi)
    vsub.s32        q13, q15                // g - a - 5f           (lo)
    vsub.s32        q14, q8                 // g - a - 5f           (hi)
    vadd.s32        q9, q13                 // -a + 4b - 10c + 58d + 17e - 5f + g (lo)
    vadd.s32        q10, q14                // -a + 4b - 10c + 58d + 17e - 5f + g (hi)
.endm
// qpel_filter_2: 8-tap vertical filter with the symmetric taps
// {-1, 4, -11, 40, 40, -11, 4, -1}, evaluated as
//     40 * (d + e) - 11 * (c + f) + 4 * (b + g) - (a + h)
//
// In:    d0..d7 = rows a..h, eight unsigned bytes each (left intact).
// Out:   q9  = filtered sums for pixels 0..3 (s32, not yet rounded or shifted)
//        q10 = filtered sums for pixels 4..7 (s32, not yet rounded or shifted)
// Clobb: d16, q11.
//
// Each pairwise sum is at most 510, so it fits a positive s16 lane and can be
// fed straight into the signed widening multiplies.
.macro qpel_filter_2
    vaddl.u8        q11, d3, d4             // d + e
    vmov.i16        d16, #40
    vmull.s16       q9, d22, d16            // 40 * (d + e)          (lo)
    vmull.s16       q10, d23, d16           // 40 * (d + e)          (hi)
    vaddl.u8        q11, d2, d5             // c + f
    vmov.i16        d16, #11
    vmlsl.s16       q9, d22, d16            // ... - 11 * (c + f)    (lo)
    vmlsl.s16       q10, d23, d16           // ... - 11 * (c + f)    (hi)
    vaddl.u8        q11, d1, d6             // b + g
    vmov.i16        d16, #4
    vmlal.s16       q9, d22, d16            // ... + 4 * (b + g)     (lo)
    vmlal.s16       q10, d23, d16           // ... + 4 * (b + g)     (hi)
    vaddl.u8        q11, d0, d7             // a + h
    vsubw.s16       q9, q9, d22             // ... - (a + h)         (lo)
    vsubw.s16       q10, q10, d23           // ... - (a + h)         (hi)
.endm
// qpel_filter_3: 8-tap vertical filter with taps {0, 1, -5, 17, 58, -10, 4, -1},
// evaluated as
//     58 * e + 17 * d - 10 * f - 5 * c + 4 * g + (b - h)
//
// In:    d1..d7 = rows b..h, eight unsigned bytes each (left intact; d0 / row a
//        is not read, its tap is 0).
// Out:   q9  = filtered sums for pixels 0..3 (s32, not yet rounded or shifted)
//        q10 = filtered sums for pixels 4..7 (s32, not yet rounded or shifted)
// Clobb: d16, q11.
.macro qpel_filter_3
    vmovl.u8        q11, d4                 // e
    vmov.i16        d16, #58
    vmull.s16       q9, d22, d16            // 58 * e                (lo)
    vmull.s16       q10, d23, d16           // 58 * e                (hi)
    vmovl.u8        q11, d3                 // d
    vmov.i16        d16, #17
    vmlal.s16       q9, d22, d16            // ... + 17 * d          (lo)
    vmlal.s16       q10, d23, d16           // ... + 17 * d          (hi)
    vmovl.u8        q11, d5                 // f
    vmov.i16        d16, #10
    vmlsl.s16       q9, d22, d16            // ... - 10 * f          (lo)
    vmlsl.s16       q10, d23, d16           // ... - 10 * f          (hi)
    vmovl.u8        q11, d2                 // c
    vmov.i16        d16, #5
    vmlsl.s16       q9, d22, d16            // ... - 5 * c           (lo)
    vmlsl.s16       q10, d23, d16           // ... - 5 * c           (hi)
    vmovl.u8        q11, d6                 // g
    vmov.i16        d16, #4
    vmlal.s16       q9, d22, d16            // ... + 4 * g           (lo)
    vmlal.s16       q10, d23, d16           // ... + 4 * g           (hi)
    vsubl.u8        q11, d1, d7             // b - h; the 16-bit lanes hold the signed difference
    vaddw.s16       q9, q9, d22             // ... + (b - h)         (lo)
    vaddw.s16       q10, q10, d23           // ... + (b - h)         (hi)
.endm
// LUMA_VPP_8xN h: emits x265_interp_8tap_vert_pp_8x<h>_neon, an 8-pixel-wide,
// <h>-row vertical 8-tap filter from 8-bit pixels to 8-bit pixels.
//
// In:    r0   = source pointer (the filter reads from 3 rows above it)
//        r1   = source stride in bytes
//        r2   = destination pointer
//        r3   = destination stride in bytes
//        [sp] = coefficient index (0..3), fifth argument on the stack
// Clobb: r12, q0-q3, q8-q15, flags; r4-r7 are saved and restored.
//
// Per output row, eight source rows a..h are loaded into d0..d7, the selected
// filter leaves 32-bit sums in q9 (pixels 0..3) and q10 (pixels 4..7), and the
// shared tail computes (sum + 32) >> 6 with unsigned saturation to 8 bits.
//
// Each filter path must branch to the shared tail (label 9). Falling through
// into the index-0 path would overwrite q9/q10 with 64 * d3, and d3 may already
// have been destroyed by the filter macro.
//
// q9/q10 are zeroed before dispatch, so a filter macro that expands to nothing
// yields zero pixels rather than undefined output. An index outside 0..3 takes
// the index-1 path.
.macro LUMA_VPP_8xN h
function x265_interp_8tap_vert_pp_8x\h\()_neon
    push            {r4, r5, r6, r7}
    ldr             r5, [sp, #4 * 4]        // coefficient index: first stack word above the 4 saved registers
    add             r4, r1, r1, lsl #1      // r4 = 3 * srcStride
    sub             r0, r4                  // start three rows above the output row
    mov             r4, #\h                 // r4 = rows remaining
.loop_8x\h:
    mov             r6, r0                  // r6 walks the eight input rows
    pld             [r6]
    vld1.u8         d0, [r6], r1            // row a
    pld             [r6]
    vld1.u8         d1, [r6], r1            // row b
    pld             [r6]
    vld1.u8         d2, [r6], r1            // row c
    pld             [r6]
    vld1.u8         d3, [r6], r1            // row d
    pld             [r6]
    vld1.u8         d4, [r6], r1            // row e
    pld             [r6]
    vld1.u8         d5, [r6], r1            // row f
    pld             [r6]
    vld1.u8         d6, [r6], r1            // row g
    pld             [r6]
    vld1.u8         d7, [r6], r1            // row h
    veor.u8         q9, q9                  // clear both accumulators
    veor.u8         q10, q10
    cmp             r5, #0
    beq             0f
    cmp             r5, #1
    beq             1f
    cmp             r5, #2
    beq             2f
    cmp             r5, #3
    beq             3f
1:
    qpel_filter_1
    b               9f                      // skip the other filters and the index-0 path
2:
    qpel_filter_2
    b               9f
3:
    qpel_filter_3
    b               9f
0:
    vmov.i16        d17, #64
    vmovl.u8        q11, d3
    vmull.s16       q9, d22, d17            // 64 * d (pixels 0..3)
    vmull.s16       q10, d23, d17           // 64 * d (pixels 4..7)
9:
    mov             r12, #32
    vdup.32         q8, r12                 // rounding offset for the >> 6 below
    vadd.s32        q9, q8
    vqshrun.s32     d0, q9, #6              // (sum + 32) >> 6, saturated to u16
    vadd.s32        q10, q8
    vqshrun.s32     d1, q10, #6
    vqmovn.u16      d0, q0                  // narrow the eight u16 results to u8
    vst1.u8         d0, [r2], r3
    add             r0, r1                  // advance to the next output row
    subs            r4, #1
    bne             .loop_8x\h
    pop             {r4, r5, r6, r7}
    bx              lr
endfunc
.endm
LUMA_VPP_8xN 8
Thank you
Regards
Ramya
On Mon, Mar 14, 2016 at 5:43 PM, chen <chenm003 at 163.com> wrote:
>
>
> At 2016-03-14 19:09:19,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman<ramya at multicorewareinc.com>
> ># Date 1457681682 -19800
> ># Fri Mar 11 13:04:42 2016 +0530
> ># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
> ># Parent 0af38750a71aab5fe790993365aaaa3e209a7d5c
> >arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
> >
> >diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
> >--- a/source/common/arm/asm-primitives.cpp Thu Mar 10 21:43:35 2016 +0530
> >+++ b/source/common/arm/asm-primitives.cpp Fri Mar 11 13:04:42 2016 +0530
> >@@ -290,6 +290,14 @@
> > // planecopy
> > p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
> >
> >+ // interpolation filters
> >+ p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
> >+ p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
> >+ p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
> >+ p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
> >+ p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
> >+ p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
> >+ p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
> > }
> > if (cpuMask & X265_CPU_ARMV6)
> > {
> >diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
> >--- a/source/common/arm/ipfilter8.S Thu Mar 10 21:43:35 2016 +0530
> >+++ b/source/common/arm/ipfilter8.S Fri Mar 11 13:04:42 2016 +0530
> >@@ -27,6 +27,8 @@
> >
> > .align 4
> >
> >+g_lumaFilter:
> >+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
> > .text
> >
> > // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
> >@@ -692,3 +694,207 @@
> > bgt .loop_filterP2S_48x64
> > bx lr
> > endfunc
> >+
> >+.macro LUMA_VPP_4xN h
> >+function x265_interp_8tap_vert_pp_4x\h\()_neon
> >+ push {r4, r5, r6}
> >+ ldr r4, [sp, #4 * 3]
> >+ mov r5, r4, lsl #6
> >+ mov r4, #3
> >+ mul r4, r1, r4
> How about below:
> mov r4, r1, lsl #2
> sub r4, r1
>
> The code is correct; this is only a performance concern. We could use ffmpeg's hevcdsp_qpel_neon.S as a reference.
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160315/4b16d822/attachment-0001.html>
More information about the x265-devel
mailing list