<div dir="ltr"><div>Thanks min,<br></div>I referred to ffmpeg's NEON version of the filter and tried something similar in design for 8x8 for a start. Right now, the filter works for coefficient index 0 and 1 only. Is this what you had in mind for better performance ? <br>---------------------------------------------------------------------------------------------------------------------------------------------------------------------<br>.macro qpel_filter_1<br> vmov.i16 d16, #58<br> vmovl.u8 q11, d3<br> vmull.s16 q9, d22, d16 // 58 * d0<br> vmull.s16 q10, d23, d16 // 58 * d1<br><br> vmov.i16 d17, #10<br> vmovl.u8 q13, d2<br> vmull.s16 q11, d26, d17 // 10 * c0<br> vmull.s16 q12, d27, d17 // 10 * c1<br><br> vmov.i16 d16, #17<br> vmovl.u8 q15, d4<br> vmull.s16 q13, d30, d16 // 17 * e0<br> vmull.s16 q14, d31, d16 // 17 * e1<br><br> vmov.i16 d17, #5<br> vmovl.u8 q1, d5<br> vmull.s16 q15, d2, d17 // 5 * f0<br> vmull.s16 q8, d3, d17 // 5 * f1<br><br> vsub.s32 q9, q11 // 58 * d0 - 10 * c0<br> vsub.s32 q10, q12 // 58 * d1 - 10 * c1<br><br> vmovl.u8 q1, d1<br> vshll.s16 q11, d2, #2 // 4 * b0<br> vshll.s16 q12, d3, #2 // 4 * b1<br><br> vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0<br> vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1<br><br> vmovl.u8 q1, d0<br> vmovl.u8 q2, d6<br> vsubl.s16 q13, d4, d2 // g0 - a0<br> vsubl.s16 q14, d5, d3 // g1 - a1<br><br> vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0<br> vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1<br> vsub.s32 q13, q15 // g0 - a0 - 5 * f0<br> vsub.s32 q14, q8 // g1 - a1 - 5 * f1<br> vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0<br> vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1<br>.endm<br>.macro qpel_filter_2<br> .endm<br>.macro qpel_filter_3<br>.endm<br>.macro LUMA_VPP_8xN h<br>function x265_interp_8tap_vert_pp_8x\h\()_neon<br> push {r4, r5, r6, r7}<br> ldr r5, [sp, #4 * 4]<br> mov r4, #3<br> mul r4, r1, r4<br> sub r0, r4<br> mov r4, 
#\h<br>.loop_8x\h:<br> mov r6,r0<br> pld [r6]<br> vld1.u8 d0, [r6], r1<br> pld [r6]<br> vld1.u8 d1, [r6], r1<br> pld [r6]<br> vld1.u8 d2, [r6], r1<br> pld [r6]<br> vld1.u8 d3, [r6], r1<br> pld [r6]<br> vld1.u8 d4, [r6], r1<br> pld [r6]<br> vld1.u8 d5, [r6], r1<br> pld [r6]<br> vld1.u8 d6, [r6], r1<br> pld [r6]<br> vld1.u8 d7, [r6], r1<br><br> veor.u8 q9, q9<br> veor.u8 q10, q10<br><br> cmp r5,#0<br> beq 0f<br> cmp r5,#1<br> beq 1f<br> cmp r5,#2<br> beq 2f<br> cmp r5,#3<br> beq 3f<br>1:<br> qpel_filter_1<br>2:<br> qpel_filter_2<br>3:<br> qpel_filter_3<br>0:<br> vmov.i16 d17, #64<br> vmovl.u8 q11, d3<br> vmull.s16 q9, d22, d17 // 64*d0<br> vmull.s16 q10, d23, d17 // 64*d1<br><br> mov r12,#32<br> vdup.32 q8, r12<br> vadd.s32 q9, q8<br> vqshrun.s32 d0, q9, #6<br> vadd.s32 q10, q8<br> vqshrun.s32 d1, q10, #6<br> vqmovn.u16 d0, q0<br> vst1.u8 d0, [r2], r3<br><br> add r0, r1<br> subs r4, #1<br> bne .loop_8x\h<br><br> pop {r4, r5, r6, r7}<br> bx lr<br>endfunc<br>.endm<br><br>LUMA_VPP_8xN 8<br><br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Mon, Mar 14, 2016 at 5:43 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><br><pre><div><div class="h5"><br>At 2016-03-14 19:09:19,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:
># HG changeset patch
># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>>
># Date 1457681682 -19800
># Fri Mar 11 13:04:42 2016 +0530
># Node ID c1d2fa2ca49d4027252bd52176ecbe2db0d0eddd
># Parent 0af38750a71aab5fe790993365aaaa3e209a7d5c
>arm: Implement interp_8tap_vert_pp_4xn_neon and interp_8tap_vert_pp_8xn_neon
>
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Fri Mar 11 13:04:42 2016 +0530
>@@ -290,6 +290,14 @@
> // planecopy
> p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
>
>+ // interpolation filters
>+ p.pu[LUMA_4x4].luma_vpp = PFX(interp_8tap_vert_pp_4x4_neon);
>+ p.pu[LUMA_4x8].luma_vpp = PFX(interp_8tap_vert_pp_4x8_neon);
>+ p.pu[LUMA_4x16].luma_vpp = PFX(interp_8tap_vert_pp_4x16_neon);
>+ p.pu[LUMA_8x4].luma_vpp = PFX(interp_8tap_vert_pp_8x4_neon);
>+ p.pu[LUMA_8x8].luma_vpp = PFX(interp_8tap_vert_pp_8x8_neon);
>+ p.pu[LUMA_8x16].luma_vpp = PFX(interp_8tap_vert_pp_8x16_neon);
>+ p.pu[LUMA_8x32].luma_vpp = PFX(interp_8tap_vert_pp_8x32_neon);
> }
> if (cpuMask & X265_CPU_ARMV6)
> {
>diff -r 0af38750a71a -r c1d2fa2ca49d source/common/arm/ipfilter8.S
>--- a/source/common/arm/ipfilter8.S Thu Mar 10 21:43:35 2016 +0530
>+++ b/source/common/arm/ipfilter8.S Fri Mar 11 13:04:42 2016 +0530
>@@ -27,6 +27,8 @@
>
> .align 4
>
>+g_lumaFilter:
>+.word 0,0,0,0,0,0,64,64,0,0,0,0,0,0,0,0,-1,-1,4,4,-10,-10,58,58,17,17,-5,-5,1,1,0,0,-1,-1,4,4,-11,-11,40,40,40,40,-11,-11,4,4,-1,-1,0,0,1,1,-5,-5,17,17,58,58,-10,-10,4,4,-1,-1
> .text
>
> // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
>@@ -692,3 +694,207 @@
> bgt .loop_filterP2S_48x64
> bx lr
> endfunc
>+
>+.macro LUMA_VPP_4xN h
>+function x265_interp_8tap_vert_pp_4x\h\()_neon
>+ push {r4, r5, r6}
>+ ldr r4, [sp, #4 * 3]
>+ mov r5, r4, lsl #6
>+ mov r4, #3
>+ mul r4, r1, r4<br></div></div>How about below:<br>mov r4, r1, lsl #2<br>sub r4, r1<br>
<br>The code is correct; the only issue is performance. We could use ffmpeg's hevcdsp_qpel_neon.S as a reference.
</pre></div><br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>