[x265] [PATCH 2 of 2] arm: Implement interp_8tap_vert_ps_NxN NEON
Deepthi Nandakumar
deepthi at multicorewareinc.com
Wed Mar 23 07:34:32 CET 2016
Generally, developers can include the perf gain from the testbench (for
their particular primitive) within the commit message.
On Wed, Mar 23, 2016 at 10:26 AM, Ramya Sriraman <ramya at multicorewareinc.com
> wrote:
> On average, the speedup is 1.5x to 3x, depending on the block size.
> This is the console output after running testbench for all the vertical
> interpolation filter patches.
>
> Primitive--------------speedup------cpu cycles-----
>
> luma_vpp[ 4x4] 1.20x 2.88 3.45
> luma_vps[ 4x4] 1.25x 2.82 3.52
> luma_vsp[ 4x4] 1.29x 2.79 3.60
>
> luma_vpp[ 8x8] 2.02x 3.67 7.42
> luma_vps[ 8x8] 2.01x 3.69 7.42
> luma_vsp[ 8x8] 2.05x 3.63 7.43
>
> luma_vpp[16x16] 2.70x 8.81 23.83
> luma_vps[16x16] 2.68x 8.81 23.57
> luma_vsp[16x16] 2.80x 8.53 23.87
>
> luma_vpp[32x32] 2.95x 28.97 85.51
> luma_vps[32x32] 2.94x 28.86 84.85
> luma_vsp[32x32] 3.11x 27.74 86.27
>
> luma_vpp[64x64] 3.01x 109.40 328.88
> luma_vps[64x64] 2.99x 110.63 330.28
> luma_vsp[64x64] 3.08x 108.43 333.58
>
> luma_vpp[ 8x4] 1.76x 2.77 4.87
> luma_vps[ 8x4] 1.71x 2.79 4.76
> luma_vsp[ 8x4] 1.73x 2.76 4.77
>
> luma_vpp[ 4x8] 1.19x 3.85 4.58
> luma_vps[ 4x8] 1.34x 3.67 4.93
> luma_vsp[ 4x8] 1.38x 3.59 4.95
>
> luma_vpp[ 16x8] 2.40x 5.40 12.95
> luma_vps[ 16x8] 2.38x 5.40 12.85
> luma_vsp[ 16x8] 2.43x 5.29 12.87
>
> luma_vpp[ 8x16] 2.33x 5.50 12.81
> luma_vps[ 8x16] 2.24x 5.70 12.79
> luma_vsp[ 8x16] 2.35x 5.46 12.84
>
> luma_vpp[32x16] 2.82x 15.47 43.56
> luma_vps[32x16] 2.82x 15.44 43.46
> luma_vsp[32x16] 2.96x 14.90 44.10
>
> luma_vpp[16x32] 2.91x 15.67 45.63
> luma_vps[16x32] 2.92x 15.56 45.42
> luma_vsp[16x32] 3.00x 15.10 45.34
>
> luma_vpp[64x32] 2.97x 55.51 165.10
> luma_vps[64x32] 2.96x 55.54 164.41
> luma_vsp[64x32] 3.15x 52.93 166.78
>
> luma_vpp[32x64] 3.02x 56.05 169.12
> luma_vps[32x64] 3.00x 56.30 168.71
> luma_vsp[32x64] 3.11x 55.16 171.32
>
> luma_vpp[16x12] 2.61x 7.08 18.51
> luma_vps[16x12] 2.58x 7.08 18.28
> luma_vsp[16x12] 2.68x 6.89 18.45
>
> luma_vpp[12x16] 2.05x 8.68 17.75
> luma_vps[12x16] 2.13x 8.39 17.86
> luma_vsp[12x16] 2.10x 8.49 17.86
>
> luma_vpp[ 16x4] 2.07x 3.62 7.49
> luma_vps[ 16x4] 2.05x 3.59 7.38
> luma_vsp[ 16x4] 2.14x 3.53 7.56
>
> luma_vpp[ 4x16] 1.35x 5.81 7.85
> luma_vps[ 4x16] 1.41x 5.50 7.78
> luma_vsp[ 4x16] 1.49x 5.32 7.90
>
> luma_vpp[32x24] 2.92x 22.19 64.76
> luma_vps[32x24] 2.91x 22.10 64.22
> luma_vsp[32x24] 3.05x 21.24 64.68
>
> luma_vpp[24x32] 2.94x 22.31 65.56
> luma_vps[24x32] 2.91x 22.36 65.07
> luma_vsp[24x32] 3.07x 21.51 66.02
>
> luma_vpp[ 32x8] 2.62x 8.75 22.87
> luma_vps[ 32x8] 2.63x 8.64 22.75
> luma_vsp[ 32x8] 2.74x 8.36 22.89
>
> luma_vpp[ 8x32] 2.59x 8.98 23.27
> luma_vps[ 8x32] 2.53x 9.34 23.60
> luma_vsp[ 8x32] 2.62x 8.97 23.54
>
> luma_vpp[64x48] 2.99x 82.53 247.04
> luma_vps[64x48] 2.99x 82.54 246.76
> luma_vsp[64x48] 3.12x 79.86 249.17
>
> luma_vpp[48x64] 3.01x 82.74 248.97
> luma_vps[48x64] 3.00x 82.95 248.84
> luma_vsp[48x64] 3.14x 80.15 251.90
>
> luma_vpp[64x16] 2.91x 28.75 83.63
> luma_vps[64x16] 2.91x 28.77 83.57
> luma_vsp[64x16] 3.08x 27.31 84.02
>
> luma_vpp[16x64] 3.04x 29.37 89.35
> luma_vps[16x64] 3.04x 29.14 88.66
> luma_vsp[16x64] 3.17x 28.21 89.31
>
>
>
> Thank you
> Regards
> Ramya
>
> On Tue, Mar 22, 2016 at 7:21 PM, Pradeep Ramachandran <
> pradeep at multicorewareinc.com> wrote:
>
>> What improvement in cycles do we see in the testbench with this
>> patch?
>>
>> Pradeep Ramachandran, PhD
>> Solution Architect at www.multicorewareinc.com/
>> Visiting Professor at www.cse.iitm.ac.in/
>> pradeeprama.info/
>> Ph: +91 99627 82018
>>
>> On Tue, Mar 22, 2016 at 6:57 PM, <ramya at multicorewareinc.com> wrote:
>>
>>> # HG changeset patch
>>> # User Ramya Sriraman<ramya at multicorewareinc.com>
>>> # Date 1458652316 -19800
>>> # Tue Mar 22 18:41:56 2016 +0530
>>> # Node ID fd95ed60b242adffbeb0991609271c8a15040ff9
>>> # Parent a9014e51d47ee5cdfe381d02526b1c94082cd4bf
>>> arm: Implement interp_8tap_vert_ps_NxN NEON
>>>
>>> diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/asm-primitives.cpp
>>> --- a/source/common/arm/asm-primitives.cpp Tue Mar 22 11:10:43 2016
>>> +0530
>>> +++ b/source/common/arm/asm-primitives.cpp Tue Mar 22 18:41:56 2016
>>> +0530
>>> @@ -354,6 +354,32 @@
>>> p.pu[LUMA_24x32].luma_vsp =
>>> PFX(interp_8tap_vert_sp_24x32_neon);
>>> p.pu[LUMA_48x64].luma_vsp =
>>> PFX(interp_8tap_vert_sp_48x64_neon);
>>> p.pu[LUMA_12x16].luma_vsp =
>>> PFX(interp_8tap_vert_sp_12x16_neon);
>>> +
>>> + p.pu[LUMA_4x4].luma_vps = PFX(interp_8tap_vert_ps_4x4_neon);
>>> + p.pu[LUMA_4x8].luma_vps = PFX(interp_8tap_vert_ps_4x8_neon);
>>> + p.pu[LUMA_4x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_4x16_neon);
>>> + p.pu[LUMA_8x4].luma_vps = PFX(interp_8tap_vert_ps_8x4_neon);
>>> + p.pu[LUMA_8x8].luma_vps = PFX(interp_8tap_vert_ps_8x8_neon);
>>> + p.pu[LUMA_8x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_8x16_neon);
>>> + p.pu[LUMA_8x32].luma_vps =
>>> PFX(interp_8tap_vert_ps_8x32_neon);
>>> + p.pu[LUMA_16x4].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x4_neon);
>>> + p.pu[LUMA_16x8].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x8_neon);
>>> + p.pu[LUMA_16x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x16_neon);
>>> + p.pu[LUMA_16x32].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x32_neon);
>>> + p.pu[LUMA_16x64].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x64_neon);
>>> + p.pu[LUMA_16x12].luma_vps =
>>> PFX(interp_8tap_vert_ps_16x12_neon);
>>> + p.pu[LUMA_32x8].luma_vps =
>>> PFX(interp_8tap_vert_ps_32x8_neon);
>>> + p.pu[LUMA_32x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_32x16_neon);
>>> + p.pu[LUMA_32x32].luma_vps =
>>> PFX(interp_8tap_vert_ps_32x32_neon);
>>> + p.pu[LUMA_32x64].luma_vps =
>>> PFX(interp_8tap_vert_ps_32x64_neon);
>>> + p.pu[LUMA_32x24].luma_vps =
>>> PFX(interp_8tap_vert_ps_32x24_neon);
>>> + p.pu[LUMA_64x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_64x16_neon);
>>> + p.pu[LUMA_64x32].luma_vps =
>>> PFX(interp_8tap_vert_ps_64x32_neon);
>>> + p.pu[LUMA_64x64].luma_vps =
>>> PFX(interp_8tap_vert_ps_64x64_neon);
>>> + p.pu[LUMA_64x48].luma_vps =
>>> PFX(interp_8tap_vert_ps_64x48_neon);
>>> + p.pu[LUMA_24x32].luma_vps =
>>> PFX(interp_8tap_vert_ps_24x32_neon);
>>> + p.pu[LUMA_48x64].luma_vps =
>>> PFX(interp_8tap_vert_ps_48x64_neon);
>>> + p.pu[LUMA_12x16].luma_vps =
>>> PFX(interp_8tap_vert_ps_12x16_neon);
>>> }
>>> if (cpuMask & X265_CPU_ARMV6)
>>> {
>>> diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/ipfilter8.S
>>> --- a/source/common/arm/ipfilter8.S Tue Mar 22 11:10:43 2016 +0530
>>> +++ b/source/common/arm/ipfilter8.S Tue Mar 22 18:41:56 2016 +0530
>>> @@ -698,7 +698,7 @@
>>> bgt .loop_filterP2S_48x64
>>> bx lr
>>> endfunc
>>> -
>>> +//**************luma_vpp************
>>> .macro LUMA_VPP_4xN h
>>> function x265_interp_8tap_vert_pp_4x\h\()_neon
>>> push {r4, r5, r6}
>>> @@ -1606,4 +1606,333 @@
>>> pop {r4, r5, r6, r7}
>>> bx lr
>>> endfunc
>>> +//**************luma_vps*****************
>>> +.macro LUMA_VPS_4xN h
>>> +function x265_interp_8tap_vert_ps_4x\h\()_neon
>>> + push {r4, r5, r6}
>>> + ldr r4, [sp, #4 * 3]
>>> + lsl r3, #1
>>> + mov r5, r4, lsl #6
>>> + mov r4, r1, lsl #2
>>> + sub r4, r1
>>> + sub r0, r4
>>>
>>> + mov r4, #8192
>>> + vdup.32 q8, r4
>>> + mov r4, #\h
>>> +
>>> +.loop_vps_4x\h:
>>> + movrel r12, g_lumaFilter
>>> + add r12, r5
>>> + mov r6, r0
>>> +
>>> + pld [r6]
>>> + vld1.u32 d0[0], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d0[1], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d1[0], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d1[1], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d2[0], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d2[1], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d3[0], [r6], r1
>>> + pld [r6]
>>> + vld1.u32 d3[1], [r6], r1
>>> +
>>> + veor.u8 q9, q9
>>> +
>>> + vmovl.u8 q11, d0
>>> + vmovl.u16 q12, d22
>>> + vmovl.u16 q13, d23
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q12, q10
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q13, q10
>>> +
>>> + vmovl.u8 q11, d1
>>> + vmovl.u16 q12, d22
>>> + vmovl.u16 q13, d23
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q12, q10
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q13, q10
>>> +
>>> + vmovl.u8 q11, d2
>>> + vmovl.u16 q12, d22
>>> + vmovl.u16 q13, d23
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q12, q10
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q13, q10
>>> +
>>> + vmovl.u8 q11, d3
>>> + vmovl.u16 q12, d22
>>> + vmovl.u16 q13, d23
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q12, q10
>>> + vld1.s32 d20, [r12]!
>>> + vmov.s32 d21, d20
>>> + vmla.s32 q9, q13, q10
>>> +
>>> + vsub.s32 q9, q8
>>> + vqmovn.s32 d0, q9
>>> + vst1.u16 d0, [r2], r3
>>> +
>>> + add r0, r1
>>> + subs r4, #1
>>> + bne .loop_vps_4x\h
>>> +
>>> + pop {r4, r5, r6}
>>> + bx lr
>>> + .ltorg
>>> +endfunc
>>> +.endm
>>> +
>>> +LUMA_VPS_4xN 4
>>> +LUMA_VPS_4xN 8
>>> +LUMA_VPS_4xN 16
>>> +
>>> +
>>> +.macro FILTER_VPS a b filterv
>>> +
>>> +.loop_ps_\filterv\()_\a\()x\b:
>>> +
>>> + mov r7, r2
>>> + mov r6, r0
>>> + eor r8, r8
>>> +
>>> +.loop_ps_w8_\filterv\()_\a\()x\b:
>>> +
>>> + add r6, r0, r8
>>> +
>>> + pld [r6]
>>> + vld1.u8 d0, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d1, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d2, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d3, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d4, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d5, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d6, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d7, [r6], r1
>>> +
>>> + veor.u8 q9, q9
>>> + veor.u8 q10, q10
>>> +
>>> + \filterv
>>> +
>>> + mov r12,#8192
>>> + vdup.32 q8, r12
>>> + vsub.s32 q9, q8
>>> + vqmovn.s32 d0, q9
>>> + vsub.s32 q10, q8
>>> + vqmovn.s32 d1, q10
>>> + vst1.u16 {q0}, [r7]!
>>> +
>>> + add r8, #8
>>> + cmp r8, #\a
>>> + blt .loop_ps_w8_\filterv\()_\a\()x\b
>>> +
>>> + add r0, r1
>>> + add r2, r3
>>> + subs r4, #1
>>> + bne .loop_ps_\filterv\()_\a\()x\b
>>> +
>>> +.endm
>>> +
>>> +.macro LUMA_VPS w h
>>> +function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
>>> +
>>> + push {r4, r5, r6, r7, r8}
>>> + ldr r5, [sp, #4 * 5]
>>> + lsl r3, #1
>>> + mov r4, r1, lsl #2
>>> + sub r4, r1
>>> + sub r0, r4
>>> + mov r4, #\h
>>> +
>>> + cmp r5, #0
>>> + beq 0f
>>> + cmp r5, #1
>>> + beq 1f
>>> + cmp r5, #2
>>> + beq 2f
>>> + cmp r5, #3
>>> + beq 3f
>>> +0:
>>> + FILTER_VPS \w \h qpel_filter_0_32b
>>> + b 5f
>>> +1:
>>> + FILTER_VPS \w \h qpel_filter_1_32b
>>> + b 5f
>>> +2:
>>> + FILTER_VPS \w \h qpel_filter_2_32b
>>> + b 5f
>>> +3:
>>> + FILTER_VPS \w \h qpel_filter_3_32b
>>> + b 5f
>>> +5:
>>> + pop {r4, r5, r6, r7, r8}
>>> + bx lr
>>> +endfunc
>>> +.endm
>>> +
>>> +LUMA_VPS 8 4
>>> +LUMA_VPS 8 8
>>> +LUMA_VPS 8 16
>>> +LUMA_VPS 8 32
>>> +LUMA_VPS 16 4
>>> +LUMA_VPS 16 8
>>> +LUMA_VPS 16 16
>>> +LUMA_VPS 16 32
>>> +LUMA_VPS 16 64
>>> +LUMA_VPS 16 12
>>> +LUMA_VPS 32 8
>>> +LUMA_VPS 32 16
>>> +LUMA_VPS 32 32
>>> +LUMA_VPS 32 64
>>> +LUMA_VPS 32 24
>>> +LUMA_VPS 64 16
>>> +LUMA_VPS 64 32
>>> +LUMA_VPS 64 64
>>> +LUMA_VPS 64 48
>>> +LUMA_VPS 24 32
>>> +LUMA_VPS 48 64
>>> +
>>> +function x265_interp_8tap_vert_ps_12x16_neon
>>> + push {r4, r5, r6, r7}
>>> + lsl r3, #1
>>> + ldr r5, [sp, #4 * 4]
>>> + mov r4, r1, lsl #2
>>> + sub r4, r1
>>> + sub r0, r4
>>> +
>>> + mov r4, #16
>>> +.loop_vps_12x16:
>>> +
>>> + mov r6, r0
>>> + mov r7, r2
>>> +
>>> + pld [r6]
>>> + vld1.u8 d0, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d1, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d2, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d3, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d4, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d5, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d6, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d7, [r6], r1
>>> +
>>> + veor.u8 q9, q9
>>> + veor.u8 q10, q10
>>> +
>>> + cmp r5,#0
>>> + beq 0f
>>> + cmp r5,#1
>>> + beq 1f
>>> + cmp r5,#2
>>> + beq 2f
>>> + cmp r5,#3
>>> + beq 3f
>>> +0:
>>> + qpel_filter_0_32b
>>> + b 5f
>>> +1:
>>> + qpel_filter_1_32b
>>> + b 5f
>>> +2:
>>> + qpel_filter_2_32b
>>> + b 5f
>>> +3:
>>> + qpel_filter_3_32b
>>> + b 5f
>>> +5:
>>> + mov r12,#8192
>>> + vdup.32 q8, r12
>>> + vsub.s32 q9, q8
>>> + vqmovn.s32 d0, q9
>>> + vsub.s32 q10, q8
>>> + vqmovn.s32 d1, q10
>>> + vst1.u8 {q0}, [r7]!
>>> +
>>> + add r6, r0, #8
>>> +
>>> + pld [r6]
>>> + vld1.u8 d0, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d1, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d2, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d3, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d4, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d5, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d6, [r6], r1
>>> + pld [r6]
>>> + vld1.u8 d7, [r6], r1
>>> +
>>> + veor.u8 q9, q9
>>> + veor.u8 q10, q10
>>> +
>>> + cmp r5,#0
>>> + beq 0f
>>> + cmp r5,#1
>>> + beq 1f
>>> + cmp r5,#2
>>> + beq 2f
>>> + cmp r5,#3
>>> + beq 3f
>>> +0:
>>> + qpel_filter_0_32b
>>> + b 5f
>>> +1:
>>> + qpel_filter_1_32b
>>> + b 5f
>>> +2:
>>> + qpel_filter_2_32b
>>> + b 5f
>>> +3:
>>> + qpel_filter_3_32b
>>> + b 5f
>>> +5:
>>> + mov r12,#8192
>>> + vdup.32 q8, r12
>>> + vsub.s32 q9, q8
>>> + vqmovn.s32 d0, q9
>>> + vst1.u8 d0, [r7]!
>>> +
>>> + add r0, r1
>>> + add r2, r3
>>> + subs r4, #1
>>> + bne .loop_vps_12x16
>>> +
>>> + pop {r4, r5, r6, r7}
>>> + bx lr
>>> +endfunc
>>> diff -r a9014e51d47e -r fd95ed60b242 source/common/arm/ipfilter8.h
>>> --- a/source/common/arm/ipfilter8.h Tue Mar 22 11:10:43 2016 +0530
>>> +++ b/source/common/arm/ipfilter8.h Tue Mar 22 18:41:56 2016 +0530
>>> @@ -102,4 +102,30 @@
>>> void x265_interp_8tap_vert_sp_24x32_neon(const int16_t* src, intptr_t
>>> srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
>>> void x265_interp_8tap_vert_sp_48x64_neon(const int16_t* src, intptr_t
>>> srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
>>> void x265_interp_8tap_vert_sp_12x16_neon(const int16_t* src, intptr_t
>>> srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
>>> +
>>> +void x265_interp_8tap_vert_ps_4x4_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_4x8_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_4x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_8x4_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_8x8_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_8x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_8x32_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x4_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x8_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x32_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x64_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_16x12_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_32x8_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_32x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_32x32_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_32x64_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_32x24_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_64x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_64x32_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_64x64_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_64x48_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_24x32_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_48x64_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> +void x265_interp_8tap_vert_ps_12x16_neon(const pixel* src, intptr_t
>>> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
>>> #endif // ifndef X265_IPFILTER8_ARM_H
>>> _______________________________________________
>>> x265-devel mailing list
>>> x265-devel at videolan.org
>>> https://mailman.videolan.org/listinfo/x265-devel
>>>
>>
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160323/480a09f7/attachment-0001.html>
More information about the x265-devel
mailing list