[x265] [PATCH] arm: Implement filterPixelToShort ARM NEON asm
Deepthi Nandakumar
deepthi at multicorewareinc.com
Fri Mar 4 07:33:02 CET 2016
This patch doesn't apply.
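For reference, every block size in this patch performs the same per-pixel
conversion: dst = (src << 6) - 8192, i.e. scale the 8-bit pixel up to the
14-bit internal precision and subtract the internal offset. A minimal scalar
sketch of that conversion follows; the function name and the generic
width/height parameters are illustrative only, since the real primitives are
specialized per block size:

    #include <stdint.h>

    /* Illustrative scalar equivalent of the NEON routines below:
     * dst = (src << 6) - 8192 for each pixel in the block. */
    static void filterPixelToShort_ref(const uint8_t* src, intptr_t srcStride,
                                       int16_t* dst, intptr_t dstStride,
                                       int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192);
            src += srcStride;   /* strides are in elements */
            dst += dstStride;
        }
    }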
On Tue, Mar 1, 2016 at 5:46 PM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1456831820 -19800
> # Tue Mar 01 17:00:20 2016 +0530
> # Node ID 61e51faf9e7ee1c8056ac2f66cf51da104bfa106
> # Parent 79c00b9bc2b81afef2e41526fc3c390528f3174c
> arm: Implement filterPixelToShort ARM NEON asm
>
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/CMakeLists.txt Tue Mar 01 17:00:20 2016 +0530
> @@ -89,7 +89,7 @@
> set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
>
> # add ARM assembly/intrinsic files here
> - set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S)
> + set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S)
> set(VEC_PRIMITIVES)
>
> set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/arm/asm-primitives.cpp Tue Mar 01 17:00:20 2016 +0530
> @@ -33,6 +33,7 @@
> #include "blockcopy8.h"
> #include "pixel.h"
> #include "pixel-util.h"
> +#include "ipfilter8.h"
> }
>
> namespace X265_NS {
> @@ -42,6 +43,33 @@
> {
> if (cpuMask & X265_CPU_NEON)
> {
> + // filterPixelToShort
> + p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon);
> + p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon);
> + p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon);
> + p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon);
> + p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon);
> + p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon);
> + p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon);
> + p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
> + p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon);
> + p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon);
> + p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
> + p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
> + p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
> + p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
> + p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
> + p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon);
> + p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
> + p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
> + p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
> + p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
> + p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
> + p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
> + p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
> + p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
> + p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
> +
> // Block_fill
> p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon);
> p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon);
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.S
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/arm/ipfilter8.S Tue Mar 01 17:00:20 2016 +0530
> @@ -0,0 +1,694 @@
> +/*****************************************************************************
> + * Copyright (C) 2016 x265 project
> + *
> + * Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> + *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
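> +// Each routine below computes dst[x] = (src[x] << 6) - 8192 per pixel:
> +// the 8-bit input is widened to 16 bits, then vmla scales it by 64 into
> +// an accumulator pre-loaded with -8192 (the 14-bit internal offset).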
> +function x265_filterPixelToShort_4x4_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 2
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {d4}, [r2], r3
> + vst1.16 {d6}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_4x8_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 4
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {d4}, [r2], r3
> + vst1.16 {d6}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_4x16_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 8
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {d4}, [r2], r3
> + vst1.16 {d6}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x4_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 2
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2}, [r2], r3
> + vst1.16 {q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x8_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 4
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2}, [r2], r3
> + vst1.16 {q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x16_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 8
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2}, [r2], r3
> + vst1.16 {q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_8x32_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 16
> + vld1.u8 {d0}, [r0], r1
> + vld1.u8 {d2}, [r0], r1
> + vmovl.u8 q0, d0
> + vmovl.u8 q1, d2
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2}, [r2], r3
> + vst1.16 {q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_12x16_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 16
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {d4, d5, d6}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x4_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 4
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x8_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 8
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x12_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 12
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x16_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 16
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x32_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 32
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_16x64_neon
> + add r3, r3
> + vmov.u16 q8, #64
> + vmov.u16 q9, #8192
> + vneg.s16 q9, q9
> +.rept 64
> + vld1.u8 {d2-d3}, [r0], r1
> + vmovl.u8 q0, d2
> + vmovl.u8 q1, d3
> + vmov q2, q9
> + vmov q3, q9
> + vmla.s16 q2, q0, q8
> + vmla.s16 q3, q1, q8
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_24x32_neon
> + add r3, r3
> + sub r3, #32
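> +// each 24-pixel output row is a post-incremented 32-byte store plus a
> +// 16-byte store, so the byte stride in r3 is trimmed by the 32 bytes
> +// already advanced by the post-increment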
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> +.rept 32
> + vld1.u8 {d18, d19, d20}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmla.s16 q2, q11, q0
> + vst1.16 {q2}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x8_neon
> + add r3, r3
> + sub r3, #32
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> +.rept 8
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x16_neon
> + add r3, r3
> + sub r3, #32
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #8
> +.loop_filterP2S_32x16:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_32x16
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x24_neon
> + add r3, r3
> + sub r3, #32
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #12
> +.loop_filterP2S_32x24:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_32x24
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x32_neon
> + add r3, r3
> + sub r3, #32
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #16
> +.loop_filterP2S_32x32:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_32x32
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_32x64_neon
> + add r3, r3
> + sub r3, #32
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #32
> +.loop_filterP2S_32x64:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_32x64
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x16_neon
> + add r3, r3
> + sub r1, #32
> + sub r3, #96
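> +// each 64-pixel row is read as two 32-byte loads (the first
> +// post-incremented) and written as four 32-byte stores (the first three
> +// post-incremented), so r1 and r3 are trimmed by 32 and 96 bytes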
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #8
> +.loop_filterP2S_64x16:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0]!
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2]!
> +
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_64x16
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x32_neon
> + add r3, r3
> + sub r1, #32
> + sub r3, #96
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #16
> +.loop_filterP2S_64x32:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0]!
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2]!
> +
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_64x32
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x48_neon
> + add r3, r3
> + sub r1, #32
> + sub r3, #96
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #24
> +.loop_filterP2S_64x48:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0]!
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2]!
> +
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_64x48
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_64x64_neon
> + add r3, r3
> + sub r1, #32
> + sub r3, #96
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #32
> +.loop_filterP2S_64x64:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0]!
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2]!
> +
> + vld1.u8 {q9-q10}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_64x64
> + bx lr
> +endfunc
> +
> +function x265_filterPixelToShort_48x64_neon
> + add r3, r3
> + sub r1, #32
> + sub r3, #64
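> +// each 48-pixel row is read as a post-incremented 32-byte load plus a
> +// 16-byte load, and written as three 32-byte stores (the first two
> +// post-incremented), hence r1 -= 32 and r3 -= 64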
> + vmov.u16 q0, #64
> + vmov.u16 q1, #8192
> + vneg.s16 q1, q1
> + mov r12, #32
> +.loop_filterP2S_48x64:
> + subs r12, #1
> +.rept 2
> + vld1.u8 {q9-q10}, [r0]!
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmovl.u8 q11, d20
> + vmovl.u8 q10, d21
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2]!
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q11, q0
> + vmla.s16 q3, q10, q0
> + vst1.16 {q2-q3}, [r2]!
> +
> + vld1.u8 {q9}, [r0], r1
> + vmovl.u8 q8, d18
> + vmovl.u8 q9, d19
> + vmov q2, q1
> + vmov q3, q1
> + vmla.s16 q2, q8, q0
> + vmla.s16 q3, q9, q0
> + vst1.16 {q2-q3}, [r2], r3
> +.endr
> + bgt .loop_filterP2S_48x64
> + bx lr
> +endfunc
> diff -r 79c00b9bc2b8 -r 61e51faf9e7e source/common/arm/ipfilter8.h
> --- a/source/common/arm/ipfilter8.h Tue Mar 01 12:18:18 2016 +0530
> +++ b/source/common/arm/ipfilter8.h Tue Mar 01 17:00:20 2016 +0530
> @@ -25,4 +25,30 @@
> #ifndef X265_IPFILTER8_ARM_H
> #define X265_IPFILTER8_ARM_H
>
> +void x265_filterPixelToShort_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +void x265_filterPixelToShort_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
> +
> #endif // ifndef X265_IPFILTER8_ARM_H
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc