[x265] [PATCH] arm: Implement interp_8tap_horiz_pp ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Tue Mar 29 09:15:02 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1458032588 -19800
# Tue Mar 15 14:33:08 2016 +0530
# Node ID 960ecc63c686c40e5ed3302ff238a8fdd51cc854
# Parent 331fcf0d9d1e6f0a68284bf833c0ee6985cb0c33
arm: Implement interp_8tap_horiz_pp ARM NEON
diff -r 331fcf0d9d1e -r 960ecc63c686 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Mar 24 15:25:37 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 14:33:08 2016 +0530
@@ -43,6 +43,33 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // luma_hpp
+ p.pu[LUMA_4x4].luma_hpp = PFX(interp_horiz_pp_4x4_neon);
+ p.pu[LUMA_4x8].luma_hpp = PFX(interp_horiz_pp_4x8_neon);
+ p.pu[LUMA_4x16].luma_hpp = PFX(interp_horiz_pp_4x16_neon);
+ p.pu[LUMA_8x4].luma_hpp = PFX(interp_horiz_pp_8x4_neon);
+ p.pu[LUMA_8x8].luma_hpp = PFX(interp_horiz_pp_8x8_neon);
+ p.pu[LUMA_8x16].luma_hpp = PFX(interp_horiz_pp_8x16_neon);
+ p.pu[LUMA_8x32].luma_hpp = PFX(interp_horiz_pp_8x32_neon);
+ p.pu[LUMA_12x16].luma_hpp = PFX(interp_horiz_pp_12x16_neon);
+ p.pu[LUMA_16x4].luma_hpp = PFX(interp_horiz_pp_16x4_neon);
+ p.pu[LUMA_16x8].luma_hpp = PFX(interp_horiz_pp_16x8_neon);
+ p.pu[LUMA_16x12].luma_hpp = PFX(interp_horiz_pp_16x12_neon);
+ p.pu[LUMA_16x16].luma_hpp = PFX(interp_horiz_pp_16x16_neon);
+ p.pu[LUMA_16x32].luma_hpp = PFX(interp_horiz_pp_16x32_neon);
+ p.pu[LUMA_16x64].luma_hpp = PFX(interp_horiz_pp_16x64_neon);
+ p.pu[LUMA_24x32].luma_hpp = PFX(interp_horiz_pp_24x32_neon);
+ p.pu[LUMA_32x8].luma_hpp = PFX(interp_horiz_pp_32x8_neon);
+ p.pu[LUMA_32x16].luma_hpp = PFX(interp_horiz_pp_32x16_neon);
+ p.pu[LUMA_32x24].luma_hpp = PFX(interp_horiz_pp_32x24_neon);
+ p.pu[LUMA_32x32].luma_hpp = PFX(interp_horiz_pp_32x32_neon);
+ p.pu[LUMA_32x64].luma_hpp = PFX(interp_horiz_pp_32x64_neon);
+ p.pu[LUMA_48x64].luma_hpp = PFX(interp_horiz_pp_48x64_neon);
+ p.pu[LUMA_64x16].luma_hpp = PFX(interp_horiz_pp_64x16_neon);
+ p.pu[LUMA_64x32].luma_hpp = PFX(interp_horiz_pp_64x32_neon);
+ p.pu[LUMA_64x48].luma_hpp = PFX(interp_horiz_pp_64x48_neon);
+ p.pu[LUMA_64x64].luma_hpp = PFX(interp_horiz_pp_64x64_neon);
+
// count nonzero
p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
diff -r 331fcf0d9d1e -r 960ecc63c686 source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S Thu Mar 24 15:25:37 2016 +0530
+++ b/source/common/arm/ipfilter8.S Tue Mar 15 14:33:08 2016 +0530
@@ -2,6 +2,7 @@
* Copyright (C) 2016 x265 project
*
* Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
+ * Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -698,6 +699,7 @@
bgt .loop_filterP2S_48x64
bx lr
endfunc
+
//**************luma_vpp************
.macro LUMA_VPP_4xN h
function x265_interp_8tap_vert_pp_4x\h\()_neon
@@ -2474,3 +2476,163 @@
CHROMA_VSP 64 64
CHROMA_VSP 48 64
+ // void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
+.macro vextin8 // load 16 source bytes at r5 and spread them into eight byte-shifted 8-byte windows d0-d7
+ pld [r5] // prefetch hint for the bytes about to be loaded
+ vld1.8 {q3}, [r5]! // q3 (d6:d7) = 16 bytes at r5, then r5 += 16
+ vext.8 d0, d6, d7, #1 // d0 = loaded bytes 1..8
+ vext.8 d1, d6, d7, #2 // d1 = loaded bytes 2..9
+ vext.8 d2, d6, d7, #3 // d2 = loaded bytes 3..10
+ vext.8 d3, d6, d7, #4 // d3 = loaded bytes 4..11
+ vext.8 d4, d6, d7, #5 // d4 = loaded bytes 5..12
+ vext.8 d5, d6, d7, #6 // d5 = loaded bytes 6..13
+ vext.8 d6, d6, d7, #7 // d6 = loaded bytes 7..14; must stay last because d6 is a source of every vext above (d7 keeps bytes 8..15)
+.endm
+
+.macro HPP_FILTER a b filterhpp
+    // Filters an a x b block with the tap macro named by filterhpp.
+    // In: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride.
+    // Scratch: r5 = row read pointer, r6 = rows left, r7 = 8-pixel groups
+    // left in the row, r12 = rounding constant, d0-d7 and q8-q10.
+    // The width is known at assembly time, so only the matching loop is
+    // emitted instead of testing the width at run time.
+    mov             r12, #32            // rounding offset added before the shift by 6
+    mov             r6, #\b
+    sub             r3, #\a             // stores post-increment dst by the width; keep stride - width
+    // Widths 4 and 12 have dedicated row loops; every other instantiated
+    // width is a multiple of 8 and is handled in 8-pixel groups.
+.if \a == 4
+    HPP_FILTER_4    \a \b \filterhpp
+.elseif \a == 12
+    HPP_FILTER_12   \a \b \filterhpp
+.else
+    // One pass of loop2 per row, one pass of loop3 per 8 output pixels.
+loop2_hpp_\filterhpp\()_\a\()x\b:
+    mov             r7, #\a
+    lsr             r7, #3              // r7 = width / 8
+    sub             r5, r0, #4          // each 16-byte load starts 4 bytes before the output position
+loop3_hpp_\filterhpp\()_\a\()x\b:
+    vextin8                             // d0-d7 = shifted source windows, r5 += 16
+    \filterhpp                          // presumably leaves the 32-bit sums in q9 and q10 - confirm against the tap macros
+    vdup.32         q8, r12             // kept in the loop: the tap macro may use q8 as scratch - confirm before hoisting
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0              // saturate the eight 16-bit results to bytes
+    vst1.u8         {d0}, [r2]!
+    subs            r7, #1
+    sub             r5, #8              // net advance of 8 source bytes per group; does not touch the flags
+    bne             loop3_hpp_\filterhpp\()_\a\()x\b
+    subs            r6, #1
+    add             r0, r1
+    add             r2, r3
+    bne             loop2_hpp_\filterhpp\()_\a\()x\b
+.endif
+.endm
+
+.macro HPP_FILTER_4 w h filterhpp
+    // Row loop for 4-pixel-wide blocks. Expects r6 = row count, r12 = rounding
+    // constant and r3 = dstStride - 4, as prepared by HPP_FILTER.
+loop4_hpp_\filterhpp\()_\w\()x\h:
+    sub             r5, r0, #4          // recomputed every row, so no rewind of r5 is needed after the load
+    vextin8                             // d0-d7 = shifted source windows
+    \filterhpp                          // presumably leaves the 32-bit sums in q9 and q10 - confirm against the tap macros
+    vdup.32         q8, r12
+    vadd.s32        q9, q8
+    vqshrun.s32     d0, q9, #6
+    vadd.s32        q10, q8
+    vqshrun.s32     d1, q10, #6
+    vqmovn.u16      d0, q0              // saturate to bytes; eight results are computed
+    vst1.u32        {d0[0]}, [r2]!      // only the first four pixels are stored, r2 += 4
+    subs            r6, #1
+    add             r0, r1              // next source row
+    add             r2, r3              // next destination row
+    bne             loop4_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro HPP_FILTER_12 w h filterhpp // row loop for 12-pixel-wide blocks: one 8-pixel store plus one 4-pixel store per row
+loop12_hpp_\filterhpp\()_\w\()x\h:
+ mov r5, r0
+ sub r5, #4 // r5 = src - 4, start of the first 16-byte load
+ vextin8
+ \filterhpp
+ vdup.32 q8, r12 // broadcast the rounding constant held in r12
+ vadd.s32 q9, q8
+ vqshrun.s32 d0, q9, #6
+ vadd.s32 q10, q8
+ vqshrun.s32 d1, q10, #6
+ vqmovn.u16 d0, q0 // saturate the eight 16-bit results to bytes
+ vst1.u8 {d0}, [r2]! // store pixels 0-7, r2 += 8
+ sub r5, #8 // vextin8 advanced r5 by 16; step back so the second load starts 8 bytes after the first
+
+ vextin8
+ \filterhpp
+ vdup.32 q8, r12
+ vadd.s32 q9, q8
+ vqshrun.s32 d0, q9, #6
+ vadd.s32 q10, q8
+ vqshrun.s32 d1, q10, #6
+ vqmovn.u16 d0, q0
+ vst1.u32 {d0[0]}, [r2]! // store pixels 8-11 only, r2 += 4
+ add r2, r3 // r2 has already advanced 12 bytes; r3 supplies the rest of the row step
+ subs r6, #1 // r6 counts the remaining rows
+ add r0, r1 // next source row; does not touch the flags tested by bne
+ bne loop12_hpp_\filterhpp\()_\w\()x\h
+.endm
+
+.macro LUMA_HPP w h
+function x265_interp_horiz_pp_\w\()x\h\()_neon
+    push            {r4, r5, r6, r7, r8}
+    ldr             r4, [sp, #20]       // coeffIdx: fifth argument, on the stack above the five words just pushed
+    cmp             r4, #0
+    beq             0f
+    cmp             r4, #1
+    beq             1f
+    cmp             r4, #2
+    beq             2f
+    cmp             r4, #3
+    beq             3f                  // any other index falls through to filter 0
+0:
+    HPP_FILTER      \w \h qpel_filter_0_32b
+    b               9f                  // 9 is used by no label inside HPP_FILTER, so this goes straight to the epilogue
+1:
+    HPP_FILTER      \w \h qpel_filter_1_32b
+    b               9f
+2:
+    HPP_FILTER      \w \h qpel_filter_2_32b
+    b               9f
+3:
+    HPP_FILTER      \w \h qpel_filter_3_32b
+    // filter 3 falls through to the shared epilogue
+9:
+    pop             {r4, r5, r6, r7, r8}
+    bx              lr
+endfunc
+.endm
+
+LUMA_HPP 4 4 // each line below emits one x265_interp_horiz_pp_<width>x<height>_neon function
+LUMA_HPP 4 8
+LUMA_HPP 4 16
+LUMA_HPP 8 4
+LUMA_HPP 8 8
+LUMA_HPP 8 16
+LUMA_HPP 8 32
+LUMA_HPP 12 16
+LUMA_HPP 16 4
+LUMA_HPP 16 8
+LUMA_HPP 16 12
+LUMA_HPP 16 16
+LUMA_HPP 16 32
+LUMA_HPP 16 64
+LUMA_HPP 24 32
+LUMA_HPP 32 8
+LUMA_HPP 32 16
+LUMA_HPP 32 24
+LUMA_HPP 32 32
+LUMA_HPP 32 64
+LUMA_HPP 48 64
+LUMA_HPP 64 16
+LUMA_HPP 64 32
+LUMA_HPP 64 48
+LUMA_HPP 64 64
diff -r 331fcf0d9d1e -r 960ecc63c686 source/common/arm/ipfilter8.h
--- a/source/common/arm/ipfilter8.h Thu Mar 24 15:25:37 2016 +0530
+++ b/source/common/arm/ipfilter8.h Tue Mar 15 14:33:08 2016 +0530
@@ -216,4 +216,29 @@
void x265_interp_4tap_vert_sp_64x64_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_sp_64x48_neon(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_4x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_4x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_4x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_8x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_12x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x4_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x12_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_16x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_24x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x8_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x24_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_32x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_48x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x16_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_horiz_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
#endif // ifndef X265_IPFILTER8_ARM_H
More information about the x265-devel
mailing list