[x265] [PATCH] arm: Implement interp_8tap_horiz_ps ARM NEON

radhakrishnan at multicorewareinc.com radhakrishnan at multicorewareinc.com
Tue Mar 29 09:15:20 CEST 2016


# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1459231997 -19800
#      Tue Mar 29 11:43:17 2016 +0530
# Node ID 14ffbe7738e5bfbe9a0f19328f00f1d8821922f8
# Parent  960ecc63c686c40e5ed3302ff238a8fdd51cc854
arm: Implement interp_8tap_horiz_ps ARM NEON

diff -r 960ecc63c686 -r 14ffbe7738e5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Tue Mar 15 14:33:08 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Tue Mar 29 11:43:17 2016 +0530
@@ -70,6 +70,33 @@
         p.pu[LUMA_64x48].luma_hpp = PFX(interp_horiz_pp_64x48_neon);
         p.pu[LUMA_64x64].luma_hpp = PFX(interp_horiz_pp_64x64_neon);
 
+        // luma_hps
+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_horiz_ps_4x4_neon);
+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_horiz_ps_4x8_neon);
+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_horiz_ps_4x16_neon);
+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_horiz_ps_8x4_neon);
+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_horiz_ps_8x8_neon);
+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_horiz_ps_8x16_neon);
+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_horiz_ps_8x32_neon);
+        p.pu[LUMA_12x16].luma_hps = PFX(interp_horiz_ps_12x16_neon);
+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_horiz_ps_16x4_neon);
+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_horiz_ps_16x8_neon);
+        p.pu[LUMA_16x12].luma_hps = PFX(interp_horiz_ps_16x12_neon);
+        p.pu[LUMA_16x16].luma_hps = PFX(interp_horiz_ps_16x16_neon);
+        p.pu[LUMA_16x32].luma_hps = PFX(interp_horiz_ps_16x32_neon);
+        p.pu[LUMA_16x64].luma_hps = PFX(interp_horiz_ps_16x64_neon);
+        p.pu[LUMA_24x32].luma_hps = PFX(interp_horiz_ps_24x32_neon);
+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_horiz_ps_32x8_neon);
+        p.pu[LUMA_32x16].luma_hps = PFX(interp_horiz_ps_32x16_neon);
+        p.pu[LUMA_32x24].luma_hps = PFX(interp_horiz_ps_32x24_neon);
+        p.pu[LUMA_32x32].luma_hps = PFX(interp_horiz_ps_32x32_neon);
+        p.pu[LUMA_32x64].luma_hps = PFX(interp_horiz_ps_32x64_neon);
+        p.pu[LUMA_48x64].luma_hps = PFX(interp_horiz_ps_48x64_neon);
+        p.pu[LUMA_64x16].luma_hps = PFX(interp_horiz_ps_64x16_neon);
+        p.pu[LUMA_64x32].luma_hps = PFX(interp_horiz_ps_64x32_neon);
+        p.pu[LUMA_64x48].luma_hps = PFX(interp_horiz_ps_64x48_neon);
+        p.pu[LUMA_64x64].luma_hps = PFX(interp_horiz_ps_64x64_neon);
+
         // count nonzero
         p.cu[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);
         p.cu[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);
diff -r 960ecc63c686 -r 14ffbe7738e5 source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S	Tue Mar 15 14:33:08 2016 +0530
+++ b/source/common/arm/ipfilter8.S	Tue Mar 29 11:43:17 2016 +0530
@@ -2636,3 +2636,235 @@
 LUMA_HPP    64 32
 LUMA_HPP    64 48
 LUMA_HPP    64 64
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps                 // Row loops for an a x b block using filter macro filterhps; stores bias-removed int16 results through r2
+    mov             r12, #8192                  // bias subtracted from every result below (presumably IF_INTERNAL_OFFS for 8-bit pixels - confirm)
+    mov             r6, r10                     // r6 = rows left to process (r10 is prepared by LUMA_HPS)
+    sub             r3, #\a
+    lsl             r3, #1                      // r3 = (dstStride - width) * 2: bytes from the end of one written row to the next row start
+
+    mov             r8, #\a                     // width is known at assembly time but is tested at run time, so all three variants are emitted
+    cmp             r8, #4
+    beq             14f                         // width 4 -> HPS_FILTER_4
+    cmp             r8, #12
+    beq             15f                         // width 12 -> HPS_FILTER_12
+    b               7f                          // every other width -> loops below, 8 outputs per step
+14:
+    HPS_FILTER_4 \a \b \filterhps
+    b               10f
+15:
+    HPS_FILTER_12 \a \b \filterhps
+    b               10f
+7:
+    cmp             r9, #0                      // r9 = isRowExt (loaded by LUMA_HPS)
+    beq             8f
+    cmp             r9, #1
+    beq             9f                          // values other than 0 and 1 fall through to label 8
+8:
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:    // per-row loop
+    mov             r7, #\a
+    lsr             r7, #3                      // r7 = width / 8 = groups of 8 outputs per row
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes; presumably the read pointer used by vextin8 - confirm
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:    // per-group loop
+    vextin8                                     // macro defined outside this hunk; presumably loads source bytes and builds the tap inputs - confirm
+    \filterhps                                  // selected filter macro; the code below consumes its 32-bit results from q9 and q10
+    vdup.32         q8, r12                     // q8 = 8192 in all four lanes
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8                     // remove the bias from both result vectors
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10                     // narrow to 16 bits: q0 now holds 8 outputs
+    vst1.s16        {q0}, [r2]!                 // store 8 int16 values, r2 += 16
+    subs            r7, #1                      // sets the flags tested by the bne two lines down
+    sub             r5, #8                      // no S suffix, flags untouched; presumably nets +8 after a 16-byte advance in vextin8 - confirm
+    bne             loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+    subs            r6, #1                      // one row finished; flags tested by the bne below
+    add             r0, r1                      // src += srcStride
+    add             r2, r3                      // dst: skip the unwritten remainder of the row
+    bne             loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+    b               10f
+9:
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:    // same instruction sequence as the rowext0 loops; the extra rows are already counted in r6
+    mov             r7, #\a
+    lsr             r7, #3                      // r7 = width / 8
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:    // label differs from the loop4 label in HPS_FILTER_4 only by the rowext suffix
+    vextin8
+    \filterhps
+    vdup.32         q8, r12                     // q8 = 8192 in all four lanes
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8                     // remove the bias
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10                     // narrow to 8 x int16 in q0
+    vst1.s16        {q0}, [r2]!                 // store 8 int16 values, r2 += 16
+    subs            r7, #1                      // flags for the bne two lines down
+    sub             r5, #8
+    bne             loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+    subs            r6, #1                      // flags for the bne below
+    add             r0, r1                      // src += srcStride
+    add             r2, r3                      // dst: skip the unwritten remainder of the row
+    bne             loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:                                             // common exit of this expansion
+.endm
+
+.macro HPS_FILTER_4 w h filterhps               // Width-4 row loops: 4 outputs per row; uses r12 (bias), r6 (rows), r3 (row skip) as set up in HPS_FILTER
+    cmp             r9, #0                      // r9 = isRowExt
+    beq             11f
+    cmp             r9, #1
+    beq             12f                         // values other than 0 and 1 fall through to label 11
+11:
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:    // per-row loop
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes; presumably the read pointer used by vextin8 - confirm
+    vextin8                                     // macro defined outside this hunk
+    \filterhps                                  // only the q9 result is consumed here
+    vdup.32         q8, r12                     // q8 = bias value from r12 in all four lanes
+    vsub.s32        q9, q8                      // remove the bias
+    vmovn.u32       d0, q9                      // narrow to 4 x int16
+    vst1.s16        {d0}, [r2]!                 // store 4 int16 values, r2 += 8
+    sub             r5, #8                      // r5 is recomputed from r0 at the top of every iteration, so this is not observed in the loop
+    subs            r6, #1                      // sets the flags tested by bne; the two adds below leave them alone
+    add             r0, r1                      // src += srcStride
+    add             r2, r3                      // dst: advance by the row skip held in r3
+    bne             loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               13f
+12:
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:    // same instruction sequence as the rowext0 loop above
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes
+    vextin8
+    \filterhps
+    vdup.32         q8, r12                     // q8 = bias value from r12 in all four lanes
+    vsub.s32        q9, q8                      // remove the bias
+    vmovn.u32       d0, q9                      // narrow to 4 x int16
+    vst1.s16        {d0}, [r2]!                 // store 4 int16 values, r2 += 8
+    sub             r5, #8                      // not observed in the loop: r5 is reloaded at the loop top
+    subs            r6, #1                      // flags for the bne below
+    add             r0, r1                      // src += srcStride
+    add             r2, r3                      // dst: advance by the row skip held in r3
+    bne             loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+13:                                             // exit of this expansion
+.endm
+
+.macro HPS_FILTER_12 w h filterhps              // Width-12 row loops: 8 + 4 outputs per row; uses r12 (bias), r6 (rows), r3 (row skip) as set up in HPS_FILTER
+    cmp             r9, #0                      // r9 = isRowExt
+    beq             14f                         // numeric labels 14 and 15 also exist in HPS_FILTER; f-references bind to the nearest following one
+    cmp             r9, #1
+    beq             15f                         // values other than 0 and 1 fall through to label 14
+14:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:   // per-row loop
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes; presumably the read pointer used by vextin8 - confirm
+    vextin8                                     // macro defined outside this hunk
+    \filterhps                                  // first pass: results consumed from q9 and q10
+    vdup.32         q8, r12                     // q8 = bias value from r12 in all four lanes
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8                     // remove the bias
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10                     // narrow to 8 x int16 in q0
+    vst1.s16        {q0}, [r2]!                 // store outputs 0..7, r2 += 16
+    sub             r5, #8                      // presumably nets +8 after a 16-byte advance in vextin8 - confirm
+
+    vextin8
+    \filterhps                                  // second pass: only the q9 result is consumed
+    vdup.32         q8, r12                     // q8 = bias value again
+    vsub.s32        q9, q8                      // remove the bias
+    vmovn.u32       d0, q9                      // narrow to 4 x int16
+    vst1.s16        {d0}, [r2]!                 // store outputs 8..11, r2 += 8 (24 bytes per row in total)
+    add             r2, r3                      // dst: advance by the row skip held in r3
+    subs            r6, #1                      // sets the flags tested by bne; the add below leaves them alone
+    add             r0, r1                      // src += srcStride
+    bne             loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
+    b               16f
+15:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:   // same instruction sequence as the rowext0 loop above
+    mov             r5, r0
+    sub             r5, #4                      // r5 = row start - 4 bytes
+    vextin8
+    \filterhps
+    vdup.32         q8, r12                     // q8 = bias value from r12 in all four lanes
+    vsub.s32        q9, q8
+    vsub.s32        q10, q8                     // remove the bias
+    vmovn.u32       d0, q9
+    vmovn.u32       d1, q10                     // narrow to 8 x int16 in q0
+    vst1.s16        {q0}, [r2]!                 // store outputs 0..7, r2 += 16
+    sub             r5, #8
+
+    vextin8
+    \filterhps
+    vdup.32         q8, r12                     // q8 = bias value again
+    vsub.s32        q9, q8                      // remove the bias
+    vmovn.u32       d0, q9                      // narrow to 4 x int16
+    vst1.s16        {d0}, [r2]!                 // store outputs 8..11, r2 += 8
+    add             r2, r3                      // dst: advance by the row skip held in r3
+    subs            r6, #1                      // flags for the bne below
+    add             r0, r1                      // src += srcStride
+    bne             loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
+16:                                             // exit of this expansion
+.endm
+
+.macro LUMA_HPS w h                             // Emits function x265_interp_horiz_ps_<w>x<h>_neon
+function x265_interp_horiz_ps_\w\()x\h\()_neon  // r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; coeffIdx and isRowExt arrive on the stack
+    push            {r4, r5, r6, r7, r8, r9, r10}   // 7 registers = 28 bytes, so the stack arguments now start at sp + 28
+    ldr             r4, [sp, #28]               // r4 = coeffIdx (5th argument)
+    ldr             r9, [sp, #32]               // r9 = isRowExt (6th argument)
+    mov             r10, #\h                    // r10 = number of rows to filter
+    cmp             r9, #0
+    beq             6f                          // isRowExt == 0: keep src and the row count as they are
+    sub             r0, r0, r1, lsl #2
+    add             r0, r1                      // net effect: src -= 3 * srcStride
+    add             r10, #7                     // filter 7 additional rows
+6:
+    cmp             r4, #0                      // pick the filter expansion by coeffIdx
+    beq             0f
+    cmp             r4, #1
+    beq             1f
+    cmp             r4, #2
+    beq             2f
+    cmp             r4, #3
+    beq             3f                          // any other coeffIdx falls through into label 0
+0:
+    HPS_FILTER  \w \h qpel_filter_0_32b
+    b               5f
+1:
+    HPS_FILTER  \w \h qpel_filter_1_32b
+    b               5f
+2:
+    HPS_FILTER  \w \h qpel_filter_2_32b
+    b               5f
+3:
+    HPS_FILTER  \w \h qpel_filter_3_32b
+    b               5f                          // redundant: label 5 is the next instruction
+5:
+    pop             {r4, r5, r6, r7, r8, r9, r10}   // restore the registers saved on entry
+    bx              lr
+endfunc
+.endm
+
+LUMA_HPS    4 4                                 // one function per luma block size; operands are width then height
+LUMA_HPS    4 8
+LUMA_HPS    4 16
+LUMA_HPS    8 4
+LUMA_HPS    8 8
+LUMA_HPS    8 16
+LUMA_HPS    8 32
+LUMA_HPS    12 16
+LUMA_HPS    16 4
+LUMA_HPS    16 8
+LUMA_HPS    16 12
+LUMA_HPS    16 16
+LUMA_HPS    16 32
+LUMA_HPS    16 64
+LUMA_HPS    24 32
+LUMA_HPS    32 8
+LUMA_HPS    32 16
+LUMA_HPS    32 24
+LUMA_HPS    32 32
+LUMA_HPS    32 64
+LUMA_HPS    48 64
+LUMA_HPS    64 16
+LUMA_HPS    64 32
+LUMA_HPS    64 48
+LUMA_HPS    64 64
diff -r 960ecc63c686 -r 14ffbe7738e5 source/common/arm/ipfilter8.h
--- a/source/common/arm/ipfilter8.h	Tue Mar 15 14:33:08 2016 +0530
+++ b/source/common/arm/ipfilter8.h	Tue Mar 29 11:43:17 2016 +0530
@@ -241,4 +241,30 @@
 void x265_interp_horiz_pp_64x32_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_horiz_pp_64x48_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_horiz_pp_64x64_neon(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+
+void x265_interp_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
 #endif // ifndef X265_IPFILTER8_ARM_H


More information about the x265-devel mailing list