[x265] [PATCH] arm: Implement blockcopy_pp chroma ARM NEON

chen chenm003 at 163.com
Wed Apr 20 18:08:15 CEST 2016


No big problem in this patch, but please remember: NEON is a CP (coprocessor) unit, and the ARM core buffers only 10 instructions for the NEON pipeline, so heavy loop unrolling may stall the pipeline for some cycles.
At 2016-04-20 19:14:57,radhakrishnan at multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
># Date 1461046077 -19800
>#      Tue Apr 19 11:37:57 2016 +0530
># Node ID 0d38844bf4b3632444fc0249a549a1e0e3e2bfc8
># Parent  534b8e2845b8156010b3c79bfa88c81c7b0b9295
>arm: Implement blockcopy_pp chroma ARM NEON
>
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp	Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp	Tue Apr 19 11:37:57 2016 +0530
>@@ -462,6 +462,56 @@
>         p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
>         p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
> 
>+        // chroma blockcopy
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp   = PFX(blockcopy_pp_2x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp   = PFX(blockcopy_pp_2x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp   = PFX(blockcopy_pp_4x2_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp   = PFX(blockcopy_pp_6x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp   = PFX(blockcopy_pp_8x2_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp   = PFX(blockcopy_pp_8x6_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp  = PFX(blockcopy_pp_16x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp  = PFX(blockcopy_pp_32x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp  = PFX(blockcopy_pp_2x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp  = PFX(blockcopy_pp_4x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp  = PFX(blockcopy_pp_6x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp  = PFX(blockcopy_pp_8x12_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp  = PFX(blockcopy_pp_8x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
>+
>         // sad
>         p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
>         p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/blockcopy8.h
>--- a/source/common/arm/blockcopy8.h	Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/blockcopy8.h	Tue Apr 19 11:37:57 2016 +0530
>@@ -51,6 +51,21 @@
> void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> 
> void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/mc-a.S
>--- a/source/common/arm/mc-a.S	Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/mc-a.S	Tue Apr 19 11:37:57 2016 +0530
>@@ -61,6 +61,8 @@
> blockcopy_pp_4xN_neon 4
> blockcopy_pp_4xN_neon 8
> blockcopy_pp_4xN_neon 16
>+blockcopy_pp_4xN_neon 2
>+blockcopy_pp_4xN_neon 32
> 
> .macro blockcopy_pp_16xN_neon h
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -75,6 +77,7 @@
> blockcopy_pp_16xN_neon 4
> blockcopy_pp_16xN_neon 8
> blockcopy_pp_16xN_neon 12
>+blockcopy_pp_16xN_neon 24
> 
> .macro blockcopy_pp_16xN1_neon h i
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -109,6 +112,9 @@
> blockcopy_pp_8xN_neon 8
> blockcopy_pp_8xN_neon 16
> blockcopy_pp_8xN_neon 32
>+blockcopy_pp_8xN_neon 2
>+blockcopy_pp_8xN_neon 6
>+blockcopy_pp_8xN_neon 12
> 
> function x265_blockcopy_pp_12x16_neon
>     push            {r4, r5}
>@@ -167,6 +173,7 @@
> blockcopy_pp_32xN_neon 24 3
> blockcopy_pp_32xN_neon 32 4
> blockcopy_pp_32xN_neon 64 8
>+blockcopy_pp_32xN_neon 48 6
> 
> function x265_blockcopy_pp_48x64_neon
>     push            {r4, r5}
>@@ -213,6 +220,81 @@
> blockcopy_pp_64xN_neon 48 12
> blockcopy_pp_64xN_neon 64 16
> 
>+.macro blockcopy_pp_2xN_neon h
>+function x265_blockcopy_pp_2x\h\()_neon
>+.rept \h
>+    ldrh            r12, [r2], r3
>+    strh            r12, [r0], r1
>+.endr
>+    bx              lr
>+endfunc
>+.endm
>+
>+blockcopy_pp_2xN_neon 4
>+blockcopy_pp_2xN_neon 8
>+blockcopy_pp_2xN_neon 16
>+
>+.macro blockcopy_pp_6xN_neon h i
>+function x265_blockcopy_pp_6x\h\()_neon
>+    sub             r1, #4
>+.rept \i
>+    vld1.8          {d0}, [r2], r3
>+    vld1.8          {d1}, [r2], r3
>+    vst1.32         {d0[0]}, [r0]!
>+    vst1.16         {d0[2]}, [r0], r1
>+    vst1.32         {d1[0]}, [r0]!
>+    vst1.16         {d1[2]}, [r0], r1
>+.endr
>+    bx              lr
>+endfunc
>+.endm
>+blockcopy_pp_6xN_neon 8 4
>+blockcopy_pp_6xN_neon 16 8
>+
>+function x265_blockcopy_pp_8x64_neon
>+    mov             r12, #4
>+loop_pp_8x64:
>+    subs            r12, #1
>+.rept 16
>+    vld1.8          {d0}, [r2], r3
>+    vst1.8          {d0}, [r0], r1
>+.endr
>+    bne             loop_pp_8x64
>+    bx              lr
>+endfunc
>+
>+function x265_blockcopy_pp_12x32_neon
>+    push            {r4}
>+    sub             r3, #8
>+    sub             r1, #8
>+    mov             r12, #4
>+loop_pp_12x32:
>+    subs            r12, #1
>+.rept 8
>+    vld1.8          {d0}, [r2]!
>+    ldr             r4, [r2], r3
>+    vst1.8          {d0}, [r0]!
>+    str             r4, [r0], r1
>+.endr
>+    bne             loop_pp_12x32
>+    pop            {r4}
>+    bx              lr
>+endfunc
>+
>+function x265_blockcopy_pp_24x64_neon
>+    push            {r4}
>+    mov             r4, #4
>+loop_24x64:
>+.rept 16
>+    vld1.8          {d0, d1, d2}, [r2], r3
>+    vst1.8          {d0, d1, d2}, [r0], r1
>+.endr
>+    subs            r4, r4, #1
>+    bne             loop_24x64
>+    pop             {r4}
>+    bx              lr
>+endfunc
>+
> // void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
> .macro pixel_avg_pp_4xN_neon h
> function x265_pixel_avg_pp_4x\h\()_neon
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160421/f8c538c9/attachment-0001.html>


More information about the x265-devel mailing list