<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre>No big problems in this patch,</pre><pre>but please remember that NEON is a coprocessor (CP) unit: the ARM core only buffers 10 instructions for the NEON pipeline, so heavily unrolled loops may stall the pipeline for some cycles.</pre><pre><br>At 2016-04-20 19:14:57,radhakrishnan@multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan@multicorewareinc.com>
># Date 1461046077 -19800
>#      Tue Apr 19 11:37:57 2016 +0530
># Node ID 0d38844bf4b3632444fc0249a549a1e0e3e2bfc8
># Parent  534b8e2845b8156010b3c79bfa88c81c7b0b9295
>arm: Implement blockcopy_pp chroma ARM NEON
>
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp     Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp     Tue Apr 19 11:37:57 2016 +0530
>@@ -462,6 +462,56 @@
>         p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
>         p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);

>+        // chroma blockcopy
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp   = PFX(blockcopy_pp_2x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp   = PFX(blockcopy_pp_2x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp   = PFX(blockcopy_pp_4x2_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp   = PFX(blockcopy_pp_6x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp   = PFX(blockcopy_pp_8x2_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp   = PFX(blockcopy_pp_8x6_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp  = PFX(blockcopy_pp_16x4_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp  = PFX(blockcopy_pp_32x8_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp  = PFX(blockcopy_pp_2x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp  = PFX(blockcopy_pp_4x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp  = PFX(blockcopy_pp_6x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp  = PFX(blockcopy_pp_8x12_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp  = PFX(blockcopy_pp_8x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
>+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
>+
>         // sad
>         p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
>         p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/blockcopy8.h
>--- a/source/common/arm/blockcopy8.h   Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/blockcopy8.h   Tue Apr 19 11:37:57 2016 +0530
>@@ -51,6 +51,21 @@
> void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);

> void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/mc-a.S
>--- a/source/common/arm/mc-a.S Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/mc-a.S Tue Apr 19 11:37:57 2016 +0530
>@@ -61,6 +61,8 @@
> blockcopy_pp_4xN_neon 4
> blockcopy_pp_4xN_neon 8
> blockcopy_pp_4xN_neon 16
>+blockcopy_pp_4xN_neon 2
>+blockcopy_pp_4xN_neon 32

> .macro blockcopy_pp_16xN_neon h
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -75,6 +77,7 @@
> blockcopy_pp_16xN_neon 4
> blockcopy_pp_16xN_neon 8
> blockcopy_pp_16xN_neon 12
>+blockcopy_pp_16xN_neon 24

> .macro blockcopy_pp_16xN1_neon h i
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -109,6 +112,9 @@
> blockcopy_pp_8xN_neon 8
> blockcopy_pp_8xN_neon 16
> blockcopy_pp_8xN_neon 32
>+blockcopy_pp_8xN_neon 2
>+blockcopy_pp_8xN_neon 6
>+blockcopy_pp_8xN_neon 12

> function x265_blockcopy_pp_12x16_neon
>     push            {r4, r5}
>@@ -167,6 +173,7 @@
> blockcopy_pp_32xN_neon 24 3
> blockcopy_pp_32xN_neon 32 4
> blockcopy_pp_32xN_neon 64 8
>+blockcopy_pp_32xN_neon 48 6

> function x265_blockcopy_pp_48x64_neon
>     push            {r4, r5}
>@@ -213,6 +220,81 @@
> blockcopy_pp_64xN_neon 48 12
> blockcopy_pp_64xN_neon 64 16

>+.macro blockcopy_pp_2xN_neon h
>+function x265_blockcopy_pp_2x\h\()_neon
>+.rept \h
>+    ldrh            r12, [r2], r3
>+    strh            r12, [r0], r1
>+.endr
>+    bx              lr
>+endfunc
>+.endm
>+
>+blockcopy_pp_2xN_neon 4
>+blockcopy_pp_2xN_neon 8
>+blockcopy_pp_2xN_neon 16
>+
>+.macro blockcopy_pp_6xN_neon h i
>+function x265_blockcopy_pp_6x\h\()_neon
>+    sub             r1, #4
>+.rept \i
>+    vld1.8          {d0}, [r2], r3
>+    vld1.8          {d1}, [r2], r3
>+    vst1.32         {d0[0]}, [r0]!
>+    vst1.16         {d0[2]}, [r0], r1
>+    vst1.32         {d1[0]}, [r0]!
>+    vst1.16         {d1[2]}, [r0], r1
>+.endr
>+    bx              lr
>+endfunc
>+.endm
>+blockcopy_pp_6xN_neon 8 4
>+blockcopy_pp_6xN_neon 16 8
>+
>+function x265_blockcopy_pp_8x64_neon
>+    mov             r12, #4
>+loop_pp_8x64:
>+    subs            r12, #1
>+.rept 16
>+    vld1.8          {d0}, [r2], r3
>+    vst1.8          {d0}, [r0], r1
>+.endr
>+    bne             loop_pp_8x64
>+    bx              lr
>+endfunc
>+
>+function x265_blockcopy_pp_12x32_neon
>+    push            {r4}
>+    sub             r3, #8
>+    sub             r1, #8
>+    mov             r12, #4
>+loop_pp_12x32:
>+    subs            r12, #1
>+.rept 8
>+    vld1.8          {d0}, [r2]!
>+    ldr             r4, [r2], r3
>+    vst1.8          {d0}, [r0]!
>+    str             r4, [r0], r1
>+.endr
>+    bne             loop_pp_12x32
>+    pop            {r4}
>+    bx              lr
>+endfunc
>+
>+function x265_blockcopy_pp_24x64_neon
>+    push            {r4}
>+    mov             r4, #4
>+loop_24x64:
>+.rept 16
>+    vld1.8          {d0, d1, d2}, [r2], r3
>+    vst1.8          {d0, d1, d2}, [r0], r1
>+.endr
>+    subs            r4, r4, #1
>+    bne             loop_24x64
>+    pop             {r4}
>+    bx              lr
>+endfunc
>+
> // void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
> .macro pixel_avg_pp_4xN_neon h
> function x265_pixel_avg_pp_4x\h\()_neon
>_______________________________________________
>x265-devel mailing list
>x265-devel@videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
</pre></div>