[x265] [PATCH] arm: Implement blockcopy_pp chroma ARM NEON

radhakrishnan at multicorewareinc.com radhakrishnan at multicorewareinc.com
Wed Apr 20 13:14:57 CEST 2016


# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1461046077 -19800
#      Tue Apr 19 11:37:57 2016 +0530
# Node ID 0d38844bf4b3632444fc0249a549a1e0e3e2bfc8
# Parent  534b8e2845b8156010b3c79bfa88c81c7b0b9295
arm: Implement blockcopy_pp chroma ARM NEON

diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Tue Apr 19 11:37:57 2016 +0530
@@ -462,6 +462,56 @@
         p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
         p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
 
+        // chroma blockcopy
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp   = PFX(blockcopy_pp_2x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp   = PFX(blockcopy_pp_2x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp   = PFX(blockcopy_pp_4x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp   = PFX(blockcopy_pp_6x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp   = PFX(blockcopy_pp_8x2_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp   = PFX(blockcopy_pp_8x6_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp  = PFX(blockcopy_pp_16x4_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp  = PFX(blockcopy_pp_32x8_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp  = PFX(blockcopy_pp_2x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp   = PFX(blockcopy_pp_4x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp   = PFX(blockcopy_pp_4x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp  = PFX(blockcopy_pp_4x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp  = PFX(blockcopy_pp_4x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp  = PFX(blockcopy_pp_6x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp   = PFX(blockcopy_pp_8x4_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp   = PFX(blockcopy_pp_8x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp  = PFX(blockcopy_pp_8x12_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp  = PFX(blockcopy_pp_8x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp  = PFX(blockcopy_pp_8x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp  = PFX(blockcopy_pp_8x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp  = PFX(blockcopy_pp_16x8_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
+
         // sad
         p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
         p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h	Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/blockcopy8.h	Tue Apr 19 11:37:57 2016 +0530
@@ -51,6 +51,21 @@
 void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 
 void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/mc-a.S
--- a/source/common/arm/mc-a.S	Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/mc-a.S	Tue Apr 19 11:37:57 2016 +0530
@@ -61,6 +61,8 @@
 blockcopy_pp_4xN_neon 4
 blockcopy_pp_4xN_neon 8
 blockcopy_pp_4xN_neon 16
+blockcopy_pp_4xN_neon 2
+blockcopy_pp_4xN_neon 32
 
 .macro blockcopy_pp_16xN_neon h
 function x265_blockcopy_pp_16x\h\()_neon
@@ -75,6 +77,7 @@
 blockcopy_pp_16xN_neon 4
 blockcopy_pp_16xN_neon 8
 blockcopy_pp_16xN_neon 12
+blockcopy_pp_16xN_neon 24
 
 .macro blockcopy_pp_16xN1_neon h i
 function x265_blockcopy_pp_16x\h\()_neon
@@ -109,6 +112,9 @@
 blockcopy_pp_8xN_neon 8
 blockcopy_pp_8xN_neon 16
 blockcopy_pp_8xN_neon 32
+blockcopy_pp_8xN_neon 2
+blockcopy_pp_8xN_neon 6
+blockcopy_pp_8xN_neon 12
 
 function x265_blockcopy_pp_12x16_neon
     push            {r4, r5}
@@ -167,6 +173,7 @@
 blockcopy_pp_32xN_neon 24 3
 blockcopy_pp_32xN_neon 32 4
 blockcopy_pp_32xN_neon 64 8
+blockcopy_pp_32xN_neon 48 6
 
 function x265_blockcopy_pp_48x64_neon
     push            {r4, r5}
@@ -213,6 +220,81 @@
 blockcopy_pp_64xN_neon 48 12
 blockcopy_pp_64xN_neon 64 16
 
+.macro blockcopy_pp_2xN_neon h                      // emits x265_blockcopy_pp_2x<h>_neon: copies h rows of 2 bytes each
+function x265_blockcopy_pp_2x\h\()_neon             // r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride (per the blockcopy_pp prototype)
+.rept \h                                            // fully unrolled: one load/store pair per row
+    ldrh            r12, [r2], r3                   // r12 = the 2 bytes of this row; src += srcStride
+    strh            r12, [r0], r1                   // write them out; dst += dstStride
+.endr
+    bx              lr
+endfunc
+.endm
+
+blockcopy_pp_2xN_neon 4
+blockcopy_pp_2xN_neon 8
+blockcopy_pp_2xN_neon 16
+
+.macro blockcopy_pp_6xN_neon h i                    // emits x265_blockcopy_pp_6x<h>_neon; i = number of row pairs (h/2 in the uses below)
+function x265_blockcopy_pp_6x\h\()_neon             // r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride (per the blockcopy_pp prototype)
+    sub             r1, #4                          // the 4-byte store post-increments dst by 4, so r1 only covers the rest of the stride
+.rept \i
+    vld1.8          {d0}, [r2], r3                  // first row of the pair: loads 8 bytes, only 6 are stored; src += srcStride
+    vld1.8          {d1}, [r2], r3                  // second row of the pair, same pattern
+    vst1.32         {d0[0]}, [r0]!                  // first row, bytes 0-3; dst += 4
+    vst1.16         {d0[2]}, [r0], r1               // first row, bytes 4-5; dst += dstStride - 4
+    vst1.32         {d1[0]}, [r0]!                  // second row, bytes 0-3; dst += 4
+    vst1.16         {d1[2]}, [r0], r1               // second row, bytes 4-5; dst += dstStride - 4
+.endr
+    bx              lr
+endfunc
+.endm
+blockcopy_pp_6xN_neon 8 4
+blockcopy_pp_6xN_neon 16 8
+
+function x265_blockcopy_pp_8x64_neon                // copies 64 rows of 8 bytes; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    mov             r12, #4                         // r12 = passes remaining (4 passes x 16 unrolled rows = 64 rows)
+loop_pp_8x64:
+    subs            r12, #1                         // decrement up front; the NEON loads/stores below leave the flags intact for bne
+.rept 16
+    vld1.8          {d0}, [r2], r3                  // load one 8-byte row; src += srcStride
+    vst1.8          {d0}, [r0], r1                  // store it; dst += dstStride
+.endr
+    bne             loop_pp_8x64
+    bx              lr
+endfunc
+
+function x265_blockcopy_pp_12x32_neon               // copies 32 rows of 12 bytes; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
+    push            {r4}                            // r4 carries row data below, so the incoming value is preserved on the stack
+    sub             r3, #8                          // the 8-byte load post-increments src by 8; r3 now covers the rest of the stride
+    sub             r1, #8                          // same adjustment for dst after the 8-byte store
+    mov             r12, #4                         // r12 = passes remaining (4 passes x 8 unrolled rows = 32 rows)
+loop_pp_12x32:
+    subs            r12, #1                         // flags set here are consumed by the bne after the unrolled rows
+.rept 8
+    vld1.8          {d0}, [r2]!                     // bytes 0-7 of the row; src += 8
+    ldr             r4, [r2], r3                    // bytes 8-11 of the row; src += srcStride - 8
+    vst1.8          {d0}, [r0]!                     // bytes 0-7 out; dst += 8
+    str             r4, [r0], r1                    // bytes 8-11 out; dst += dstStride - 8
+.endr
+    bne             loop_pp_12x32
+    pop            {r4}
+    bx              lr
+endfunc
+
+function x265_blockcopy_pp_24x64_neon
+    // Copies 64 rows of 24 bytes. r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
+    mov             r12, #4                         // r12 = passes remaining (4 passes x 16 unrolled rows = 64 rows)
+loop_24x64:
+.rept 16
+    vld1.8          {d0, d1, d2}, [r2], r3          // load one 24-byte row; src += srcStride
+    vst1.8          {d0, d1, d2}, [r0], r1          // store it; dst += dstStride
+.endr
+    subs            r12, r12, #1
+    bne             loop_24x64
+    // The counter lives in scratch register r12, so there is no callee-saved register to restore.
+    bx              lr
+endfunc
+
 // void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 .macro pixel_avg_pp_4xN_neon h
 function x265_pixel_avg_pp_4x\h\()_neon


More information about the x265-devel mailing list