[x265] [PATCH] arm: Implement blockcopy_pp chroma ARM NEON
chen
chenm003 at 163.com
Wed Apr 20 18:08:15 CEST 2016
No big problem in this patch, but please remember: NEON is a coprocessor (CP) unit, and the ARM core only buffers 10 instructions for the NEON pipeline, so heavily unrolled loops may stall the pipeline for some cycles.
At 2016-04-20 19:14:57,radhakrishnan at multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
># Date 1461046077 -19800
># Tue Apr 19 11:37:57 2016 +0530
># Node ID 0d38844bf4b3632444fc0249a549a1e0e3e2bfc8
># Parent 534b8e2845b8156010b3c79bfa88c81c7b0b9295
>arm: Implement blockcopy_pp chroma ARM NEON
>
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Tue Apr 19 11:37:57 2016 +0530
>@@ -462,6 +462,56 @@
> p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
> p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
>
>+ // chroma blockcopy
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp = PFX(blockcopy_pp_2x4_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp = PFX(blockcopy_pp_2x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp = PFX(blockcopy_pp_4x2_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp = PFX(blockcopy_pp_4x16_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp = PFX(blockcopy_pp_6x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp = PFX(blockcopy_pp_8x2_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp = PFX(blockcopy_pp_8x6_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp = PFX(blockcopy_pp_8x32_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp = PFX(blockcopy_pp_16x4_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp = PFX(blockcopy_pp_16x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = PFX(blockcopy_pp_32x8_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
>+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp = PFX(blockcopy_pp_2x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp = PFX(blockcopy_pp_4x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp = PFX(blockcopy_pp_4x32_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp = PFX(blockcopy_pp_6x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp = PFX(blockcopy_pp_8x12_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp = PFX(blockcopy_pp_8x32_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp = PFX(blockcopy_pp_8x64_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp = PFX(blockcopy_pp_16x8_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
>+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
>+
> // sad
> p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon);
> p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_neon);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/blockcopy8.h
>--- a/source/common/arm/blockcopy8.h Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/blockcopy8.h Tue Apr 19 11:37:57 2016 +0530
>@@ -51,6 +51,21 @@
> void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
> void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
>
> void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
> void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
>diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/mc-a.S
>--- a/source/common/arm/mc-a.S Fri Apr 15 16:44:32 2016 +0530
>+++ b/source/common/arm/mc-a.S Tue Apr 19 11:37:57 2016 +0530
>@@ -61,6 +61,8 @@
> blockcopy_pp_4xN_neon 4
> blockcopy_pp_4xN_neon 8
> blockcopy_pp_4xN_neon 16
>+blockcopy_pp_4xN_neon 2
>+blockcopy_pp_4xN_neon 32
>
> .macro blockcopy_pp_16xN_neon h
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -75,6 +77,7 @@
> blockcopy_pp_16xN_neon 4
> blockcopy_pp_16xN_neon 8
> blockcopy_pp_16xN_neon 12
>+blockcopy_pp_16xN_neon 24
>
> .macro blockcopy_pp_16xN1_neon h i
> function x265_blockcopy_pp_16x\h\()_neon
>@@ -109,6 +112,9 @@
> blockcopy_pp_8xN_neon 8
> blockcopy_pp_8xN_neon 16
> blockcopy_pp_8xN_neon 32
>+blockcopy_pp_8xN_neon 2
>+blockcopy_pp_8xN_neon 6
>+blockcopy_pp_8xN_neon 12
>
> function x265_blockcopy_pp_12x16_neon
> push {r4, r5}
>@@ -167,6 +173,7 @@
> blockcopy_pp_32xN_neon 24 3
> blockcopy_pp_32xN_neon 32 4
> blockcopy_pp_32xN_neon 64 8
>+blockcopy_pp_32xN_neon 48 6
>
> function x265_blockcopy_pp_48x64_neon
> push {r4, r5}
>@@ -213,6 +220,81 @@
> blockcopy_pp_64xN_neon 48 12
> blockcopy_pp_64xN_neon 64 16
>
>+.macro blockcopy_pp_2xN_neon h
>+function x265_blockcopy_pp_2x\h\()_neon
>+.rept \h
>+ ldrh r12, [r2], r3
>+ strh r12, [r0], r1
>+.endr
>+ bx lr
>+endfunc
>+.endm
>+
>+blockcopy_pp_2xN_neon 4
>+blockcopy_pp_2xN_neon 8
>+blockcopy_pp_2xN_neon 16
>+
>+.macro blockcopy_pp_6xN_neon h i
>+function x265_blockcopy_pp_6x\h\()_neon
>+ sub r1, #4
>+.rept \i
>+ vld1.8 {d0}, [r2], r3
>+ vld1.8 {d1}, [r2], r3
>+ vst1.32 {d0[0]}, [r0]!
>+ vst1.16 {d0[2]}, [r0], r1
>+ vst1.32 {d1[0]}, [r0]!
>+ vst1.16 {d1[2]}, [r0], r1
>+.endr
>+ bx lr
>+endfunc
>+.endm
>+blockcopy_pp_6xN_neon 8 4
>+blockcopy_pp_6xN_neon 16 8
>+
>+function x265_blockcopy_pp_8x64_neon
>+ mov r12, #4
>+loop_pp_8x64:
>+ subs r12, #1
>+.rept 16
>+ vld1.8 {d0}, [r2], r3
>+ vst1.8 {d0}, [r0], r1
>+.endr
>+ bne loop_pp_8x64
>+ bx lr
>+endfunc
>+
>+function x265_blockcopy_pp_12x32_neon
>+ push {r4}
>+ sub r3, #8
>+ sub r1, #8
>+ mov r12, #4
>+loop_pp_12x32:
>+ subs r12, #1
>+.rept 8
>+ vld1.8 {d0}, [r2]!
>+ ldr r4, [r2], r3
>+ vst1.8 {d0}, [r0]!
>+ str r4, [r0], r1
>+.endr
>+ bne loop_pp_12x32
>+ pop {r4}
>+ bx lr
>+endfunc
>+
>+function x265_blockcopy_pp_24x64_neon
>+ push {r4}
>+ mov r4, #4
>+loop_24x64:
>+.rept 16
>+ vld1.8 {d0, d1, d2}, [r2], r3
>+ vst1.8 {d0, d1, d2}, [r0], r1
>+.endr
>+ subs r4, r4, #1
>+ bne loop_24x64
>+ pop {r4}
>+ bx lr
>+endfunc
>+
> // void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
> .macro pixel_avg_pp_4xN_neon h
> function x265_pixel_avg_pp_4x\h\()_neon
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160421/f8c538c9/attachment-0001.html>
More information about the x265-devel mailing list