[x265] [PATCH] arm: Implement blockcopy_pp chroma ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Wed Apr 20 13:14:57 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1461046077 -19800
# Tue Apr 19 11:37:57 2016 +0530
# Node ID 0d38844bf4b3632444fc0249a549a1e0e3e2bfc8
# Parent 534b8e2845b8156010b3c79bfa88c81c7b0b9295
arm: Implement blockcopy_pp chroma ARM NEON
diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Tue Apr 19 11:37:57 2016 +0530
@@ -462,6 +462,56 @@
p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+ // chroma blockcopy
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].copy_pp = PFX(blockcopy_pp_2x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].copy_pp = PFX(blockcopy_pp_2x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].copy_pp = PFX(blockcopy_pp_4x2_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].copy_pp = PFX(blockcopy_pp_4x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].copy_pp = PFX(blockcopy_pp_6x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].copy_pp = PFX(blockcopy_pp_8x2_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].copy_pp = PFX(blockcopy_pp_8x6_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].copy_pp = PFX(blockcopy_pp_8x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].copy_pp = PFX(blockcopy_pp_12x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].copy_pp = PFX(blockcopy_pp_16x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].copy_pp = PFX(blockcopy_pp_16x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].copy_pp = PFX(blockcopy_pp_16x12_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = PFX(blockcopy_pp_24x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = PFX(blockcopy_pp_32x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].copy_pp = PFX(blockcopy_pp_2x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].copy_pp = PFX(blockcopy_pp_4x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].copy_pp = PFX(blockcopy_pp_4x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].copy_pp = PFX(blockcopy_pp_6x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].copy_pp = PFX(blockcopy_pp_8x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].copy_pp = PFX(blockcopy_pp_8x12_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].copy_pp = PFX(blockcopy_pp_8x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].copy_pp = PFX(blockcopy_pp_8x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].copy_pp = PFX(blockcopy_pp_12x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].copy_pp = PFX(blockcopy_pp_16x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = PFX(blockcopy_pp_16x24_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].copy_pp = PFX(blockcopy_pp_16x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = PFX(blockcopy_pp_24x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
+
// sad
p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon);
p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_neon);
diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/blockcopy8.h Tue Apr 19 11:37:57 2016 +0530
@@ -51,6 +51,21 @@
void x265_blockcopy_pp_64x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
void x265_blockcopy_pp_64x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
void x265_blockcopy_pp_64x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x4_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_2x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x8_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_6x16_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x6_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x12_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_8x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_12x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x2_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_4x32_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_16x24_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_24x64_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+void x265_blockcopy_pp_32x48_neon(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
void x265_cpy2Dto1D_shr_4x4_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_8x8_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r 534b8e2845b8 -r 0d38844bf4b3 source/common/arm/mc-a.S
--- a/source/common/arm/mc-a.S Fri Apr 15 16:44:32 2016 +0530
+++ b/source/common/arm/mc-a.S Tue Apr 19 11:37:57 2016 +0530
@@ -61,6 +61,8 @@
blockcopy_pp_4xN_neon 4
blockcopy_pp_4xN_neon 8
blockcopy_pp_4xN_neon 16
+blockcopy_pp_4xN_neon 2
+blockcopy_pp_4xN_neon 32
.macro blockcopy_pp_16xN_neon h
function x265_blockcopy_pp_16x\h\()_neon
@@ -75,6 +77,7 @@
blockcopy_pp_16xN_neon 4
blockcopy_pp_16xN_neon 8
blockcopy_pp_16xN_neon 12
+blockcopy_pp_16xN_neon 24
.macro blockcopy_pp_16xN1_neon h i
function x265_blockcopy_pp_16x\h\()_neon
@@ -109,6 +112,9 @@
blockcopy_pp_8xN_neon 8
blockcopy_pp_8xN_neon 16
blockcopy_pp_8xN_neon 32
+blockcopy_pp_8xN_neon 2
+blockcopy_pp_8xN_neon 6
+blockcopy_pp_8xN_neon 12
function x265_blockcopy_pp_12x16_neon
push {r4, r5}
@@ -167,6 +173,7 @@
blockcopy_pp_32xN_neon 24 3
blockcopy_pp_32xN_neon 32 4
blockcopy_pp_32xN_neon 64 8
+blockcopy_pp_32xN_neon 48 6
function x265_blockcopy_pp_48x64_neon
push {r4, r5}
@@ -213,6 +220,81 @@
blockcopy_pp_64xN_neon 48 12
blockcopy_pp_64xN_neon 64 16
+.macro blockcopy_pp_2xN_neon h // emits x265_blockcopy_pp_2x<h>_neon, which copies h rows of 2 bytes each
+function x265_blockcopy_pp_2x\h\()_neon
+.rept \h
+ ldrh r12, [r2], r3 // load one 2-byte row from [r2], then r2 += r3 (src, srcStride per the blockcopy8.h prototype)
+ strh r12, [r0], r1 // store that row to [r0], then r0 += r1 (dst, dstStride)
+.endr
+ bx lr
+endfunc
+.endm
+
+blockcopy_pp_2xN_neon 4
+blockcopy_pp_2xN_neon 8
+blockcopy_pp_2xN_neon 16
+
+.macro blockcopy_pp_6xN_neon h i // emits x265_blockcopy_pp_6x<h>_neon; i = number of 2-row iterations (the invocations below pass h/2)
+function x265_blockcopy_pp_6x\h\()_neon
+ sub r1, #4 // the first store of each row already advances r0 by 4, so r1 becomes the remaining step to the next row
+.rept \i
+ vld1.8 {d0}, [r2], r3 // reads 8 bytes of the row although only 6 are stored; r2 += r3
+ vld1.8 {d1}, [r2], r3
+ vst1.32 {d0[0]}, [r0]! // bytes 0-3 of the row; r0 += 4
+ vst1.16 {d0[2]}, [r0], r1 // bytes 4-5 of the row; r0 += r1
+ vst1.32 {d1[0]}, [r0]!
+ vst1.16 {d1[2]}, [r0], r1
+.endr
+ bx lr
+endfunc
+.endm
+blockcopy_pp_6xN_neon 8 4
+blockcopy_pp_6xN_neon 16 8
+
+function x265_blockcopy_pp_8x64_neon // copies 64 rows of 8 bytes from [r2] (row step r3) to [r0] (row step r1)
+ mov r12, #4 // 4 passes of 16 unrolled rows = 64 rows
+loop_pp_8x64:
+ subs r12, #1 // sets the flags tested by bne below; the NEON loads/stores in between leave them untouched
+.rept 16
+ vld1.8 {d0}, [r2], r3
+ vst1.8 {d0}, [r0], r1
+.endr
+ bne loop_pp_8x64
+ bx lr
+endfunc
+
+function x265_blockcopy_pp_12x32_neon // copies 32 rows of 12 bytes from [r2] to [r0]
+ push {r4} // r4 serves as a data temporary below and is restored before returning
+ sub r3, #8 // the 8-byte load/store post-increment by 8, so both strides become the remaining step to the next row
+ sub r1, #8
+ mov r12, #4 // 4 passes of 8 unrolled rows = 32 rows
+loop_pp_12x32:
+ subs r12, #1 // flags are consumed by bne after the unrolled body
+.rept 8
+ vld1.8 {d0}, [r2]! // bytes 0-7 of the row; r2 += 8
+ ldr r4, [r2], r3 // bytes 8-11 of the row; r2 += r3
+ vst1.8 {d0}, [r0]!
+ str r4, [r0], r1
+.endr
+ bne loop_pp_12x32
+ pop {r4}
+ bx lr
+endfunc
+
+function x265_blockcopy_pp_24x64_neon
+ // Copies 64 rows of 24 bytes from [r2] (row step r3) to [r0] (row step r1).
+ mov r12, #4 // 4 passes of 16 unrolled rows; r12 (ip) is caller-saved, so nothing needs saving
+loop_24x64:
+.rept 16
+ vld1.8 {d0, d1, d2}, [r2], r3 // load one 24-byte row, then r2 += r3
+ vst1.8 {d0, d1, d2}, [r0], r1 // store it, then r0 += r1
+.endr
+ subs r12, r12, #1
+ bne loop_24x64
+ // No callee-saved register was used, so there is no stack state to restore.
+ bx lr
+endfunc
+
// void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
.macro pixel_avg_pp_4xN_neon h
function x265_pixel_avg_pp_4x\h\()_neon
More information about the x265-devel
mailing list