[x265] [PATCH] arm: Implement pixel_satd ARM NEON
radhakrishnan at multicorewareinc.com
Wed Apr 13 10:53:31 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1459339153 -19800
# Wed Mar 30 17:29:13 2016 +0530
# Node ID 68b2e7ebe0f05053d106fcebef5839f62bb61aa6
# Parent e7d937ad1ea341eeebd210188e08540ab6104fef
arm: Implement pixel_satd ARM NEON
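
(Background, not part of the patch: SATD here is the sum of absolute values of
the Hadamard-transformed residual between the two input blocks; each
pixel_satd_WxH_neon routine below returns that value, with larger partitions
built up from 4x4/8x8/16x4 partial transforms that are accumulated. A minimal
scalar sketch of the 4x4 case is shown for illustration only; it follows the
usual x264/x265 convention of halving the sum and uses the same argument order
as the prototypes added to pixel-util.h, but it is not the project's reference
implementation and the helper name is made up.)

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar 4x4 SATD: Hadamard-transform the pixel difference,
 * sum the absolute coefficients, halve the result. */
static int satd_4x4_scalar(const uint8_t *pix1, intptr_t stride1,
                           const uint8_t *pix2, intptr_t stride2)
{
    int d[4][4], t[4][4];
    int sum = 0;

    /* residual between the two blocks */
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            d[i][j] = pix1[i * stride1 + j] - pix2[i * stride2 + j];

    /* horizontal 4-point Hadamard on each row */
    for (int i = 0; i < 4; i++) {
        int a0 = d[i][0] + d[i][1], a1 = d[i][0] - d[i][1];
        int a2 = d[i][2] + d[i][3], a3 = d[i][2] - d[i][3];
        t[i][0] = a0 + a2; t[i][1] = a1 + a3;
        t[i][2] = a0 - a2; t[i][3] = a1 - a3;
    }

    /* vertical pass, accumulating absolute coefficients */
    for (int j = 0; j < 4; j++) {
        int a0 = t[0][j] + t[1][j], a1 = t[0][j] - t[1][j];
        int a2 = t[2][j] + t[3][j], a3 = t[2][j] - t[3][j];
        sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
    }

    return sum >> 1;   /* SATD is conventionally reported halved */
}
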
diff -r e7d937ad1ea3 -r 68b2e7ebe0f0 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Apr 13 03:01:46 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530
@@ -43,6 +43,74 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // luma satd
+ p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_neon);
+ p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_neon);
+ p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+ p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_neon);
+ p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_neon);
+ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
+ p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
+ p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
+ p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_neon);
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_neon);
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_neon);
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
+ p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
+ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
+ p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
+ p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_neon);
+ p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
+
+ // chroma satd
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
+
// chroma_hpp
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = PFX(interp_4tap_horiz_pp_4x2_neon);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = PFX(interp_4tap_horiz_pp_4x4_neon);
@@ -498,7 +566,7 @@
// planecopy
p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
- //p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
+ p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_neon);
p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
diff -r e7d937ad1ea3 -r 68b2e7ebe0f0 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Apr 13 03:01:46 2016 +0530
+++ b/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530
@@ -735,20 +735,930 @@
bx lr
endfunc
+//******* satd *******
+.macro satd_4x4_neon
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d2[]}, [r0,:32], r1
+ vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r0,:32], r1
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r0,:32], r1
+ vsubl.u8 q0, d0, d1
+ vsubl.u8 q1, d2, d3
+ SUMSUB_AB q2, q3, q0, q1
+ SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7
+ HADAMARD 1, sumsub, q2, q3, q0, q1
+ HADAMARD 2, amax, q0,, q2, q3
+ HORIZ_ADD d0, d0, d1
+.endm
+
+function x265_pixel_satd_4x4_neon
+ satd_4x4_neon
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+.macro LOAD_DIFF_8x4_1 q0 q1 q2 q3
+ vld1.32 {d1}, [r2], r3
+ vld1.32 {d0}, [r0,:64], r1
+ vsubl.u8 \q0, d0, d1
+ vld1.32 {d3}, [r2], r3
+ vld1.32 {d2}, [r0,:64], r1
+ vsubl.u8 \q1, d2, d3
+ vld1.32 {d5}, [r2], r3
+ vld1.32 {d4}, [r0,:64], r1
+ vsubl.u8 \q2, d4, d5
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d6}, [r0,:64], r1
+ vsubl.u8 \q3, d6, d7
+.endm
+
+.macro x265_satd_4x8_8x4_end_neon
+ vadd.s16 q0, q8, q10
+ vadd.s16 q1, q9, q11
+ vsub.s16 q2, q8, q10
+ vsub.s16 q3, q9, q11
+
+ vtrn.16 q0, q1
+ vadd.s16 q8, q0, q1
+ vtrn.16 q2, q3
+ vsub.s16 q9, q0, q1
+ vadd.s16 q10, q2, q3
+ vsub.s16 q11, q2, q3
+ vtrn.32 q8, q10
+ vabs.s16 q8, q8
+ vtrn.32 q9, q11
+ vabs.s16 q10, q10
+ vabs.s16 q9, q9
+ vabs.s16 q11, q11
+ vmax.u16 q0, q8, q10
+ vmax.u16 q1, q9, q11
+ vadd.u16 q0, q0, q1
+ HORIZ_ADD d0, d0, d1
+.endm
+
+.macro pixel_satd_4x8_neon
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d2[]}, [r0,:32], r1
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d7[]}, [r2], r3
+ vld1.32 {d6[]}, [r0,:32], r1
+
+ vld1.32 {d1[1]}, [r2], r3
+ vld1.32 {d0[1]}, [r0,:32], r1
+ vsubl.u8 q0, d0, d1
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d2[1]}, [r0,:32], r1
+ vsubl.u8 q1, d2, d3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r0,:32], r1
+ vsubl.u8 q2, d4, d5
+ vld1.32 {d7[1]}, [r2], r3
+ SUMSUB_AB q8, q9, q0, q1
+ vld1.32 {d6[1]}, [r0,:32], r1
+ vsubl.u8 q3, d6, d7
+ SUMSUB_AB q10, q11, q2, q3
+ x265_satd_4x8_8x4_end_neon
+.endm
+
+function x265_pixel_satd_4x8_neon
+ pixel_satd_4x8_neon
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_4x16_neon
+ push {r4, r5}
+ eor r4, r4
+ pixel_satd_4x8_neon
+ vmov.32 r5, d0[0]
+ add r4, r5
+ pixel_satd_4x8_neon
+ vmov.32 r5, d0[0]
+ add r0, r5, r4
+ pop {r4, r5}
+ bx lr
+endfunc
+
+function x265_pixel_satd_4x32_neon
+ push {r4, r5}
+ eor r4, r4
+.rept 4
+ pixel_satd_4x8_neon
+ vmov.32 r5, d0[0]
+ add r4, r5
+.endr
+ mov r0, r4
+ pop {r4, r5}
+ bx lr
+endfunc
+
+function x265_pixel_satd_12x16_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ mov r4, r0
+ mov r5, r2
+ eor r7, r7
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+
+ add r0, r4, #4
+ add r2, r5, #4
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+
+ add r0, r4, #8
+ add r2, r5, #8
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r0, r7, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_12x32_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ mov r4, r0
+ mov r5, r2
+ eor r7, r7
+.rept 4
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+.endr
+
+ add r0, r4, #4
+ add r2, r5, #4
+.rept 4
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+.endr
+
+ add r0, r4, #8
+ add r2, r5, #8
+.rept 4
+ pixel_satd_4x8_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+.endr
+
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x4_neon
+ push {r4, r5, r6}
+ mov r4, r0
+ mov r5, r2
+ satd_4x4_neon
+ add r0, r4, #4
+ add r2, r5, #4
+ vmov.32 r6, d0[0]
+ satd_4x4_neon
+ vmov.32 r0, d0[0]
+ add r0, r0, r6
+ pop {r4, r5, r6}
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x8_neon
+ mov ip, lr
+ push {r4, r5, r6, r7}
+ eor r4, r4
+ mov r6, r0
+ mov r7, r2
+ pixel_satd_4x8_neon
+ vmov.32 r5, d0[0]
+ add r4, r5
+ add r0, r6, #4
+ add r2, r7, #4
+ pixel_satd_4x8_neon
+ vmov.32 r5, d0[0]
+ add r0, r4, r5
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x12_neon
+ push {r4, r5, r6, r7}
+ mov r4, r0
+ mov r5, r2
+ eor r7, r7
+ satd_4x4_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+ add r0, r4, #4
+ add r2, r5, #4
+ satd_4x4_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+.rept 2
+ sub r0, #4
+ sub r2, #4
+ mov r4, r0
+ mov r5, r2
+ satd_4x4_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+ add r0, r4, #4
+ add r2, r5, #4
+ satd_4x4_neon
+ vmov.32 r6, d0[0]
+ add r7, r6
+.endr
+ mov r0, r7
+ pop {r4, r5, r6, r7}
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x16_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x32_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 3
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_8x64_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 7
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_satd_8x8_neon
+ LOAD_DIFF_8x4_1 q8, q9, q10, q11
+ vld1.64 {d7}, [r2], r3
+ vld1.64 {d6}, [r0,:64], r1
+ vsubl.u8 q12, d6, d7
+ SUMSUB_AB q0, q1, q8, q9
+
+ vld1.64 {d17}, [r2], r3
+ vld1.64 {d16}, [r0,:64], r1
+ vsubl.u8 q13, d16, d17
+ SUMSUB_AB q2, q3, q10, q11
+
+ vld1.64 {d19}, [r2], r3
+ vld1.64 {d18}, [r0,:64], r1
+ vsubl.u8 q14, d18, d19
+ SUMSUB_AB q8, q10, q0, q2
+
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d0}, [r0,:64], r1
+ vsubl.u8 q15, d0, d1
+ SUMSUB_AB q9, q11, q1, q3
+endfunc
+
+// one vertical Hadamard pass and two horizontal passes
+function x265_satd_8x4v_8x8h_neon, export=0
+ SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15
+ SUMSUB_AB q12, q14, q0, q2
+ SUMSUB_AB q13, q15, q1, q3
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+
+ SUMSUB_AB q0, q1, q8, q9
+ SUMSUB_AB q2, q3, q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+
+ SUMSUB_AB q8, q9, q12, q13
+ SUMSUB_AB q10, q11, q14, q15
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ ABS2 q0, q2
+ ABS2 q1, q3
+
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ ABS2 q8, q10
+ ABS2 q9, q11
+
+ vmax.s16 q12, q0, q2
+ vmax.s16 q13, q1, q3
+ vmax.s16 q14, q8, q10
+ vmax.s16 q15, q9, q11
+ bx lr
+endfunc
+
+function x265_satd_16x4_neon, export=0
+ vld1.64 {d2-d3}, [r2], r3
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vsubl.u8 q8, d0, d2
+ vsubl.u8 q12, d1, d3
+
+ vld1.64 {d6-d7}, [r2], r3
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vsubl.u8 q9, d4, d6
+ vsubl.u8 q13, d5, d7
+
+ vld1.64 {d2-d3}, [r2], r3
+ vld1.64 {d0-d1}, [r0,:128], r1
+ vsubl.u8 q10, d0, d2
+ vsubl.u8 q14, d1, d3
+
+ vld1.64 {d6-d7}, [r2], r3
+ vld1.64 {d4-d5}, [r0,:128], r1
+ vsubl.u8 q11, d4, d6
+ vsubl.u8 q15, d5, d7
+
+ vadd.s16 q0, q8, q9
+ vsub.s16 q1, q8, q9
+ SUMSUB_AB q2, q3, q10, q11
+ SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3
+ b x265_satd_8x4v_8x8h_neon
+endfunc
+
+function x265_pixel_satd_16x4_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_16x8_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_16x12_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 2
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_16x16_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 3
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_16x24_neon
+ vpush {d8-d11}
+ mov ip, lr
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 5
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+.macro pixel_satd_16x32_neon
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q12, q13
+ vadd.u16 q5, q14, q15
+.rept 7
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+.endm
+
+function x265_pixel_satd_16x32_neon
+ vpush {d8-d11}
+ mov ip, lr
+ pixel_satd_16x32_neon
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vpop {d8-d11}
+ mov lr, ip
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_satd_16x64_neon
+ push {r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ pixel_satd_16x32_neon
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r7, r6
+
+    veor q4, q4
+ veor q5, q5
+ pixel_satd_16x32_neon
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r0, r7, r6
+ vpop {d8-d11}
+ pop {r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_24x32_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ mov r4, r0
+ mov r5, r2
+.rept 3
+ veor q4, q4
+ veor q5, q5
+.rept 4
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r7, r6
+ add r4, #8
+ add r5, #8
+ mov r0, r4
+ mov r2, r5
+.endr
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_24x64_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ mov r4, r0
+ mov r5, r2
+.rept 3
+ veor q4, q4
+ veor q5, q5
+.rept 4
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r7, r6
+ add r4, #8
+ add r5, #8
+ mov r0, r4
+ mov r2, r5
+.endr
+
+ sub r4, #24
+ sub r5, #24
+ add r0, r4, r1, lsl #5
+ add r2, r5, r3, lsl #5
+ mov r4, r0
+ mov r5, r2
+.rept 3
+ veor q4, q4
+ veor q5, q5
+.rept 4
+ bl x265_satd_8x8_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endr
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r7, r6
+ add r4, #8
+ add r5, #8
+ mov r0, r4
+ mov r2, r5
+.endr
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+.macro pixel_satd_32x8
+ mov r4, r0
+ mov r5, r2
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ add r0, r4, #16
+ add r2, r5, #16
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+
+ bl x265_satd_16x4_neon
+ vadd.u16 q4, q4, q12
+ vadd.u16 q5, q5, q13
+ vadd.u16 q4, q4, q14
+ vadd.u16 q5, q5, q15
+.endm
+
+function x265_pixel_satd_32x8_neon
+ push {r4, r5}
+ vpush {d8-d11}
+ mov ip, lr
+ veor q4, q4
+ veor q5, q5
+ pixel_satd_32x8
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r0, d0[0]
+ vpop {d8-d11}
+ pop {r4, r5}
+ mov lr, ip
+ bx lr
+endfunc
+
+.macro satd_32x16_neon
+ veor q4, q4
+ veor q5, q5
+ pixel_satd_32x8
+ sub r0, #16
+ sub r2, #16
+ pixel_satd_32x8
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+.endm
+
+function x265_pixel_satd_32x16_neon
+ push {r4, r5, r6}
+ vpush {d8-d11}
+ mov ip, lr
+ satd_32x16_neon
+ mov r0, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_32x24_neon
+ push {r4, r5, r6}
+ vpush {d8-d11}
+ mov ip, lr
+ satd_32x16_neon
+ veor q4, q4
+ veor q5, q5
+ sub r0, #16
+ sub r2, #16
+ pixel_satd_32x8
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r0, d0[0]
+ add r0, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_32x32_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ satd_32x16_neon
+ sub r0, #16
+ sub r2, #16
+ add r7, r6
+ satd_32x16_neon
+ add r0, r7, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_32x48_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+.rept 2
+ satd_32x16_neon
+ sub r0, #16
+ sub r2, #16
+ add r7, r6
+.endr
+ satd_32x16_neon
+ add r0, r7, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_32x64_neon
+ push {r4, r5, r6, r7}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+.rept 3
+ satd_32x16_neon
+ sub r0, #16
+ sub r2, #16
+ add r7, r6
+.endr
+ satd_32x16_neon
+ add r0, r7, r6
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7}
+ mov lr, ip
+ bx lr
+endfunc
+
+.macro satd_64x16_neon
+ mov r8, r0
+ mov r9, r2
+ satd_32x16_neon
+ add r7, r6
+ add r0, r8, #32
+ add r2, r9, #32
+ satd_32x16_neon
+ add r7, r6
+.endm
+
+function x265_pixel_satd_64x16_neon
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ satd_64x16_neon
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7, r8, r9}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_64x32_neon
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7, r8, r9}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_64x48_neon
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7, r8, r9}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_64x64_neon
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ sub r0, #48
+ sub r2, #48
+ satd_64x16_neon
+ mov r0, r7
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7, r8, r9}
+ mov lr, ip
+ bx lr
+endfunc
+
+function x265_pixel_satd_48x64_neon
+ push {r4, r5, r6, r7, r8, r9}
+ vpush {d8-d11}
+ mov ip, lr
+ eor r7, r7
+ mov r8, r0
+ mov r9, r2
+.rept 3
+ satd_32x16_neon
+ sub r0, #16
+ sub r2, #16
+ add r7, r6
+.endr
+ satd_32x16_neon
+ add r7, r6
+
+ add r0, r8, #32
+ add r2, r9, #32
+ pixel_satd_16x32_neon
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r7, r6
+
+    veor q4, q4
+ veor q5, q5
+ pixel_satd_16x32_neon
+ vadd.u16 q0, q4, q5
+ HORIZ_ADD d0, d0, d1
+ vmov.32 r6, d0[0]
+ add r0, r7, r6
+
+ vpop {d8-d11}
+ pop {r4, r5, r6, r7, r8, r9}
+ mov lr, ip
+ bx lr
+endfunc
.macro LOAD_DIFF_8x4 q0 q1 q2 q3
- vld1.32 {d1}, [r2], r3
- vld1.32 {d0}, [r0,:64], r1
- vsubl.u8 \q0, d0, d1
- vld1.32 {d3}, [r2], r3
- vld1.32 {d2}, [r0,:64], r1
- vsubl.u8 \q1, d2, d3
- vld1.32 {d5}, [r2], r3
- vld1.32 {d4}, [r0,:64], r1
- vsubl.u8 \q2, d4, d5
- vld1.32 {d7}, [r2], r3
- vld1.32 {d6}, [r0,:64], r1
- vsubl.u8 \q3, d6, d7
+ vld1.32 {d1}, [r2], r3
+ vld1.32 {d0}, [r0,:64], r1
+ vsubl.u8 \q0, d0, d1
+ vld1.32 {d3}, [r2], r3
+ vld1.32 {d2}, [r0,:64], r1
+ vsubl.u8 \q1, d2, d3
+ vld1.32 {d5}, [r2], r3
+ vld1.32 {d4}, [r0,:64], r1
+ vsubl.u8 \q2, d4, d5
+ vld1.32 {d7}, [r2], r3
+ vld1.32 {d6}, [r0,:64], r1
+ vsubl.u8 \q3, d6, d7
.endm
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
diff -r e7d937ad1ea3 -r 68b2e7ebe0f0 source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h Wed Apr 13 03:01:46 2016 +0530
+++ b/source/common/arm/pixel-util.h Wed Mar 30 17:29:13 2016 +0530
@@ -38,6 +38,39 @@
void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);