[x265] [PATCH] asm_arm: NEON version of psyCost_4x4
Min Chen
chenm003 at 163.com
Wed May 25 17:19:08 CEST 2016
# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1464189527 18000
# Node ID 5abead62ce63ec2a472a2424d54d40f015146995
# Parent 4723933fdec920debefe606d50a9a312f7bc7f6b
asm_arm: NEON version of psyCost_4x4
---
source/common/arm/asm-primitives.cpp | 3 ++
source/common/arm/pixel-util.S | 66 ++++++++++++++++++++++++++++++++++
source/common/arm/pixel-util.h | 3 ++
3 files changed, 72 insertions(+), 0 deletions(-)
diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Wed May 25 10:18:47 2016 -0500
@@ -1007,6 +1007,9 @@
p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon);
p.cu[BLOCK_8x8].dct = PFX(dct_8x8_neon);
+#if !HIGH_BIT_DEPTH
+ p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
+#endif // !HIGH_BIT_DEPTH
}
if (cpuMask & X265_CPU_ARMV6)
{
diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/pixel-util.S Wed May 25 10:18:47 2016 -0500
@@ -3,6 +3,7 @@
*
* Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
* Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
+ * Min Chen <min.chen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -2389,3 +2390,68 @@
vst4.32 {d16-d19}, [r12]
bx lr
endfunc
+
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) -- 4x4 block, 8-bit pixels; r0 = source, r1 = sstride, r2 = recon, r3 = rstride; result in r0. "A" below is the source block, "B" the recon block; lane lists are written high lane first.
+function x265_psyCost_4x4_neon
+ vld1.32 {d16[]}, [r0,:32], r1 // d16 = [A03 A02 A01 A00 A03 A02 A01 A00] (row 0 replicated to both halves; :32 is a 32-bit address alignment qualifier; r0 advances by r1 after every load)
+ vld1.32 {d17[]}, [r0,:32], r1 // d17 = [A13 A12 A11 A10 A13 A12 A11 A10] (row 1 replicated)
+ vld1.32 {d16[1]}, [r0,:32], r1 // d16 = [A23 A22 A21 A20 A03 A02 A01 A00] (row 2 overwrites the upper half)
+ vld1.32 {d17[1]}, [r0,:32], r1 // d17 = [A33 A32 A31 A30 A13 A12 A11 A10] (row 3 overwrites the upper half)
+
+ vld1.32 {d18[]}, [r2,:32], r3 // d18 = [B03 B02 B01 B00 B03 B02 B01 B00] (same load pattern for recon, stepping r2 by r3)
+ vld1.32 {d19[]}, [r2,:32], r3 // d19 = [B13 B12 B11 B10 B13 B12 B11 B10]
+ vld1.32 {d18[1]}, [r2,:32], r3 // d18 = [B23 B22 B21 B20 B03 B02 B01 B00]
+ vld1.32 {d19[1]}, [r2,:32], r3 // d19 = [B33 B32 B31 B30 B13 B12 B11 B10]
+
+ vaddl.u8 q2, d16, d17 // q2 = [2+3 0+1]: A row sums widened to 16 bit (d5 = row2+row3, d4 = row0+row1)
+ vsubl.u8 q3, d16, d17 // q3 = [2-3 0-1]: A row differences widened to 16 bit (d7 = row2-row3, d6 = row0-row1)
+ vaddl.u8 q12, d18, d19 // q12 = [2+3 0+1] for B
+ vsubl.u8 q13, d18, d19 // q13 = [2-3 0-1] for B
+
+ SUMSUB_ABCD d0, d2, d1, d3, d4, d5, d6, d7 // q0 = [(0-1)+(2-3) (0+1)+(2+3)], q1 = [(0-1)-(2-3) (0+1)-(2+3)]
+ SUMSUB_ABCD d20, d22, d21, d23, d24, d25, d26, d27 // same for B into q10/q11 (NOTE(review): SUMSUB_* macros are defined outside this hunk; effect taken from the annotation on the line above)
+
+ // Hadamard-1D
+ vtrn.16 q0, q1 // swap odd 16-bit lanes of q0 with even lanes of q1, so lane i of q0/q1 now holds neighbouring columns (0/1, 2/3)
+ vtrn.16 q10, q11 // same for B
+ SUMSUB_AB q2, q3, q0, q1 // q2 = [((0-1)-(2-3))+((0-1)+(2-3)) ((0+1)-(2+3))+((0+1)+(2+3))], q3 = [((0-1)-(2-3))-((0-1)+(2-3)) ((0+1)-(2+3))-((0+1)+(2+3))]
+ SUMSUB_AB q12, q13, q10, q11 // same for B: q12 = q10 + q11, q13 = q10 - q11 (per the annotation above)
+
+ // SAD Stage-0
+ vaddl.u8 q14, d16, d17 // q14 = [S23x4 S01x4]: A pixel sums per column, d29 = row2+row3, d28 = row0+row1 (plain pixel sums, no subtraction involved)
+ vaddl.u8 q15, d18, d19 // q15 = same for B (d31 = row2+row3, d30 = row0+row1)
+
+ // Hadamard-2D
+ vtrn.32 q2, q3 // exchange 32-bit lanes so lane i of q2 and q3 are the two inputs of the remaining butterfly
+ vtrn.32 q12, q13 // same for B
+ vabs.s16 q2, q2 // |.| of every 16-bit lane; the vmax below uses max(|a|,|b|) == (|a+b| + |a-b|) / 2 in place of the last butterfly
+ vabs.s16 q12, q12
+ vabs.s16 q3, q3
+ vabs.s16 q13, q13
+
+ // SAD Stage-1
+ vadd.u16 d28, d29 // SAD: reduce to 4 elements
+ vadd.u16 d30, d31 // same for B: d30 = 4 column sums
+
+ vmax.s16 q0, q2, q3 // q0 = 8 per-lane maxima for A (half of the summed absolute butterfly outputs, see vabs note)
+ vmax.s16 q10, q12, q13 // q10 = 8 per-lane maxima for B
+
+ // SAD Stage-2
+ vpadd.u16 d28, d30 // SAD: reduce to 2 elements per block, d28 = [B B A A]
+
+ // SAD & SATD Final Stage
+ vswp d1, d20 // regroup: q0 = [B.lo A.lo], q10 = [B.hi A.hi]
+ vadd.u16 q0, q10 // d0 = A.lo + A.hi, d1 = B.lo + B.hi (4 partial sums each)
+ vpaddl.u16 d28, d28 // d28 = SAD_DWORD[B A]
+ vpadd.u16 d0, d1 // d0 = [B B A A] pairwise partial sums
+ vshr.u32 d28, #2 // d28 = SAD_DWORD[B A] >> 2
+ vpaddl.u16 d0, d0 // d0 = SATD_DWORD[B A]
+ vsub.s32 d0, d28 // d0 = SATD - (SAD >> 2), one 32-bit value per block: [B A]
+ vmov.32 r0, d0[0] // r0 = value for A (source)
+ vmov.32 r1, d0[1] // r1 = value for B (recon)
+ subs r0, r1 // r0 = A - B, flags updated for the conditional negate
+ rsbmi r0, r0, #0 // if the difference is negative, r0 = 0 - r0, i.e. r0 = |A - B|
+
+ bx lr // return with the result in r0
+endfunc
+
diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/pixel-util.h Wed May 25 10:18:47 2016 -0500
@@ -86,4 +86,7 @@
void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); // 4x4 psy cost, NEON asm in pixel-util.S (assumes PFX() yields the x265_ symbol prefix used there -- TODO confirm)
+
#endif // ifndef X265_PIXEL_UTIL_ARM_H
More information about the x265-devel
mailing list