[x265] [PATCH] asm_arm: NEON version of psyCost_4x4

Min Chen chenm003 at 163.com
Wed May 25 17:19:08 CEST 2016


# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1464189527 18000
# Node ID 5abead62ce63ec2a472a2424d54d40f015146995
# Parent  4723933fdec920debefe606d50a9a312f7bc7f6b
asm_arm: NEON version of psyCost_4x4
---
 source/common/arm/asm-primitives.cpp |    3 ++
 source/common/arm/pixel-util.S       |   66 ++++++++++++++++++++++++++++++++++
 source/common/arm/pixel-util.h       |    3 ++
 3 files changed, 72 insertions(+), 0 deletions(-)

diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp	Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Wed May 25 10:18:47 2016 -0500
@@ -1007,6 +1007,9 @@
 
         p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon);
         p.cu[BLOCK_8x8].dct = PFX(dct_8x8_neon);
+#if !HIGH_BIT_DEPTH
+        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
+#endif // !HIGH_BIT_DEPTH
     }
     if (cpuMask & X265_CPU_ARMV6)
     {
diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S	Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/pixel-util.S	Wed May 25 10:18:47 2016 -0500
@@ -3,6 +3,7 @@
  *
  * Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
  *          Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
+ *          Min Chen <min.chen at multicorewareinc.com>
  * 
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -2389,3 +2390,68 @@
     vst4.32         {d16-d19}, [r12]
     bx              lr
 endfunc
+
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride): 4x4 block, returns r0 = |E(source) - E(recon)| with E = SATD - (SAD >> 2); A = source (r0, r1), B = recon (r2, r3)
+function x265_psyCost_4x4_neon
+    vld1.32         {d16[]}, [r0,:32], r1                   // d16 = [A03 A02 A01 A00 A03 A02 A01 A00]
+    vld1.32         {d17[]}, [r0,:32], r1                   // d17 = [A13 A12 A11 A10 A13 A12 A11 A10]
+    vld1.32         {d16[1]}, [r0,:32], r1                  // d16 = [A23 A22 A21 A20 A03 A02 A01 A00]
+    vld1.32         {d17[1]}, [r0,:32], r1                  // d17 = [A33 A32 A31 A30 A13 A12 A11 A10]
+
+    vld1.32         {d18[]}, [r2,:32], r3                   // d18 = [B03 B02 B01 B00 B03 B02 B01 B00]
+    vld1.32         {d19[]}, [r2,:32], r3                   // d19 = [B13 B12 B11 B10 B13 B12 B11 B10]
+    vld1.32         {d18[1]}, [r2,:32], r3                  // d18 = [B23 B22 B21 B20 B03 B02 B01 B00]
+    vld1.32         {d19[1]}, [r2,:32], r3                  // d19 = [B33 B32 B31 B30 B13 B12 B11 B10]
+
+    vaddl.u8        q2, d16, d17                            // q2 = [2+3 0+1]
+    vsubl.u8        q3, d16, d17                            // q3 = [2-3 0-1]
+    vaddl.u8        q12, d18, d19                           // q12 = [2+3 0+1] for B (rows widened to 16 bit)
+    vsubl.u8        q13, d18, d19                           // q13 = [2-3 0-1] for B
+
+    SUMSUB_ABCD     d0, d2, d1, d3, d4, d5, d6, d7          // q0 = [(0-1)+(2-3) (0+1)+(2+3)], q1 = [(0-1)-(2-3) (0+1)-(2+3)]
+    SUMSUB_ABCD     d20, d22, d21, d23, d24, d25, d26, d27  // same butterfly for B into q10, q11 (macro defined elsewhere; assumed add/sub pairs -- TODO confirm)
+
+    // Hadamard-1D
+    vtrn.16         q0, q1                                  // A: exchange 16-bit lanes between q0 and q1 (2x2 transposes)
+    vtrn.16         q10, q11                                // B: same for q10, q11
+    SUMSUB_AB       q2, q3, q0, q1                          // q2 = [((0-1)-(2-3))+((0-1)+(2-3)) ((0+1)-(2+3))+((0+1)+(2+3))], q3 = [((0-1)-(2-3))-((0-1)+(2-3)) ((0+1)-(2+3))-((0+1)+(2+3))]
+    SUMSUB_AB       q12, q13, q10, q11                      // B: mirrors the line above, results in q12, q13
+
+    // SAD Stage-0
+    vaddl.u8        q14, d16, d17                           // q14 = [S23x4 S01x4]
+    vaddl.u8        q15, d18, d19                           // q15 = [S23x4 S01x4] for B
+
+    // Hadamard-2D
+    vtrn.32         q2, q3                                  // A: exchange 32-bit lanes between q2 and q3
+    vtrn.32         q12, q13                                // B: same for q12, q13
+    vabs.s16        q2, q2                                  // A: absolute values, first register
+    vabs.s16        q12, q12                                // B: absolute values, first register
+    vabs.s16        q3, q3                                  // A: absolute values, second register
+    vabs.s16        q13, q13                                // B: absolute values, second register
+
+    // SAD Stage-1
+    vadd.u16        d28, d29                                // SAD: reduce to 4 elements
+    vadd.u16        d30, d31                                // SAD: same reduction for B
+
+    vmax.s16        q0, q2, q3                              // A: per-lane max(|q2|, |q3|); NOTE(review): presumably folds the last butterfly via |a+b|+|a-b| = 2*max(|a|,|b|) -- confirm
+    vmax.s16        q10, q12, q13                           // B: per-lane max(|q12|, |q13|)
+
+    // SAD Stage-2
+    vpadd.u16       d28, d30                                // SAD: reduce to 2 elements
+
+    // SAD & SATD Final Stage
+    vswp            d1, d20                                 // regroup halves: q0 = [B.lo A.lo], q10 = [B.hi A.hi]
+    vadd.u16        q0, q10                                 // d0 = A partial sums, d1 = B partial sums
+    vpaddl.u16      d28, d28                                // d28 = SAD_DWORD[B A]
+    vpadd.u16       d0, d1                                  // pairwise add: d0 = [B B A A]
+    vshr.u32        d28, #2                                 // d28 = SAD_DWORD[B A] >> 2
+    vpaddl.u16      d0, d0                                  // d0 = SATD_DWORD[B A]
+    vsub.s32        d0, d28                                 // d0 = SATD - SAD
+    vmov.32         r0, d0[0]                               // r0 = SATD - (SAD >> 2) of A (source)
+    vmov.32         r1, d0[1]                               // r1 = SATD - (SAD >> 2) of B (recon)
+    subs            r0, r1                                  // r0 = A - B, flags updated for the conditional negate
+    rsbmi           r0, r0, #0                              // negate when negative: r0 = |A - B|
+
+    bx              lr                                      // return value is in r0
+endfunc
+
diff -r 4723933fdec9 -r 5abead62ce63 source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h	Fri May 13 09:32:11 2016 +0530
+++ b/source/common/arm/pixel-util.h	Wed May 25 10:18:47 2016 -0500
@@ -86,4 +86,7 @@
 void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
 
 void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); // NEON psyCost for 4x4 blocks, implemented in pixel-util.S
+
 #endif // ifndef X265_PIXEL_UTIL_ARM_H



More information about the x265-devel mailing list