[x265] [PATCH] asm: ARM NEON version of DCT[4x4]
Min Chen
chenm003 at 163.com
Wed Apr 27 17:57:35 CEST 2016
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1461651296 -19800
# Node ID 0ca4769256c992c7fcef3d9110cb113e0ce88b56
# Parent 19cced21060f71e8efe5f2544ccb14f9273fd93c
asm: ARM NEON version of DCT[4x4]
diff -r 19cced21060f -r 0ca4769256c9 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Tue Apr 26 15:06:55 2016 -0700
+++ b/source/common/CMakeLists.txt Tue Apr 26 11:44:56 2016 +0530
@@ -89,7 +89,7 @@
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S)
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r 19cced21060f -r 0ca4769256c9 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Tue Apr 26 15:06:55 2016 -0700
+++ b/source/common/arm/asm-primitives.cpp Tue Apr 26 11:44:56 2016 +0530
@@ -34,6 +34,7 @@
#include "pixel.h"
#include "pixel-util.h"
#include "ipfilter8.h"
+#include "dct8.h"
}
namespace X265_NS {
@@ -820,6 +821,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);
+ p.cu[BLOCK_4x4].dct = PFX(dct_4x4_neon);
}
if (cpuMask & X265_CPU_ARMV6)
{
diff -r 19cced21060f -r 0ca4769256c9 source/common/arm/dct-a.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/dct-a.S Tue Apr 26 11:44:56 2016 +0530
@@ -0,0 +1,122 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Min Chen <chenm003 at 163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.align 4
+
+// dst[0 * line] = ((64 * E[0] + 64 * E[1] + add) >> shift);
+// dst[2 * line] = ((64 * E[0] - 64 * E[1] + add) >> shift);
+// dst[1 * line] = ((83 * O[0] + 36 * O[1] + add) >> shift);
+// dst[3 * line] = ((36 * O[0] - 83 * O[1] + add) >> shift);
+// Two such passes are run below (rounding shift 1, then 8), with E[0]=s0+s3, E[1]=s1+s2, O[0]=s0-s3, O[1]=s1-s2.
+/* void x265_dct_4x4_neon(const int16_t* src, int16_t* dst, intptr_t srcStride): r0 = src, r1 = dst, r2 = srcStride in int16_t units */
+function x265_dct_4x4_neon
+ mov r2, r2, lsl #1 // r2 = row stride in bytes (int16_t elements * 2)
+ vld1.16 {d0}, [r0, :64], r2 // d0 = [03 02 01 00]
+ vld1.16 {d1}, [r0, :64], r2 // d1 = [13 12 11 10]
+ vld1.16 {d2}, [r0, :64], r2 // d2 = [23 22 21 20]
+ vld1.16 {d3}, [r0, :64] // d3 = [33 32 31 30]
+ // NOTE: the :64 qualifiers above require every row address (src + n * stride) to be 8-byte aligned
+ vtrn.32 q0, q1 // q0 = [31 30 11 10 21 20 01 00], q1 = [33 32 13 12 23 22 03 02]
+ vrev32.16 q1, q1 // q1 = [32 33 12 13 22 23 02 03]
+ // r0/r2 are reused for packed 16-bit coefficient pairs; src pointer and stride are not needed after the loads
+ movconst r0, 0x00240053 // low half = 83 (0x0053), high half = 36 (0x0024)
+ movconst r2, 0xFFAD0024 // low half = 36 (0x0024), high half = -83 (0xFFAD)
+
+ // DCT-1D (first pass: butterfly along each source row, 16-bit sums, 32-bit products)
+ vadd.s16 q2, q0, q1 // q2 = [E31 E30 E11 E10 E21 E20 E01 E00]
+ vsub.s16 q3, q0, q1 // q3 = [O31 O30 O11 O10 O21 O20 O01 O00]
+ vdup.32 d16, r0 // d16 = [ 36 83] (pair repeated in both 32-bit lanes)
+ vdup.32 d17, r2 // d17 = [-83 36] (pair repeated in both 32-bit lanes)
+ vtrn.16 d4, d5 // d4 = [E30 E20 E10 E00], d5 = [E31 E21 E11 E01]
+ vtrn.32 d6, d7 // q3 = [O31 O30 O21 O20 O11 O10 O01 O00]
+
+ vmull.s16 q9, d6, d16 // rows 0-1: widening multiply of each (O0, O1) pair by (83, 36)
+ vmull.s16 q10, d7, d16 // [q9, q10] = [ 36*O1 83*O0] -> [1]
+ vmull.s16 q11, d6, d17 // rows 0-1: widening multiply of each (O0, O1) pair by (36, -83)
+ vmull.s16 q12, d7, d17 // [q11,q12] = [-83*O1 36*O0] -> [3]
+
+ vadd.s16 d0, d4, d5 // d0 = [E0 + E1]
+ vsub.s16 d1, d4, d5 // d1 = [E0 - E1]
+
+ vpadd.s32 d18, d18, d19 // q9 = [1] (pairwise add folds the two products of each row)
+ vpadd.s32 d19, d20, d21 // d20/d21 are consumed here before being overwritten below
+ vpadd.s32 d20, d22, d23 // q10 = [3]
+ vpadd.s32 d21, d24, d25
+
+ vshll.s16 q1, d0, #6 // q1 = 64 * [0]
+ vshll.s16 q2, d1, #6 // q2 = 64 * [2]
+
+ // TODO: Dynamic Range is 11+6-1 bits
+ vqrshrn.s32 d25, q9, 1 // d25 = R[13 12 11 10] (rounding >> 1, saturating narrow to 16 bits)
+ vqrshrn.s32 d24, q1, 1 // d24 = R[03 02 01 00]
+ vqrshrn.s32 d26, q2, 1 // d26 = R[23 22 21 20]
+ vqrshrn.s32 d27, q10, 1 // d27 = R[33 32 31 30]
+
+ // First-pass result now sits in q12 (d24:d25) and q13 (d26:d27), one output row per d register
+ // DCT-2D
+ vmovl.s16 q0, d16 // q0 = [ 36 83 36 83] widened to 32 bits: d0[0] = 83, d0[1] = 36
+
+ vtrn.32 q12, q13 // q12 = [31 30 11 10 21 20 01 00], q13 = [33 32 13 12 23 22 03 02]
+ vrev32.16 q13, q13 // q13 = [32 33 12 13 22 23 02 03]
+
+ vaddl.s16 q1, d24, d26 // q1 = [E21 E20 E01 E00]
+ vaddl.s16 q2, d25, d27 // q2 = [E31 E30 E11 E10]
+ vsubl.s16 q3, d24, d26 // q3 = [O21 O20 O01 O00]
+ vsubl.s16 q8, d25, d27 // q8 = [O31 O30 O11 O10] (overwrites d16/d17; coefficients already copied to q0)
+
+ vtrn.32 q1, q2 // q1 = [E30 E20 E10 E00], q2 = [E31 E21 E11 E01]
+ vtrn.32 q3, q8 // q3 = [O30 O20 O10 O00], q8 = [O31 O21 O11 O01]
+
+ vmul.s32 q9, q3, d0[0] // q9 = [83*O30 83*O20 83*O10 83*O00]
+ vmul.s32 q10, q8, d0[1] // q10 = [36*O31 36*O21 36*O11 36*O01]
+ vmul.s32 q11, q3, d0[1] // q11 = [36*O30 36*O20 36*O10 36*O00]
+ vmul.s32 q12, q8, d0[0] // q12 = [83*O31 83*O21 83*O11 83*O01]
+
+ vadd.s32 q0, q1, q2 // q0 = [E0 + E1] (overwrites the coefficients in d0, no longer needed)
+ vsub.s32 q1, q1, q2 // q1 = [E0 - E1]
+
+ vadd.s32 q9, q9, q10 // q9 = 83*O0 + 36*O1 -> [1]
+ vsub.s32 q10, q11, q12 // q10 = 36*O0 - 83*O1 -> [3]
+
+ vshl.s32 q0, q0, #6 // q0 = 64 * [0]
+ vshl.s32 q1, q1, #6 // q1 = 64 * [2]
+
+ vqrshrn.s32 d25, q9, 8 // d25 = R[13 12 11 10] (rounding >> 8, saturating narrow to 16 bits)
+ vqrshrn.s32 d27, q10, 8 // d27 = R[33 32 31 30]
+
+ vqrshrn.s32 d24, q0, 8 // d24 = R[03 02 01 00]
+ vqrshrn.s32 d26, q1, 8 // d26 = R[23 22 21 20]
+
+ vst1.16 {d24-d27}, [r1] // write all 16 coefficients (rows 0..3) contiguously to dst
+
+ bx lr
+endfunc
+
diff -r 19cced21060f -r 0ca4769256c9 source/common/arm/dct8.h
--- a/source/common/arm/dct8.h Tue Apr 26 15:06:55 2016 -0700
+++ b/source/common/arm/dct8.h Tue Apr 26 11:44:56 2016 +0530
@@ -25,4 +25,6 @@
#ifndef X265_DCT8_ARM_H
#define X265_DCT8_ARM_H
+void PFX(dct_4x4_neon)(const int16_t* src, int16_t* dst, intptr_t srcStride); // NEON 4x4 forward DCT; srcStride is counted in int16_t elements, dst receives 16 contiguous coefficients
+
#endif // ifndef X265_DCT8_ARM_H
More information about the x265-devel
mailing list