[x265] [PATCH] arm: Implement dequant_scaling ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Fri Apr 15 09:13:30 CEST 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1460700579 -19800
# Fri Apr 15 11:39:39 2016 +0530
# Node ID 5db26e0f305c09cffa1df1557029f7021ae5d7bd
# Parent 54f6bc901448039b2f5523531bda1f2351b6103f
arm: Implement dequant_scaling ARM NEON
diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Fri Apr 15 11:39:39 2016 +0530
@@ -43,6 +43,8 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // dequant_scaling
+ p.dequant_scaling = PFX(dequant_scaling_neon);
// luma satd
p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_neon);
p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.S Fri Apr 15 11:39:39 2016 +0530
@@ -1962,3 +1962,68 @@
bx lr
endfunc
+/***** dequant_scaling *****/
+// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
+// r0 = quantCoef, r1 = deQuantCoef, r2 = coef, r3 = num; per and shift are read from the stack.
+function x265_dequant_scaling_neon
+    push            {r4, r5, r6, r7}
+    ldr             r4, [sp, #16]       // per: first stack argument, 16 bytes above sp after the push
+    ldr             r5, [sp, #20]       // shift: second stack argument
+    add             r5, #4              // shift + 4
+    lsr             r3, #3              // num / 8 iterations; no zero/tail check, so num must be a non-zero multiple of 8
+    cmp             r5, r4
+    ble             2f                  // shift <= per takes the left-shift path, as in the C reference
+    // shift > per: coef = sat16((quantCoef * deQuantCoef + add) >> (shift - per))
+    mov             r12, #1
+    sub             r6, r5, r4          // shift - per (>= 1 on this path)
+    sub             r6, #1              // shift - per - 1
+    lsl             r6, r12, r6         // add = 1 << (shift - per - 1)
+    vdup.32         q0, r6
+    sub             r7, r4, r5          // per - shift (negative here)
+    vdup.32         q3, r7              // a negative count makes vshl shift right
+
+1:                                      // rounding right-shift loop, 8 coefficients per pass
+    vld1.16         {q9}, [r0]!         // quantCoef
+    vld1.32         {q2}, [r1]!         // deQuantCoef
+    vld1.32         {q10}, [r1]!
+    vmovl.s16       q1, d18             // widen quantCoef to 32 bits
+    vmovl.s16       q9, d19
+
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
+    vmul.s32        q9, q10
+    vadd.s32        q1, q0              // quantCoef * deQuantCoef + add
+    vadd.s32        q9, q0
+
+    vshl.s32        q1, q3              // >> (shift - per)
+    vshl.s32        q9, q3
+    vqmovn.s32      d16, q1             // saturating narrow to int16 (x265_clip3)
+    vqmovn.s32      d17, q9
+    subs            r3, #1
+    vst1.16         {q8}, [r2]!         // vst1 leaves the flags from subs intact
+    bne             1b
+    b               9f
+
+2:                                      // shift <= per: coef = sat16(sat16(quantCoef * deQuantCoef) << (per - shift))
+    sub             r6, r4, r5          // per - shift (>= 0 on this path)
+    vdup.16         q0, r6
+
+3:                                      // saturating left-shift loop, 8 coefficients per pass
+    vld1.16         {q9}, [r0]!         // quantCoef
+    vld1.32         {q2}, [r1]!         // deQuantCoef
+    vld1.32         {q10}, [r1]!
+    vmovl.s16       q1, d18             // widen quantCoef to 32 bits
+    vmovl.s16       q9, d19
+
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
+    vmul.s32        q9, q10
+    vqmovn.s32      d16, q1             // saturating narrow to int16 (x265_clip3)
+    vqmovn.s32      d17, q9
+
+    vqshl.s16       q8, q0              // coeffQ << (per - shift), saturating, all 8 lanes at once
+    subs            r3, #1
+    vst1.16         {q8}, [r2]!
+    bne             3b
+9:
+    pop             {r4, r5, r6, r7}
+    bx              lr
+endfunc
diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.h Fri Apr 15 11:39:39 2016 +0530
@@ -78,4 +78,6 @@
int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
#endif // ifndef X265_PIXEL_UTIL_ARM_H
More information about the x265-devel
mailing list