[x265] [PATCH 1 of 2 Update 3] asm: assembly code for quant
Min Chen
chenm003 at 163.com
Wed Nov 20 08:20:56 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384932029 -28800
# Node ID 5883934a91ddf09502c94f8a340162e19da1311c
# Parent 108ddc9e5c6b15e758ccbf08a0e923cbb7b28b5e
asm: assembly code for quant
diff -r 108ddc9e5c6b -r 5883934a91dd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Nov 19 23:45:52 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Nov 20 15:20:29 2013 +0800
@@ -635,6 +635,7 @@
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
+ p.quant = x265_quant_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 108ddc9e5c6b -r 5883934a91dd source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Tue Nov 19 23:45:52 2013 -0600
+++ b/source/common/x86/pixel-util.asm Wed Nov 20 15:20:29 2013 +0800
@@ -26,9 +26,12 @@
SECTION_RODATA 32
+c_d_4: dd 4, 4, 4, 4 ; per-lane position step: quant adds this to its position vector after every group of 4 coefficients
+c_d_1234: dd 1, 2, 3, 4 ; initial 1-based lane positions loaded by quant as its starting position vector
+
+
SECTION .text
-
;-----------------------------------------------------------------------------
; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
;-----------------------------------------------------------------------------
@@ -549,3 +552,121 @@
jnz .loop
RET
+
+;-----------------------------------------------------------------------------
+; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+; level = (abs(coef) * quantCoeff + add) >> qBits; qCoef = sign-restored level saturated to int16; deltaU = (abs(coef) * quantCoeff - (level << qBits)) >> (qBits - 8); returns sum of levels; *lastPos = index of last nonzero level, or -1
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64 == 1
+cglobal quant, 5,6,11
+ %define addVec m8
+ %define qbits m9
+ %define qbits8 m10
+%else
+cglobal quant, 5,6,8, 0-(3*mmsize)
+ %define addVec [rsp + 0 * mmsize]
+ %define qbits [rsp + 1 * mmsize]
+ %define qbits8 [rsp + 2 * mmsize]
+%endif
+ ; on entry r0 = coef, r1 = quantCoeff, r2 = deltaU, r3 = qCoef, r4d = qBits; add, numCoeff and lastPos are read via r5m, r6m, r7m
+ ; fill qbits (shift count used to derive level)
+ movd m0, r4d ; count sits in the low dword, remaining lanes are zeroed
+ mova qbits, m0
+
+ ; fill qbits-8 (shift count used to derive deltaU)
+ sub r4d, 8
+ movd m0, r4d
+ mova qbits8, m0
+
+ ; fill offset: broadcast the rounding term 'add' to all four lanes
+ mov r4d, r5m ; r4d = add
+ movd m0, r4d
+ pshufd m0, m0, 0 ; replicate lane 0 into every lane
+ mova addVec, m0
+
+ mov r4d, r6m ; r4d = numCoeff
+ shr r4d, 3 ; r4d = pass count, 8 coefficients per pass; NOTE(review): a count of 0 would wrap at the dec/jnz below - assumes numCoeff >= 8
+ pxor m7, m7 ; m7 = acSum4 (per-lane running sum of levels)
+ mova m6, [c_d_1234] ; m6 = last4 (1-based positions of the current 4 coefficients)
+ pxor m5, m5 ; m5 = count (per-lane position of the most recent nonzero level, 0 = none yet)
+ mova m4, [c_d_4] ; m4 = [4 4 4 4]
+.loop:
+ ; first 4 coeff of this pass
+ movu m0, [r0] ; m0 = coef
+ pxor m1, m1
+ pcmpgtd m1, m0 ; m1 = sign mask (all ones where coef < 0)
+ movu m2, [r1] ; m2 = quantCoeff
+ pabsd m0, m0 ; m0 = abs(coef)
+ pmulld m0, m2 ; m0 = tmpLevel = abs(coef) * quantCoeff
+ paddd m2, m0, addVec ; m2 = tmpLevel + add
+ psrad m2, qbits ; m2 = level
+ paddd m7, m2 ; acSum4 += level
+ pslld m3, m2, qbits ; m3 = level << qBits
+ psubd m0, m3 ; m0 = tmpLevel - (level << qBits)
+ psrad m0, qbits8 ; m0 = deltaU
+ movu [r2], m0 ; store deltaU
+
+ pxor m0, m0
+ pcmpeqd m0, m2 ; m0 = mask4 (all ones where level == 0)
+ pand m5, m0 ; keep the previous position in lanes whose level is zero
+ pandn m0, m6 ; take the current position in lanes whose level is nonzero
+ por m5, m0 ; m5 = updated count
+ paddd m6, m4 ; advance positions to the next 4 coefficients
+
+ pxor m2, m1 ; with the psubd below: negate level where coef was negative
+ psubd m2, m1
+ packssdw m2, m2 ; saturate to signed 16-bit
+ pmovsxwd m2, m2 ; sign-extend back to 32-bit
+ movu [r3], m2 ; store qCoef
+
+ ; second 4 coeff of this pass (byte offset 16), same steps as above
+ movu m0, [r0 + 16] ; m0 = coef
+ pxor m1, m1
+ pcmpgtd m1, m0 ; m1 = sign mask (all ones where coef < 0)
+ movu m2, [r1 + 16] ; m2 = quantCoeff
+ pabsd m0, m0 ; m0 = abs(coef)
+ pmulld m0, m2 ; m0 = tmpLevel = abs(coef) * quantCoeff
+ paddd m2, m0, addVec ; m2 = tmpLevel + add
+ psrad m2, qbits ; m2 = level
+ paddd m7, m2 ; acSum4 += level
+ pslld m3, m2, qbits ; m3 = level << qBits
+ psubd m0, m3 ; m0 = tmpLevel - (level << qBits)
+ psrad m0, qbits8 ; m0 = deltaU
+ movu [r2 + 16], m0 ; store deltaU
+
+ pxor m0, m0
+ pcmpeqd m0, m2 ; m0 = mask4 (all ones where level == 0)
+ pand m5, m0 ; keep the previous position in lanes whose level is zero
+ pandn m0, m6 ; take the current position in lanes whose level is nonzero
+ por m5, m0 ; m5 = updated count
+ paddd m6, m4 ; advance positions to the next 4 coefficients
+
+ pxor m2, m1 ; with the psubd below: negate level where coef was negative
+ psubd m2, m1
+ packssdw m2, m2 ; saturate to signed 16-bit
+ pmovsxwd m2, m2 ; sign-extend back to 32-bit
+ movu [r3 + 16], m2 ; store qCoef
+
+ add r0, 32 ; advance all four pointers by 8 dwords
+ add r1, 32
+ add r2, 32
+ add r3, 32
+
+ dec r4d
+ jnz .loop
+
+ movhlps m4, m5 ; m4 low half = count lanes 2,3
+ pmaxud m4, m5 ; lanes 0,1 = max of lane pairs (0,2) and (1,3)
+ pshufd m5, m4, 1 ; m5 lane 0 = m4 lane 1
+ pmaxud m4, m5 ; m4 lane 0 = largest position over all four lanes
+
+ mov r4, r7m ; r4 = lastPos
+ movd [r4], m4
+ dec dword [r4] ; 1-based position -> 0-based index; yields -1 when every level was zero
+
+ phaddd m7, m7 ; two horizontal adds fold the four acSum4 lanes into lane 0
+ phaddd m7, m7
+ movd eax, m7 ; return value = sum of all levels
+
+ RET
diff -r 108ddc9e5c6b -r 5883934a91dd source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Tue Nov 19 23:45:52 2013 -0600
+++ b/source/common/x86/pixel.h Wed Nov 20 15:20:29 2013 +0800
@@ -367,5 +367,6 @@
void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose8_sse2(pixel *dest, pixel *src, intptr_t stride);
void x265_transpose16_sse2(pixel *dest, pixel *src, intptr_t stride);
+uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
#endif // ifndef X265_I386_PIXEL_H
diff -r 108ddc9e5c6b -r 5883934a91dd source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Tue Nov 19 23:45:52 2013 -0600
+++ b/source/test/mbdstharness.cpp Wed Nov 20 15:20:29 2013 +0800
@@ -30,6 +30,7 @@
#include <stdio.h>
using namespace x265;
+#define ITERS 100
struct DctConf_t
{
@@ -245,7 +246,7 @@
mintbuf2[i] = rand() & PIXEL_MAX;
}
- for (int i = 0; i <= 5; i++)
+ for (int i = 0; i <= ITERS; i++)
{
int width = (rand() % 4 + 1) * 4;
@@ -282,10 +283,10 @@
j += 16;
#if _DEBUG
- memset(mintbuf3, 0, mem_cmp_size);
- memset(mintbuf4, 0, mem_cmp_size);
- memset(mintbuf5, 0, mem_cmp_size);
- memset(mintbuf6, 0, mem_cmp_size);
+ memset(mintbuf3, 0xCD, mem_cmp_size);
+ memset(mintbuf4, 0xCD, mem_cmp_size);
+ memset(mintbuf5, 0xCD, mem_cmp_size);
+ memset(mintbuf6, 0xCD, mem_cmp_size);
#endif
}
More information about the x265-devel
mailing list