[x265] [PATCH] asm: avx2 code for high_bit_depth intra_dc_16x16
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue May 12 08:18:53 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431409071 -19800
# Tue May 12 11:07:51 2015 +0530
# Node ID ac2832c459edc3e1417d6fb62f89203e23484ec8
# Parent 5bf9096a4d93fd8c47e9c81e3dafba5d37391114
asm: avx2 code for high_bit_depth intra_dc_16x16
AVX2:
intra_dc_16x16[filter=0] 18.78x 231.65 4350.17
intra_dc_16x16[filter=1] 10.76x 467.37 5028.99
SSE:
intra_dc_16x16[filter=0] 9.46x 459.22 4345.05
intra_dc_16x16[filter=1] 7.19x 692.54 4976.13
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 12 11:07:51 2015 +0530
@@ -1181,6 +1181,7 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_avx2;
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/const-a.asm Tue May 12 11:07:51 2015 +0530
@@ -62,7 +62,7 @@
;; 16-bit constants
const pw_1, times 16 dw 1
-const pw_2, times 8 dw 2
+const pw_2, times 16 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/intrapred.h Tue May 12 11:07:51 2015 +0530
@@ -34,6 +34,7 @@
void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_dc16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
+void x265_intra_pred_dc16_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_dc32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter);
void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int);
diff -r 5bf9096a4d93 -r ac2832c459ed source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue May 12 11:04:20 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue May 12 11:07:51 2015 +0530
@@ -448,6 +448,118 @@
%endrep
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter)
+;---------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc16, 3, 9, 4            ; x86inc: load 3 args, use 9 GPRs and 4 vector regs
+    mov             r3d, r4m                ; r3d = bFilter (5th argument)
+    add             r1d, r1d                ; r1 = row stride in bytes (pixels are 16-bit words)
+    movu            m0, [r2 + 66]           ; 16 left neighbours  (srcPix words 33..48)
+    movu            m2, [r2 + 2]            ; 16 above neighbours (srcPix words 1..16)
+    paddw           m0, m2                  ; 16 pairwise sums
+
+    vextracti128    xm1, m0, 1
+    paddw           xm0, xm1                ; 8 words, each the sum of 4 neighbours
+    movhlps         xm1, xm0
+    paddw           xm0, xm1                ; low 4 words: sums of 8 neighbours (<= 32760 up to 12-bit depth)
+    pmaddwd         xm0, [pw_1]             ; widen to 32 bits while the sums still fit a signed word
+    phaddd          xm0, xm0                ; dword 0 = sum of all 32 neighbours
+    paddd           xm0, [pd_16]            ; rounding term (pd_16 is defined outside this hunk)
+    psrad           xm0, 5                  ; dc = (sum + rounding) >> 5
+    movd            r5d, xm0                ; NOTE(review): r5d is overwritten below before it is read
+    vpbroadcastw    m0, xm0                 ; m0 = dc replicated into all 16 words
+
+    test            r3d, r3d                ; ZF = (bFilter == 0), consumed by the jz further down
+
+    ; store DC 16x16 (only lea/movu between test and jz, neither modifies the flags)
+    lea             r6, [r1 + r1 * 2]       ; r6 = 3 * stride
+    lea             r7, [r1 + r1 * 4]       ; r7 = 5 * stride
+    lea             r8, [r6 + r1 * 4]       ; r8 = 7 * stride
+    lea             r4, [r0 + r8 * 1]       ; r4 = address of row 7
+
+    movu            [r0], m0                ; row 0
+    movu            [r0 + r1], m0           ; row 1
+    movu            [r0 + r1 * 2], m0       ; row 2
+    movu            [r0 + r6], m0           ; row 3
+    movu            [r0 + r1 * 4], m0       ; row 4
+    movu            [r0 + r7], m0           ; row 5
+    movu            [r0 + r6 * 2], m0       ; row 6
+    movu            [r4], m0                ; row 7
+    movu            [r0 + r1 * 8], m0       ; row 8
+    movu            [r4 + r1 * 2], m0       ; row 9
+    movu            [r0 + r7 * 2], m0       ; row 10
+    movu            [r4 + r1 * 4], m0       ; row 11
+    movu            [r0 + r6 * 4], m0       ; row 12
+    movu            [r4 + r6 * 2], m0       ; row 13
+    movu            [r4 + r8], m0           ; row 14
+    movu            [r4 + r1 * 8], m0       ; row 15
+
+    ; Do DC Filter (skipped entirely when bFilter == 0)
+    jz              .end
+    mova            m1, [pw_2]
+    pmullw          m1, m0                  ; 2 * dc
+    paddw           m1, [pw_2]              ; 2 * dc + 2
+    movd            r3d, xm1                ; keep 2 * dc + 2 (low word) for the corner pixel
+    paddw           m1, m0                  ; 3 * dc + 2
+
+    ; filter top: row 0 = (above[x] + 3 * dc + 2) >> 2
+    movu            m2, [r2 + 2]
+    paddw           m2, m1
+    psraw           m2, 2
+    movu            [r0], m2
+
+    ; filter top-left: dst[0] = (left[0] + above[0] + 2 * dc + 2) >> 2, overwrites word 0 of row 0
+    movzx           r3d, r3w                ; r3d = 2 * dc + 2
+    movzx           r5d, word [r2 + 66]     ; left[0]
+    add             r3d, r5d
+    movzx           r5d, word [r2 + 2]      ; above[0]
+    add             r5d, r3d
+    shr             r5d, 2
+    mov             [r0], r5w
+
+    ; filter left: first pixel of rows 1..15 = (left[y] + 3 * dc + 2) >> 2
+    movu            m2, [r2 + 68]           ; left[1..16] (srcPix words 34..49)
+    paddw           m2, m1
+    psraw           m2, 2
+    vextracti128    xm3, m2, 1              ; xm3 = results for rows 9..16 (the last word is not stored)
+
+    movq            r3, xm2                 ; four filtered words, rows 1..4
+    pshufd          xm2, xm2, 0xEE          ; bring the high qword down (rows 5..8)
+    mov             [r0 + r1], r3w          ; row 1
+    shr             r3, 16
+    mov             [r0 + r1 * 2], r3w      ; row 2
+    shr             r3, 16
+    mov             [r0 + r6], r3w          ; row 3
+    shr             r3, 16
+    mov             [r0 + r1 * 4], r3w      ; row 4
+    movq            r3, xm2
+    mov             [r0 + r7], r3w          ; row 5
+    shr             r3, 16
+    mov             [r0 + r6 * 2], r3w      ; row 6
+    shr             r3, 16
+    mov             [r4], r3w               ; row 7
+    shr             r3, 16
+    mov             [r0 + r1 * 8], r3w      ; row 8
+
+    movq            r3, xm3                 ; four filtered words, rows 9..12
+    pshufd          xm3, xm3, 0xEE          ; bring the high qword down (rows 13..15)
+    mov             [r4 + r1 * 2], r3w      ; row 9
+    shr             r3, 16
+    mov             [r0 + r7 * 2], r3w      ; row 10
+    shr             r3, 16
+    mov             [r4 + r1 * 4], r3w      ; row 11
+    shr             r3, 16
+    mov             [r0 + r6 * 4], r3w      ; row 12
+    movq            r3, xm3
+    mov             [r4 + r6 * 2], r3w      ; row 13
+    shr             r3, 16
+    mov             [r4 + r8], r3w          ; row 14
+    shr             r3, 16
+    mov             [r4 + r1 * 8], r3w      ; row 15
+.end:
+    RET
+
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list