[x265] [PATCH] asm: intra_pred_dc32 high_bit_depth code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue May 12 08:17:32 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431408860 -19800
# Tue May 12 11:04:20 2015 +0530
# Node ID 5bf9096a4d93fd8c47e9c81e3dafba5d37391114
# Parent f2081ef64fd27dfd3a5bec92ee1a835a74061761
asm: intra_pred_dc32 high_bit_depth code
AVX2:
intra_dc_32x32 19.36x 780.92 15118.54
SSE:
intra_dc_32x32 10.41x 1457.15 15167.84
diff -r f2081ef64fd2 -r 5bf9096a4d93 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 11 18:50:03 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue May 12 11:04:20 2015 +0530
@@ -1181,6 +1181,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_avx2;
+
p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
diff -r f2081ef64fd2 -r 5bf9096a4d93 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon May 11 18:50:03 2015 -0500
+++ b/source/common/x86/intrapred16.asm Tue May 12 11:04:20 2015 +0530
@@ -448,6 +448,62 @@
%endrep
RET
+;---------------------------------------------------------------------------------------------
+; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
+;
+; 32x32 DC intra prediction for 16-bit pixels, AVX2 version.
+;   r0 = dst, r1 = dstStride (doubled below to a byte stride), r2 = srcPix
+;   dirMode and bFilter are never read (cglobal loads only the first 3 arguments).
+; DC = (sum of 64 reference samples + 32) >> 6; the samples are the words
+; srcPix[1..32] and srcPix[65..96] (presumably the above and left neighbours).
+; The sum is widened to 32 bits once each word holds 8 samples, so it stays exact
+; for sample values up to 4095 (12-bit).
+;---------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_dc32, 3, 3, 2
+    add             r2, 2                               ; skip srcPix[0] (one 16-bit sample)
+    add             r1, r1                              ; byte stride = 2 * dstStride; full-width add keeps every intptr_t bit
+    movu            m0, [r2]                            ; srcPix[1..16]
+    movu            m1, [r2 + 32]                       ; srcPix[17..32]
+    add             r2, mmsize*4                        ; r2 += 128 -> &srcPix[65]
+    paddw           m0, m1
+    movu            m1, [r2]                            ; srcPix[65..80]
+    paddw           m0, m1
+    movu            m1, [r2 + 32]                       ; srcPix[81..96]
+    paddw           m0, m1                              ; 16 words, 4 samples each
+    vextracti128    xm1, m0, 1
+    paddw           xm0, xm1                            ; 8 words, 8 samples each (max 8*4095 = 32760, still a valid signed word)
+    pmaddwd         xm0, [pw_1]                         ; widen: 4 dwords, 16 samples each
+    movhlps         xm1, xm0
+    paddd           xm0, xm1                            ; dwords 0-1 hold 32 samples each
+    phaddd          xm0, xm0                            ; dword 0 = sum of all 64 samples
+    paddd           xm0, [pd_32]                        ; sum = sum + 32
+    psrld           xm0, 6                              ; sum = sum / 64
+    vpbroadcastw    m0, xm0                             ; replicate the DC word into all 16 lanes
+
+    lea             r2, [r1 * 3]                        ; r2 = 3 * byte stride (srcPix pointer is no longer needed)
+    ; store DC 32x32: 8 groups of 4 rows, two ymm stores (2 * 32 bytes = 32 pixels) per row
+%rep 7
+    movu            [r0 + r1 * 0 + 0], m0
+    movu            [r0 + r1 * 0 + mmsize], m0
+    movu            [r0 + r1 * 1 + 0], m0
+    movu            [r0 + r1 * 1 + mmsize], m0
+    movu            [r0 + r1 * 2 + 0], m0
+    movu            [r0 + r1 * 2 + mmsize], m0
+    movu            [r0 + r2 * 1 + 0], m0
+    movu            [r0 + r2 * 1 + mmsize], m0
+    lea             r0, [r0 + r1 * 4]                   ; advance dst by 4 rows
+%endrep
+    ; last group of 4 rows; dst does not need to advance afterwards
+    movu            [r0 + r1 * 0 + 0], m0
+    movu            [r0 + r1 * 0 + mmsize], m0
+    movu            [r0 + r1 * 1 + 0], m0
+    movu            [r0 + r1 * 1 + mmsize], m0
+    movu            [r0 + r1 * 2 + 0], m0
+    movu            [r0 + r1 * 2 + mmsize], m0
+    movu            [r0 + r2 * 1 + 0], m0
+    movu            [r0 + r2 * 1 + mmsize], m0
+    RET
+
;---------------------------------------------------------------------------------------
; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter)
;---------------------------------------------------------------------------------------
More information about the x265-devel
mailing list