[x265] [PATCH 4 of 4] asm: AVX2 version sa8d[16x16], 1913c(AVX) -> 1620c(AVX2)
Min Chen
chenm003 at 163.com
Thu May 14 01:53:20 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1431561179 25200
# Node ID 851b3d841543a59796361f83d2cdeef976c35549
# Parent 10e71f621d70b0185deeddae9974df54ebc63dc4
asm: AVX2 version sa8d[16x16], 1913c(AVX) -> 1620c(AVX2)
---
source/common/x86/asm-primitives.cpp | 2 +
source/common/x86/pixel-a.asm | 37 ++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 0 deletions(-)
diff -r 10e71f621d70 -r 851b3d841543 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed May 13 16:52:56 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Wed May 13 16:52:59 2015 -0700
@@ -1941,7 +1941,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = x265_addAvg_32x64_avx2;
p.cu[BLOCK_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
+ p.cu[BLOCK_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = x265_pixel_sa8d_8x8_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = x265_pixel_sa8d_16x16_avx2;
p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2;
p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2;
diff -r 10e71f621d70 -r 851b3d841543 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed May 13 16:52:56 2015 -0700
+++ b/source/common/x86/pixel-a.asm Wed May 13 16:52:59 2015 -0700
@@ -6982,6 +6982,43 @@
add eax, 1
shr eax, 1
RET
+
+cglobal pixel_sa8d_16x16, 4,6,8 ; AVX2 sa8d for a 16x16 block (per the patch title), built from four 8x8 internal calls; x86inc: 4 args, 6 GPRs, 8 vector regs; result is left in eax
+ SATD_START_AVX2 m6, m7, 1 ; setup macro defined elsewhere; NOTE(review): presumably clears the m6 accumulator and prepares r4/r5 and m7 - confirm against the macro
+
+ call pixel_sa8d_8x8_internal ; pix[0]: top-left 8x8 block
+
+ sub r0, r1 ; back up two rows on the r0 side (r1 is used as its row stride); NOTE(review): implies the internal call leaves r0/r2 two rows ahead - confirm
+ sub r0, r1
+ add r0, 8*SIZEOF_PIXEL ; step 8 pixels to the right
+ sub r2, r3 ; same fix-up on the r2 side, with r3 used as its row stride
+ sub r2, r3
+ add r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal ; pix[8]: top-right 8x8 block
+
+ add r0, r4 ; r4/r5 are not written in this block; NOTE(review): for the pix[8*stride+8] annotation to hold, r4 + r1 plus the two rows already advanced must total 8 rows, i.e. r4 = 5*r1 and r5 = 5*r3 - confirm
+ add r0, r1
+ add r2, r5
+ add r2, r3
+ call pixel_sa8d_8x8_internal ; pix[8*stride+8]: bottom-right 8x8 block
+
+ sub r0, r1 ; back up two rows again, then step 8 pixels to the left
+ sub r0, r1
+ sub r0, 8*SIZEOF_PIXEL
+ sub r2, r3
+ sub r2, r3
+ sub r2, 8*SIZEOF_PIXEL
+ call pixel_sa8d_8x8_internal ; pix[8*stride]: bottom-left 8x8 block
+
+ ; TODO: analyze Dynamic Range - the reduction below works on 16-bit words and paddusw saturates instead of wrapping
+ vextracti128 xm0, m6, 1 ; xm0 = upper 128 bits of m6
+ paddusw xm6, xm0 ; fold upper half into lower half, word-wise with unsigned saturation
+ HADDUW xm6, xm0 ; macro defined elsewhere; NOTE(review): presumably a horizontal unsigned word sum using xm0 as scratch - confirm
+ movd eax, xm6 ; eax = low 32 bits of xm6
+ add eax, 1 ; eax = (sum + 1) >> 1, i.e. halve with rounding
+ shr eax, 1
+ RET
+
%endif ; HIGH_BIT_DEPTH
; Input 16bpp, Output 8bpp
More information about the x265-devel
mailing list