[x265-commits] [x265] asm: avx2 code for chroma vss filter for i422
Sumalatha at videolan.org
Thu Apr 23 19:49:58 CEST 2015
details: http://hg.videolan.org/x265/rev/b10f23e72fe1
branches:
changeset: 10265:b10f23e72fe1
user: Sumalatha Polureddy
date: Thu Apr 23 12:20:44 2015 +0530
description:
asm: avx2 code for chroma vss filter for i422
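For orientation, a scalar sketch of the 4-tap vertical ss (short-to-short) chroma filter these AVX2 routines implement, assuming the standard HEVC chroma tap table and the 6-bit shift used by the x265 C reference; the function and variable names here are illustrative, not quoted from ipfilter.cpp.

// Scalar reference sketch of a 4-tap vertical short-to-short chroma filter.
#include <cstdint>

static const int16_t chromaTaps[8][4] =
{
    {  0, 64,  0,  0 }, { -2, 58, 10, -2 }, { -4, 54, 16, -2 },
    { -6, 46, 28, -4 }, { -4, 36, 36, -4 }, { -4, 28, 46, -6 },
    { -2, 16, 54, -4 }, { -2, 10, 58, -2 }
};

static void interp_4tap_vert_ss_ref(const int16_t* src, intptr_t srcStride,
                                    int16_t* dst, intptr_t dstStride,
                                    int width, int height, int coeffIdx)
{
    const int16_t* c = chromaTaps[coeffIdx];
    src -= srcStride; // filter window starts one row above the current row

    for (int row = 0; row < height; row++)
    {
        for (int col = 0; col < width; col++)
        {
            int sum = src[col] * c[0]
                    + src[col + srcStride] * c[1]
                    + src[col + 2 * srcStride] * c[2]
                    + src[col + 3 * srcStride] * c[3];
            dst[col] = (int16_t)(sum >> 6); // ss path: shift only, no rounding or clipping
        }
        src += srcStride;
        dst += dstStride;
    }
}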
Subject: [x265] asm: avx2 code for chroma vss filter for i444
details: http://hg.videolan.org/x265/rev/c9ee14f8a634
branches:
changeset: 10266:c9ee14f8a634
user: Sumalatha Polureddy
date: Thu Apr 23 15:17:45 2015 +0530
description:
asm: avx2 code for chroma vss filter for i444
Subject: [x265] sao: modify saoCuOrgE2 primitive to handle width=16 separately
details: http://hg.videolan.org/x265/rev/a4322c920dbd
branches:
changeset: 10267:a4322c920dbd
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Apr 23 18:13:13 2015 +0530
description:
sao: modify saoCuOrgE2 primitive to handle width=16 separately
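A hypothetical dispatch sketch of what this split enables (the actual sao.cpp hunk is not included in the truncated diff below, and the names here are illustrative): entry 0 of the new two-entry table handles a 16-pixel-wide run in a single pass, entry 1 loops over 32-pixel multiples, mirroring the existing saoCuOrgE3[2] convention. The function-pointer signature matches the prototypes added to loopfilter.h below.

// Illustrative width-based selection between the two saoCuOrgE2 kernels.
#include <cstdint>

typedef uint8_t pixel; // 8-bit build assumed for this sketch

typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* bufft, int8_t* buff1,
                             int8_t* offsetEo, int lcuWidth, intptr_t stride);

static void processE2(const saoCuOrgE2_t saoCuOrgE2[2], pixel* rec, int8_t* bufft,
                      int8_t* buff1, int8_t* offsetEo, int width, intptr_t stride)
{
    // width == 16 -> entry 0 (single-shot kernel); wider -> entry 1 (32-wide loop)
    saoCuOrgE2[width > 16 ? 1 : 0](rec, bufft, buff1, offsetEo, width, stride);
}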
Subject: [x265] asm: saoCuOrgE2[0] avx2 code: improve 154c->128c
details: http://hg.videolan.org/x265/rev/da80cd99cb5e
branches:
changeset: 10268:da80cd99cb5e
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Apr 23 18:40:34 2015 +0530
description:
asm: saoCuOrgE2[0] avx2 code: improve 154c->128c
Subject: [x265] asm: saoCuOrgE2[1] avx2 code: improve 449c->292c
details: http://hg.videolan.org/x265/rev/55ac6ad82533
branches:
changeset: 10269:55ac6ad82533
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Apr 23 18:48:23 2015 +0530
description:
asm: saoCuOrgE2[1] avx2 code: improve 449c->292c
Subject: [x265] asm: avx2 10bit code for sub_ps[16x16],[32x32],[64x64]
details: http://hg.videolan.org/x265/rev/15ca744d33c5
branches:
changeset: 10270:15ca744d33c5
user: Rajesh Paulraj <rajesh at multicorewareinc.com>
date: Thu Apr 23 19:07:34 2015 +0530
description:
asm: avx2 10bit code for sub_ps[16x16],[32x32],[64x64]
sub_ps[16x16](13.23x), sub_ps[32x32](16.41x), sub_ps[64x64](16.96x)
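For reference, a scalar picture of what sub_ps computes, following the prototype shown in pixel-util8.asm below; the AVX2 10-bit kernels subtract whole rows of 16-bit pixels with psubw. This is an illustrative sketch, not the x265 C primitive verbatim (that one is a block-size template).

// Scalar reference sketch of pixel_sub_ps for a bx-by-by block.
#include <cstdint>

typedef uint16_t pixel; // 10-bit (HIGH_BIT_DEPTH) build assumed

static void pixel_sub_ps_ref(int16_t* dest, intptr_t destride,
                             const pixel* src0, const pixel* src1,
                             intptr_t srcstride0, intptr_t srcstride1,
                             int bx, int by)
{
    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            dest[x] = (int16_t)(src0[x] - src1[x]); // residual of the two sources

        dest += destride;
        src0 += srcstride0;
        src1 += srcstride1;
    }
}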
Subject: [x265] asm: avx2 10bit code for sub_ps for chroma sizes 16xN, 32xN, reuse luma code
details: http://hg.videolan.org/x265/rev/2a247597171d
branches:
changeset: 10271:2a247597171d
user: Rajesh Paulraj <rajesh at multicorewareinc.com>
date: Thu Apr 23 19:10:41 2015 +0530
description:
asm: avx2 10bit code for sub_ps for chroma sizes 16xN, 32xN, reuse luma code
Subject: [x265] smoke-test: set a higher VBV tolerance for the smoke test
details: http://hg.videolan.org/x265/rev/a35fafa25df2
branches:
changeset: 10272:a35fafa25df2
user: Steve Borho <steve at borho.org>
date: Thu Apr 23 12:32:49 2015 -0500
description:
smoke-test: set a higher VBV tolerance for the smoke test
we don't want smoke tests to be considered "failed" unless the VBV bitrate
changes substantially
diffstat:
source/common/loopfilter.cpp | 3 +-
source/common/primitives.h | 2 +-
source/common/x86/asm-primitives.cpp | 47 +++++++-
source/common/x86/loopfilter.asm | 84 +++++++++++++
source/common/x86/loopfilter.h | 2 +
source/common/x86/pixel-util8.asm | 215 +++++++++++++++++++++++++++++++++++
source/encoder/sao.cpp | 4 +-
source/test/pixelharness.cpp | 60 ++++++---
source/test/pixelharness.h | 2 +-
source/test/smoke-tests.txt | 4 +
10 files changed, 394 insertions(+), 29 deletions(-)
diffs (truncated from 618 to 300 lines):
diff -r cec68d3e37ef -r a35fafa25df2 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/loopfilter.cpp Thu Apr 23 12:32:49 2015 -0500
@@ -144,7 +144,8 @@ void setupLoopFilterPrimitives_c(Encoder
p.saoCuOrgE0 = processSaoCUE0;
p.saoCuOrgE1 = processSaoCUE1;
p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
- p.saoCuOrgE2 = processSaoCUE2;
+ p.saoCuOrgE2[0] = processSaoCUE2;
+ p.saoCuOrgE2[1] = processSaoCUE2;
p.saoCuOrgE3[0] = processSaoCUE3;
p.saoCuOrgE3[1] = processSaoCUE3;
p.saoCuOrgB0 = processSaoCUB0;
diff -r cec68d3e37ef -r a35fafa25df2 source/common/primitives.h
--- a/source/common/primitives.h Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/primitives.h Thu Apr 23 12:32:49 2015 -0500
@@ -276,7 +276,7 @@ struct EncoderPrimitives
sign_t sign;
saoCuOrgE0_t saoCuOrgE0;
saoCuOrgE1_t saoCuOrgE1, saoCuOrgE1_2Rows;
- saoCuOrgE2_t saoCuOrgE2;
+ saoCuOrgE2_t saoCuOrgE2[2];
saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
diff -r cec68d3e37ef -r a35fafa25df2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 23 12:32:49 2015 -0500
@@ -1223,6 +1223,14 @@ void setupAssemblyPrimitives(EncoderPrim
ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
+ p.cu[BLOCK_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2;
+ p.cu[BLOCK_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2;
+ p.cu[BLOCK_64x64].sub_ps = x265_pixel_sub_ps_64x64_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2;
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = x265_pixel_sub_ps_16x32_avx2;
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = x265_pixel_sub_ps_32x64_avx2;
+
p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
p.pu[LUMA_16x12].convert_p2s = x265_filterPixelToShort_16x12_avx2;
@@ -1496,7 +1504,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
- p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
+ p.saoCuOrgE2[0] = x265_saoCuOrgE2_sse4;
+ p.saoCuOrgE2[1] = x265_saoCuOrgE2_sse4;
p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
@@ -1706,6 +1715,8 @@ void setupAssemblyPrimitives(EncoderPrim
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
+ p.saoCuOrgE2[0] = x265_saoCuOrgE2_avx2;
+ p.saoCuOrgE2[1] = x265_saoCuOrgE2_32_avx2;
p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
@@ -2245,6 +2256,40 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = x265_interp_4tap_vert_ss_32x24_avx2;
+ //i422 for chroma_vss
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vss = x265_interp_4tap_vert_ss_4x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = x265_interp_4tap_vert_ss_8x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = x265_interp_4tap_vert_ss_16x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vss = x265_interp_4tap_vert_ss_2x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vss = x265_interp_4tap_vert_ss_8x8_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_vss = x265_interp_4tap_vert_ss_4x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = x265_interp_4tap_vert_ss_16x16_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = x265_interp_4tap_vert_ss_8x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vss = x265_interp_4tap_vert_ss_8x4_avx2;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
+
+ //i444 for chroma_vss
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = x265_interp_4tap_vert_ss_8x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = x265_interp_4tap_vert_ss_16x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = x265_interp_4tap_vert_ss_8x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vss = x265_interp_4tap_vert_ss_4x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = x265_interp_4tap_vert_ss_16x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = x265_interp_4tap_vert_ss_8x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = x265_interp_4tap_vert_ss_16x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = x265_interp_4tap_vert_ss_16x12_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vss = x265_interp_4tap_vert_ss_12x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = x265_interp_4tap_vert_ss_16x4_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_vss = x265_interp_4tap_vert_ss_4x16_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = x265_interp_4tap_vert_ss_32x24_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = x265_interp_4tap_vert_ss_8x32_avx2;
+
p.pu[LUMA_16x16].luma_hvpp = x265_interp_8tap_hv_pp_16x16_avx2;
p.pu[LUMA_32x8].convert_p2s = x265_filterPixelToShort_32x8_avx2;
diff -r cec68d3e37ef -r a35fafa25df2 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/loopfilter.asm Thu Apr 23 12:32:49 2015 -0500
@@ -452,6 +452,90 @@ cglobal saoCuOrgE2, 5, 7, 8, rec, bufft,
jnz .loop
RET
+INIT_YMM avx2
+cglobal saoCuOrgE2, 5, 6, 6, rec, bufft, buff1, offsetEo, lcuWidth
+ mov r5d, r5m
+ pxor xm0, xm0 ; xm0 = 0
+ mova xm5, [pb_128]
+ inc r1
+
+ movu xm1, [r0] ; xm1 = rec[x]
+ movu xm2, [r0 + r5 + 1] ; xm2 = rec[x + stride + 1]
+ pxor xm3, xm1, xm5
+ pxor xm4, xm2, xm5
+ pcmpgtb xm2, xm3, xm4
+ pcmpgtb xm4, xm3
+ pand xm2, [pb_1]
+ por xm2, xm4
+ movu xm3, [r2] ; xm3 = buff1
+
+ paddb xm3, xm2
+ paddb xm3, [pb_2] ; xm3 = edgeType
+
+ movu xm4, [r3] ; xm4 = offsetEo
+ pshufb xm4, xm3
+
+ psubb xm3, xm0, xm2
+ movu [r1], xm3
+
+ pmovzxbw m2, xm1
+ pmovsxbw m3, xm4
+
+ paddw m2, m3
+ vextracti128 xm3, m2, 1
+ packuswb xm2, xm3
+ movu [r0], xm2
+ RET
+
+INIT_YMM avx2
+cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth
+ mov r5d, r5m
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ vbroadcasti128 m7, [pb_128]
+ vbroadcasti128 m5, [r3] ; m5 = offsetEo
+ shr r4d, 5
+ inc r1
+
+.loop:
+ movu m1, [r0] ; m1 = rec[x]
+ movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
+ pxor m3, m1, m7
+ pxor m4, m2, m7
+ pcmpgtb m2, m3, m4
+ pcmpgtb m4, m3
+ pand m2, [pb_1]
+ por m2, m4
+ movu m3, [r2] ; m3 = buff1
+
+ paddb m3, m2
+ paddb m3, m6 ; m3 = edgeType
+
+ pshufb m4, m5, m3
+
+ psubb m3, m0, m2
+ movu [r1], m3
+
+ pmovzxbw m2, xm1
+ vextracti128 xm1, m1, 1
+ pmovzxbw m1, xm1
+ pmovsxbw m3, xm4
+ vextracti128 xm4, m4, 1
+ pmovsxbw m4, xm4
+
+ paddw m2, m3
+ paddw m1, m4
+ packuswb m2, m1
+ vpermq m2, m2, 11011000b
+ movu [r0], m2
+
+ add r0, 32
+ add r1, 32
+ add r2, 32
+ dec r4d
+ jnz .loop
+ RET
+
;=======================================================================================================
;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
;=======================================================================================================
diff -r cec68d3e37ef -r a35fafa25df2 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/loopfilter.h Thu Apr 23 12:32:49 2015 -0500
@@ -32,6 +32,8 @@ void x265_saoCuOrgE1_avx2(pixel* rec, in
void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE2_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE2_32_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
diff -r cec68d3e37ef -r a35fafa25df2 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Apr 22 21:35:55 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Apr 23 12:32:49 2015 -0500
@@ -4560,6 +4560,54 @@ PIXELSUB_PS_W16_H4 16, 32
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%macro PIXELSUB_PS_W16_H4_avx2 1
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_sub_ps_16x%1, 6, 9, 4, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1d, r1d
+ add r4d, r4d
+ add r5d, r5d
+ lea r6, [r1 * 3]
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+
+%rep %1/4
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + r4]
+ movu m3, [r3 + r5]
+
+ psubw m0, m1
+ psubw m2, m3
+
+ movu [r0], m0
+ movu [r0 + r1], m2
+
+ movu m0, [r2 + r4 * 2]
+ movu m1, [r3 + r5 * 2]
+ movu m2, [r2 + r7]
+ movu m3, [r3 + r8]
+
+ psubw m0, m1
+ psubw m2, m3
+
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r6], m2
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+%endrep
+ RET
+%endif
+%endmacro
+PIXELSUB_PS_W16_H4_avx2 16
+PIXELSUB_PS_W16_H4_avx2 32
+%else
+;-----------------------------------------------------------------------------
+; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
+;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H8_avx2 2
%if ARCH_X86_64
INIT_YMM avx2
@@ -4632,6 +4680,7 @@ cglobal pixel_sub_ps_16x%2, 6, 10, 4, de
PIXELSUB_PS_W16_H8_avx2 16, 16
PIXELSUB_PS_W16_H8_avx2 16, 32
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
@@ -4770,6 +4819,74 @@ PIXELSUB_PS_W32_H2 32, 64
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%macro PIXELSUB_PS_W32_H4_avx2 1
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal pixel_sub_ps_32x%1, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1d, r1d
+ add r4d, r4d
+ add r5d, r5d
+ mov r9d, %1/4
+ lea r6, [r1 * 3]
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+
+.loop
+ movu m0, [r2]
+ movu m1, [r2 + 32]
+ movu m2, [r3]
+ movu m3, [r3 + 32]
+ psubw m0, m2
+ psubw m1, m3
+