[x265-commits] [x265] encoder: nits and alloc zero fix
Steve Borho
steve at borho.org
Fri Nov 21 00:54:22 CET 2014
details: http://hg.videolan.org/x265/rev/80dcd3dfb805
branches:
changeset: 8871:80dcd3dfb805
user: Steve Borho <steve at borho.org>
date: Thu Nov 20 12:49:51 2014 -0600
description:
encoder: nits and alloc zero fix
intraData needs to be zeroed on allocation; otherwise, if one of the later allocs
fails, some of the pointers will still be uninitialized when passed to X265_FREE()
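For context, this is the usual pattern for multi-stage allocations: zero the owning struct first so every member pointer starts as NULL, and the cleanup path on a partial failure stays safe. A minimal C++ sketch of the idea, using x265's X265_MALLOC/X265_FREE macros; the member names below are illustrative, not the real intraData layout:

    #include <cstdint>
    #include <cstring>
    // X265_MALLOC(type, count) / X265_FREE(ptr) are x265's allocation macros
    // (X265_FREE, like free(), is safe to call on a NULL pointer).

    struct IntraDataSketch          // hypothetical members, for illustration only
    {
        uint8_t*  modes;
        uint32_t* depths;
        uint8_t*  chromaModes;
    };

    IntraDataSketch* allocIntraDataSketch(size_t n)
    {
        IntraDataSketch* d = X265_MALLOC(IntraDataSketch, 1);
        if (!d)
            return NULL;
        memset(d, 0, sizeof(*d));   // every pointer member is now NULL

        d->modes       = X265_MALLOC(uint8_t, n);
        d->depths      = X265_MALLOC(uint32_t, n);
        d->chromaModes = X265_MALLOC(uint8_t, n);
        if (!d->modes || !d->depths || !d->chromaModes)
        {
            // Without the memset above, a failure here would pass whatever
            // garbage happened to be in the unassigned members to X265_FREE().
            X265_FREE(d->modes);
            X265_FREE(d->depths);
            X265_FREE(d->chromaModes);
            X265_FREE(d);
            return NULL;
        }
        return d;
    }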
Subject: [x265] asm: luma_vpp[8x8] in avx2: improve 701c->387c
details: http://hg.videolan.org/x265/rev/562c43f738e4
branches:
changeset: 8872:562c43f738e4
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Nov 20 12:23:05 2014 +0530
description:
asm: luma_vpp[8x8] in avx2: improve 701c->387c
Subject: [x265] asm: luma_vpp[8x4] in avx2: improve 498c->257c
details: http://hg.videolan.org/x265/rev/c2fd1b7d5d99
branches:
changeset: 8873:c2fd1b7d5d99
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Nov 20 13:26:42 2014 +0530
description:
asm: luma_vpp[8x4] in avx2: improve 498c->257c
Subject: [x265] asm: luma_vpp[16x16] in avx2: improve 2141c->1284c
details: http://hg.videolan.org/x265/rev/2a2142982602
branches:
changeset: 8874:2a2142982602
user: Divya Manivannan <divya at multicorewareinc.com>
date: Thu Nov 20 13:47:51 2014 +0530
description:
asm: luma_vpp[16x16] in avx2: improve 2141c->1284c
Subject: [x265] Updated intrinsic of idct8 sse3 for new input format
details: http://hg.videolan.org/x265/rev/2abf89f5c4f2
branches:
changeset: 8875:2abf89f5c4f2
user: David T Yuen <dtyx265 at gmail.com>
date: Thu Nov 20 14:31:04 2014 -0800
description:
Updated intrinsic of idct8 sse3 for new input format
diffstat:
source/common/vec/dct-sse3.cpp | 34 +-
source/common/x86/asm-primitives.cpp | 3 +
source/common/x86/ipfilter8.asm | 491 +++++++++++++++++++++++++++++++++++
source/encoder/encoder.cpp | 9 +-
4 files changed, 508 insertions(+), 29 deletions(-)
diffs (truncated from 656 to 300 lines):
diff -r 1d17ec0cb954 -r 2abf89f5c4f2 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/vec/dct-sse3.cpp Thu Nov 20 14:31:04 2014 -0800
@@ -59,23 +59,15 @@ void idct8(const int16_t* src, int16_t*
m128iAdd = _mm_set1_epi32(64);
- T00 = _mm_load_si128((__m128i*)&src[8 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[8 + 4]);
- m128iS1 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[24 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[24 + 4]);
- m128iS3 = _mm_packs_epi32(T00, T01);
+ m128iS1 = _mm_load_si128((__m128i*)&src[8 + 0]);
+ m128iS3 = _mm_load_si128((__m128i*)&src[24 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
E1l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
E1h = _mm_madd_epi16(m128Tmp1, _mm_load_si128((__m128i*)(tab_idct_8x8[0])));
- T00 = _mm_load_si128((__m128i*)&src[40 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[40 + 4]);
- m128iS5 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[56 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[56 + 4]);
- m128iS7 = _mm_packs_epi32(T00, T01);
+ m128iS5 = _mm_load_si128((__m128i*)&src[40 + 0]);
+ m128iS7 = _mm_load_si128((__m128i*)&src[56 + 0]);
m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
E2l = _mm_madd_epi16(m128Tmp2, _mm_load_si128((__m128i*)(tab_idct_8x8[1])));
m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
@@ -107,12 +99,8 @@ void idct8(const int16_t* src, int16_t*
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[0 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[0 + 4]);
- m128iS0 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[32 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[32 + 4]);
- m128iS4 = _mm_packs_epi32(T00, T01);
+ m128iS0 = _mm_load_si128((__m128i*)&src[0 + 0]);
+ m128iS4 = _mm_load_si128((__m128i*)&src[32 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
EE0l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[8])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
@@ -123,12 +111,8 @@ void idct8(const int16_t* src, int16_t*
/* ------- */
- T00 = _mm_load_si128((__m128i*)&src[16 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[16 + 4]);
- m128iS2 = _mm_packs_epi32(T00, T01);
- T00 = _mm_load_si128((__m128i*)&src[48 + 0]);
- T01 = _mm_load_si128((__m128i*)&src[48 + 4]);
- m128iS6 = _mm_packs_epi32(T00, T01);
+ m128iS2 = _mm_load_si128((__m128i*)&src[16 + 0]);
+ m128iS6 = _mm_load_si128((__m128i*)&src[48 + 0]);
m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
E00l = _mm_madd_epi16(m128Tmp0, _mm_load_si128((__m128i*)(tab_idct_8x8[10])));
m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
@@ -1418,7 +1402,7 @@ void Setup_Vec_DCTPrimitives_sse3(Encode
* still somewhat rare on end-user PCs we still compile and link these SSE3
* intrinsic SIMD functions */
#if !HIGH_BIT_DEPTH
-// p.idct[IDCT_8x8] = idct8;
+ p.idct[IDCT_8x8] = idct8;
p.idct[IDCT_16x16] = idct16;
p.idct[IDCT_32x32] = idct32;
#endif
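The change above follows from the idct primitives now exchanging int16_t coefficient buffers: with 16-bit input, each eight-entry row of src is one aligned 128-bit load, so the two 32-bit loads plus the saturating pack are no longer needed (which also appears to be why the SSE3 idct8 entry can be re-enabled). A minimal before/after sketch of just the load step, assuming only the input type changed:

    #include <emmintrin.h>
    #include <stdint.h>

    // Old input format: 32-bit coefficients, two loads and a saturating
    // pack down to 16 bits for every row of eight values.
    static inline __m128i load_row_old(const int32_t* src32, int rowOffset)
    {
        __m128i T00 = _mm_load_si128((const __m128i*)&src32[rowOffset + 0]);
        __m128i T01 = _mm_load_si128((const __m128i*)&src32[rowOffset + 4]);
        return _mm_packs_epi32(T00, T01);
    }

    // New input format: coefficients arrive as int16_t, so the same row is
    // a single aligned 128-bit load.
    static inline __m128i load_row_new(const int16_t* src16, int rowOffset)
    {
        return _mm_load_si128((const __m128i*)&src16[rowOffset + 0]);
    }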
diff -r 1d17ec0cb954 -r 2abf89f5c4f2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 14:31:04 2014 -0800
@@ -1798,9 +1798,12 @@ void Setup_Assembly_Primitives(EncoderPr
p.transpose[BLOCK_16x16] = x265_transpose16_avx2;
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;
p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
+ p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
#endif
p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
+ p.luma_vpp[LUMA_8x4] = x265_interp_8tap_vert_pp_8x4_avx2;
+ p.luma_vpp[LUMA_8x8] = x265_interp_8tap_vert_pp_8x8_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
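For orientation, these assignments plug the new AVX2 routines into x265's primitive dispatch table: Setup_Assembly_Primitives overwrites the C reference entries with the fastest implementation the detected CPU supports, and the rest of the encoder only ever calls through the table. A reduced sketch of the mechanism; the enum and struct here are simplified stand-ins, not the real definitions from primitives.h, and the function signature follows the comment block in ipfilter8.asm:

    #include <cstdint>
    #include <cstddef>

    typedef uint8_t pixel;

    enum { LUMA_4x4, LUMA_8x4, LUMA_8x8, LUMA_16x16, NUM_LUMA_PARTITIONS };

    typedef void (*filter_pp_t)(pixel* src, intptr_t srcStride,
                                pixel* dst, intptr_t dstStride, int coeffIdx);

    struct EncoderPrimitivesSketch
    {
        filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];   // vertical 8-tap, pixel->pixel
    };

    // Assembly entry points wired up by this patch series.
    extern "C" void x265_interp_8tap_vert_pp_8x4_avx2(pixel*, intptr_t, pixel*, intptr_t, int);
    extern "C" void x265_interp_8tap_vert_pp_8x8_avx2(pixel*, intptr_t, pixel*, intptr_t, int);
    extern "C" void x265_interp_8tap_vert_pp_16x16_avx2(pixel*, intptr_t, pixel*, intptr_t, int);

    void setupAsmPrimitivesSketch(EncoderPrimitivesSketch& p, bool hasAVX2)
    {
        // C reference functions are installed first (not shown); assembly
        // versions then overwrite the slots they accelerate.
        if (hasAVX2)
        {
            p.luma_vpp[LUMA_8x4]   = x265_interp_8tap_vert_pp_8x4_avx2;
            p.luma_vpp[LUMA_8x8]   = x265_interp_8tap_vert_pp_8x8_avx2;
            p.luma_vpp[LUMA_16x16] = x265_interp_8tap_vert_pp_16x16_avx2;
        }
    }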
diff -r 1d17ec0cb954 -r 2abf89f5c4f2 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Nov 20 14:31:04 2014 -0800
@@ -122,6 +122,27 @@ tab_LumaCoeffVer: times 8 db 0, 0
times 8 db 58, -10
times 8 db 4, -1
+ALIGN 32
+tab_LumaCoeffVer_32: times 16 db 0, 0
+ times 16 db 0, 64
+ times 16 db 0, 0
+ times 16 db 0, 0
+
+ times 16 db -1, 4
+ times 16 db -10, 58
+ times 16 db 17, -5
+ times 16 db 1, 0
+
+ times 16 db -1, 4
+ times 16 db -11, 40
+ times 16 db 40, -11
+ times 16 db 4, -1
+
+ times 16 db 0, 1
+ times 16 db -5, 17
+ times 16 db 58, -10
+ times 16 db 4, -1
+
tab_c_64_n64: times 8 db 64, -64
const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
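The new tab_LumaCoeffVer_32 table stores each pair of filter taps as interleaved signed bytes, repeated to fill a 32-byte ymm register. That layout is what lets the AVX2 code below use vpmaddubsw: with two source rows interleaved byte-wise and the matching coefficient pair broadcast across the register, one instruction produces sixteen 16-bit partial sums of the form row0*c0 + row1*c1. A minimal intrinsics sketch of that single step (the helper name is illustrative):

    #include <immintrin.h>

    // rowsInterleaved: unsigned pixels of two adjacent rows, interleaved
    //                  r0,r1,r0,r1,... (as built by punpcklbw/vinserti128)
    // coeffPair:       the signed byte pair c0,c1 repeated 16 times,
    //                  i.e. one 32-byte row of tab_LumaCoeffVer_32
    static inline __m256i madd_two_rows(__m256i rowsInterleaved, __m256i coeffPair)
    {
        // vpmaddubsw: unsigned byte * signed byte, adjacent products summed
        // into signed 16-bit lanes => r0*c0 + r1*c1 per output position.
        return _mm256_maddubs_epi16(rowsInterleaved, coeffPair);
    }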
@@ -3532,6 +3553,122 @@ FILTER_VER_LUMA_4xN 4, 8, ps
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 16, ps
+%macro PROCESS_LUMA_AVX2_W8_8R 0
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+ vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ lea r0, [r0 + r1 * 4]
+ movq xm1, [r0] ; m1 = row 4
+ punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+ vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ movq xm4, [r0 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+ vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ pmaddubsw m1, [r5]
+ movq xm3, [r0 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 8
+ punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+ vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m4, [r5 + 1 * mmsize]
+ paddw m1, m3
+ pmaddubsw m4, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+ pmaddubsw m3, m0, [r5 + 2 * mmsize]
+ paddw m1, m3
+ pmaddubsw m0, [r5 + 1 * mmsize]
+ paddw m4, m0
+
+ movq xm3, [r0 + r4] ; m3 = row 11
+ punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 12
+ punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
+ vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
+ pmaddubsw m3, m6, [r5 + 3 * mmsize]
+ paddw m1, m3
+ pmaddubsw m6, [r5 + 2 * mmsize]
+ paddw m4, m6
+ movq xm3, [r0 + r1] ; m3 = row 13
+ punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 14
+ punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
+ pmaddubsw m0, [r5 + 3 * mmsize]
+ paddw m4, m0
+%endmacro
+
+%macro PROCESS_LUMA_AVX2_W8_4R 0
+ movq xm1, [r0] ; m1 = row 0
+ movq xm2, [r0 + r1] ; m2 = row 1
+ punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ movq xm3, [r0 + r1 * 2] ; m3 = row 2
+ punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
+ vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
+ pmaddubsw m5, [r5]
+ movq xm4, [r0 + r4] ; m4 = row 3
+ punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ lea r0, [r0 + r1 * 4]
+ movq xm1, [r0] ; m1 = row 4
+ punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
+ vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
+ pmaddubsw m0, m2, [r5 + 1 * mmsize]
+ paddw m5, m0
+ pmaddubsw m2, [r5]
+ movq xm3, [r0 + r1] ; m3 = row 5
+ punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ movq xm4, [r0 + r1 * 2] ; m4 = row 6
+ punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
+ vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
+ pmaddubsw m3, m1, [r5 + 2 * mmsize]
+ paddw m5, m3
+ pmaddubsw m0, m1, [r5 + 1 * mmsize]
+ paddw m2, m0
+ movq xm3, [r0 + r4] ; m3 = row 7
+ punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ lea r0, [r0 + r1 * 4]
+ movq xm0, [r0] ; m0 = row 8
+ punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
+ vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
+ pmaddubsw m3, m4, [r5 + 3 * mmsize]
+ paddw m5, m3
+ pmaddubsw m3, m4, [r5 + 2 * mmsize]
+ paddw m2, m3
+ movq xm3, [r0 + r1] ; m3 = row 9
+ punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ movq xm6, [r0 + r1 * 2] ; m6 = row 10
+ punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
+ vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
+ pmaddubsw m3, m0, [r5 + 3 * mmsize]
+ paddw m2, m3
+%endmacro
+
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
@@ -3601,6 +3738,69 @@ cglobal interp_8tap_vert_%3_%1x%2, 5, 7,
RET
%endmacro
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ PROCESS_LUMA_AVX2_W8_8R
+ lea r4, [r3 * 3]
+ mova m3, [pw_512]
+ pmulhrsw m5, m3 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m3 ; m2 = word: row 2, row 3
+ pmulhrsw m1, m3 ; m1 = word: row 4, row 5
+ pmulhrsw m4, m3 ; m4 = word: row 6, row 7
+ packuswb m5, m2
+ packuswb m1, m4
+ vextracti128 xm2, m5, 1
+ vextracti128 xm4, m1, 1
+ movq [r2], xm5
+ movq [r2 + r3], xm2
+ movhps [r2 + r3 * 2], xm5
+ movhps [r2 + r4], xm2
+ lea r2, [r2 + r3 * 4]
+ movq [r2], xm1
+ movq [r2 + r3], xm4
+ movhps [r2 + r3 * 2], xm1
+ movhps [r2 + r4], xm4
+ RET
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
+ mov r4d, r4m
+ shl r4d, 7
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer_32]
+ add r5, r4
+%else
+ lea r5, [tab_LumaCoeffVer_32 + r4]
+%endif
+
+ lea r4, [r1 * 3]
+ sub r0, r4
+ PROCESS_LUMA_AVX2_W8_4R
+ lea r4, [r3 * 3]
+ mova m3, [pw_512]
+ pmulhrsw m5, m3 ; m5 = word: row 0, row 1
+ pmulhrsw m2, m3 ; m2 = word: row 2, row 3
+ packuswb m5, m2
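For orientation, the routines added above implement HEVC's 8-tap vertical luma interpolation in its pixel-in/pixel-out ("pp") form: the eight taps span rows -3..+4 around each output sample, and the pmulhrsw with pw_512 at the end performs the (sum + 32) >> 6 rounding back to pixel range. A scalar reference sketch of the same computation, with the filter values copied from tab_LumaCoeffVer_32 above (the function name and loop structure are illustrative, not x265's C primitive verbatim):

    #include <stdint.h>

    typedef uint8_t pixel;

    // 8-tap HEVC luma filters; coeffIdx 0 is the full-pel (identity) filter.
    static const int16_t lumaFilter[4][8] =
    {
        {  0, 0,   0, 64,  0,   0,  0,  0 },
        { -1, 4, -10, 58, 17,  -5,  1,  0 },
        { -1, 4, -11, 40, 40, -11,  4, -1 },
        {  0, 1,  -5, 17, 58, -10,  4, -1 }
    };

    static inline pixel clipPixel(int v)
    {
        return (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void interp_8tap_vert_pp_ref(const pixel* src, intptr_t srcStride,
                                        pixel* dst, intptr_t dstStride,
                                        int width, int height, int coeffIdx)
    {
        const int16_t* c = lumaFilter[coeffIdx];
        src -= 3 * srcStride;                 // taps cover rows -3..+4
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int t = 0; t < 8; t++)
                    sum += c[t] * src[x + t * srcStride];
                dst[x] = clipPixel((sum + 32) >> 6);   // what pmulhrsw/pw_512 does
            }
            src += srcStride;
            dst += dstStride;
        }
    }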