[x265] [PATCH] asm: intrapred_angX_4x4 sse2 performance tweaks 10-bit
dtyx265 at gmail.com
dtyx265 at gmail.com
Mon Jun 22 05:54:03 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1434945136 25200
# Node ID 99018e7df815e0c85f8477938fd7cf59d9610317
# Parent b870e819ade1c9f197766318ffa7d96814dbb3cb
asm: intrapred_angX_4x4 sse2 performance tweaks 10-bit
Created individual primitives for angles 19-25 and 27-33 to allow
individual tweaking of each angle, yielding about a 5% performance improvement.
intra_ang_4x4[ 3] 3.90x 487.44 1900.97
intra_ang_4x4[ 4] 4.51x 454.99 2050.33
intra_ang_4x4[ 5] 4.51x 455.00 2049.97
intra_ang_4x4[ 6] 4.82x 425.00 2049.97
intra_ang_4x4[ 7] 4.44x 427.50 1899.97
intra_ang_4x4[ 8] 4.71x 425.00 1999.97
intra_ang_4x4[ 9] 4.71x 425.00 1999.97
intra_ang_4x4[11] 4.76x 410.00 1951.26
intra_ang_4x4[12] 5.00x 410.00 2050.27
intra_ang_4x4[13] 4.48x 482.50 2160.44
intra_ang_4x4[14] 4.70x 462.50 2172.89
intra_ang_4x4[15] 4.57x 460.00 2100.26
intra_ang_4x4[16] 4.83x 455.00 2199.91
intra_ang_4x4[17] 3.96x 562.50 2230.17
intra_ang_4x4[19] 3.67x 475.00 1742.82
intra_ang_4x4[20] 4.32x 397.49 1715.35
intra_ang_4x4[21] 3.88x 402.49 1562.49
intra_ang_4x4[22] 4.08x 410.00 1672.74
intra_ang_4x4[23] 3.91x 415.00 1622.59
intra_ang_4x4[24] 4.09x 370.00 1513.66
intra_ang_4x4[25] 3.79x 372.50 1412.90
intra_ang_4x4[27] 4.00x 365.01 1460.97
intra_ang_4x4[28] 3.85x 380.01 1462.66
intra_ang_4x4[29] 3.73x 365.00 1359.97
intra_ang_4x4[30] 4.11x 367.50 1509.97
intra_ang_4x4[31] 4.00x 377.50 1509.97
intra_ang_4x4[32] 4.00x 377.50 1509.97
intra_ang_4x4[33] 3.44x 395.00 1359.97
diff -r b870e819ade1 -r 99018e7df815 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sun Jun 21 18:33:58 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Sun Jun 21 20:52:16 2015 -0700
@@ -977,21 +977,21 @@
p.cu[BLOCK_4x4].intra_pred[16] = PFX(intra_pred_ang4_16_sse2);
p.cu[BLOCK_4x4].intra_pred[17] = PFX(intra_pred_ang4_17_sse2);
p.cu[BLOCK_4x4].intra_pred[18] = PFX(intra_pred_ang4_18_sse2);
- p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_17_sse2);
- p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_16_sse2);
- p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_15_sse2);
- p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_14_sse2);
- p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_13_sse2);
- p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_12_sse2);
- p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_11_sse2);
+ p.cu[BLOCK_4x4].intra_pred[19] = PFX(intra_pred_ang4_19_sse2);
+ p.cu[BLOCK_4x4].intra_pred[20] = PFX(intra_pred_ang4_20_sse2);
+ p.cu[BLOCK_4x4].intra_pred[21] = PFX(intra_pred_ang4_21_sse2);
+ p.cu[BLOCK_4x4].intra_pred[22] = PFX(intra_pred_ang4_22_sse2);
+ p.cu[BLOCK_4x4].intra_pred[23] = PFX(intra_pred_ang4_23_sse2);
+ p.cu[BLOCK_4x4].intra_pred[24] = PFX(intra_pred_ang4_24_sse2);
+ p.cu[BLOCK_4x4].intra_pred[25] = PFX(intra_pred_ang4_25_sse2);
p.cu[BLOCK_4x4].intra_pred[26] = PFX(intra_pred_ang4_26_sse2);
- p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_9_sse2);
- p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_8_sse2);
- p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_7_sse2);
- p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_6_sse2);
- p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_5_sse2);
- p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_4_sse2);
- p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_3_sse2);
+ p.cu[BLOCK_4x4].intra_pred[27] = PFX(intra_pred_ang4_27_sse2);
+ p.cu[BLOCK_4x4].intra_pred[28] = PFX(intra_pred_ang4_28_sse2);
+ p.cu[BLOCK_4x4].intra_pred[29] = PFX(intra_pred_ang4_29_sse2);
+ p.cu[BLOCK_4x4].intra_pred[30] = PFX(intra_pred_ang4_30_sse2);
+ p.cu[BLOCK_4x4].intra_pred[31] = PFX(intra_pred_ang4_31_sse2);
+ p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
+ p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
diff -r b870e819ade1 -r 99018e7df815 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Sun Jun 21 18:33:58 2015 -0700
+++ b/source/common/x86/intrapred16.asm Sun Jun 21 20:52:16 2015 -0700
@@ -1030,6 +1030,43 @@
%undef INTRA_PRED_PLANAR16_AVX2
RET
+%macro TRANSPOSE_4x4 0
+ punpckhwd m0, m1, m3
+ punpcklwd m1, m3
+ punpckhwd m3, m1, m0
+ punpcklwd m1, m0
+%endmacro
+
+%macro STORE_4x4 0
+ add r1, r1
+ movh [r0], m1
+ movhps [r0 + r1], m1
+ movh [r0 + r1 * 2], m3
+ lea r1, [r1 * 3]
+ movhps [r0 + r1], m3
+%endmacro
+
+%macro CALC_4x4 4
+ mova m0, [pd_16]
+ pmaddwd m1, [ang_table + %1 * 16]
+ paddd m1, m0
+ psrld m1, 5
+
+ pmaddwd m2, [ang_table + %2 * 16]
+ paddd m2, m0
+ psrld m2, 5
+ packssdw m1, m2
+
+ pmaddwd m3, [ang_table + %3 * 16]
+ paddd m3, m0
+ psrld m3, 5
+
+ pmaddwd m4, [ang_table + %4 * 16]
+ paddd m4, m0
+ psrld m4, 5
+ packssdw m3, m4
+%endmacro
+
;-----------------------------------------------------------------------------------------
; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
@@ -1052,216 +1089,140 @@
movh [r0 + r1], m0
RET
-cglobal intra_pred_ang4_3, 3,5,8
- mov r4d, 2
- cmp r3m, byte 33
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
-
+cglobal intra_pred_ang4_3, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
mova m2, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
mova m3, m0
psrldq m0, 2
- punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
+ punpcklwd m3, m0 ;[7 6 6 5 5 4 4 3]
mova m4, m0
psrldq m0, 2
- punpcklwd m4, m0 ; [7 6 6 5 5 4 4 3]
- mova m5, m0
+ punpcklwd m4, m0 ;[8 7 7 6 6 5 5 4]
+
+ CALC_4x4 26, 20, 14, 8
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_33, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
psrldq m0, 2
- punpcklwd m5, m0 ; [8 7 7 6 6 5 5 4]
-
-
- lea r3, [ang_table + 20 * 16]
- mova m0, [r3 + 6 * 16] ; [26]
- mova m1, [r3] ; [20]
- mova m6, [r3 - 6 * 16] ; [14]
- mova m7, [r3 - 12 * 16] ; [ 8]
- jmp .do_filter4x4
-
-
-ALIGN 16
-.do_filter4x4:
- lea r4, [pd_16]
- pmaddwd m2, m0
- paddd m2, [r4]
- psrld m2, 5
-
- pmaddwd m3, m1
- paddd m3, [r4]
- psrld m3, 5
- packssdw m2, m3
-
- pmaddwd m4, m6
- paddd m4, [r4]
- psrld m4, 5
-
- pmaddwd m5, m7
- paddd m5, [r4]
- psrld m5, 5
- packssdw m4, m5
-
- jz .store
-
- ; transpose 4x4
- punpckhwd m0, m2, m4
- punpcklwd m2, m4
- punpckhwd m4, m2, m0
- punpcklwd m2, m0
-
-.store:
- add r1, r1
- movh [r0], m2
- movhps [r0 + r1], m2
- movh [r0 + r1 * 2], m4
- lea r1, [r1 * 3]
- movhps [r0 + r1], m4
- RET
-
-cglobal intra_pred_ang4_4, 3,5,8
- mov r4d, 2
- cmp r3m, byte 32
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
mova m2, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
mova m3, m0
psrldq m0, 2
- punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
- mova m4, m3
- mova m5, m0
+ punpcklwd m3, m0 ;[7 6 6 5 5 4 4 3]
+ mova m4, m0
psrldq m0, 2
- punpcklwd m5, m0 ; [7 6 6 5 5 4 4 3]
-
- lea r3, [ang_table + 18 * 16]
- mova m0, [r3 + 3 * 16] ; [21]
- mova m1, [r3 - 8 * 16] ; [10]
- mova m6, [r3 + 13 * 16] ; [31]
- mova m7, [r3 + 2 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_5, 3,5,8
- mov r4d, 2
- cmp r3m, byte 31
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
+ punpcklwd m4, m0 ;[8 7 7 6 6 5 5 4]
+
+ CALC_4x4 26, 20, 14, 8
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_4, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
mova m2, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
- mova m3, m0
- psrldq m0, 2
- punpcklwd m3, m0 ; [6 5 5 4 4 3 3 2]
- mova m4, m3
- mova m5, m0
- psrldq m0, 2
- punpcklwd m5, m0 ; [7 6 6 5 5 4 4 3]
-
- lea r3, [ang_table + 10 * 16]
- mova m0, [r3 + 7 * 16] ; [17]
- mova m1, [r3 - 8 * 16] ; [ 2]
- mova m6, [r3 + 9 * 16] ; [19]
- mova m7, [r3 - 6 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_6, 3,5,8
- mov r4d, 2
- cmp r3m, byte 30
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
- mova m2, m0
- psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
mova m3, m2
mova m4, m0
psrldq m0, 2
- punpcklwd m4, m0 ; [6 5 5 4 4 3 3 2]
- mova m5, m4
-
- lea r3, [ang_table + 19 * 16]
- mova m0, [r3 - 6 * 16] ; [13]
- mova m1, [r3 + 7 * 16] ; [26]
- mova m6, [r3 - 12 * 16] ; [ 7]
- mova m7, [r3 + 1 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_7, 3,5,8
- mov r4d, 2
- cmp r3m, byte 29
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
- mova m2, m0
+ punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3]
+
+ CALC_4x4 21, 10, 31, 20
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_6, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- mova m5, m0
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m0
psrldq m0, 2
- punpcklwd m5, m0 ; [6 5 5 4 4 3 3 2]
-
- lea r3, [ang_table + 20 * 16]
- mova m0, [r3 - 11 * 16] ; [ 9]
- mova m1, [r3 - 2 * 16] ; [18]
- mova m6, [r3 + 7 * 16] ; [27]
- mova m7, [r3 - 16 * 16] ; [ 4]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_8, 3,5,8
- mov r4d, 2
- cmp r3m, byte 28
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
- mova m2, m0
+ punpcklwd m3, m0 ;[6 5 5 4 4 3 3 2]
+ mova m4, m3
+
+ CALC_4x4 13, 26, 7, 20
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_7, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- lea r3, [ang_table + 13 * 16]
- mova m0, [r3 - 8 * 16] ; [ 5]
- mova m1, [r3 - 3 * 16] ; [10]
- mova m6, [r3 + 2 * 16] ; [15]
- mova m7, [r3 + 7 * 16] ; [20]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_9, 3,5,8
- mov r4d, 2
- cmp r3m, byte 27
- mov r3d, 18
- cmove r3d, r4d
-
- movu m0, [r2 + r3] ; [8 7 6 5 4 3 2 1]
- mova m2, m0
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m0
psrldq m0, 2
- punpcklwd m2, m0 ; [5 4 4 3 3 2 2 1]
- mova m3, m2
- mova m4, m2
- mova m5, m2
-
- lea r3, [ang_table + 4 * 16]
- mova m0, [r3 - 2 * 16] ; [ 2]
- mova m1, [r3 - 0 * 16] ; [ 4]
- mova m6, [r3 + 2 * 16] ; [ 6]
- mova m7, [r3 + 4 * 16] ; [ 8]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+ punpcklwd m4, m0 ;[6 5 5 4 4 3 3 2]
+
+ CALC_4x4 9, 18, 27, 4
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_8, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 5, 10, 15, 20
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_9, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 2, 4, 6, 8
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
cglobal intra_pred_ang4_10, 3,3,3
- movh m0, [r2 + 18] ; [4 3 2 1]
-
- punpcklwd m0, m0 ;[4 4 3 3 2 2 1 1]
+ movh m0, [r2 + 18] ;[4 3 2 1]
+
+ punpcklwd m0, m0 ;[4 4 3 3 2 2 1 1]
pshufd m1, m0, 0xFA
add r1, r1
pshufd m0, m0, 0x50
@@ -1274,7 +1235,7 @@
jz .quit
; filter
- movd m2, [r2] ; [7 6 5 4 3 2 1 0]
+ movd m2, [r2] ;[7 6 5 4 3 2 1 0]
pshuflw m2, m2, 0x00
movh m1, [r2 + 2]
psubw m1, m2
@@ -1287,218 +1248,139 @@
movh [r0], m0
RET
-cglobal intra_pred_ang4_26, 3,3,3
- movh m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- add r1d, r1d
- ; store
- movh [r0], m0
- movh [r0 + r1], m0
- movh [r0 + r1 * 2], m0
- lea r3, [r1 * 3]
- movh [r0 + r3], m0
-
- ; filter
- cmp r4m, byte 0
- jz .quit
-
- pshuflw m0, m0, 0x00
- movd m2, [r2]
- pshuflw m2, m2, 0x00
- movh m1, [r2 + 18]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
-
- movh r2, m0
- mov [r0], r2w
- shr r2, 16
- mov [r0 + r1], r2w
- shr r2, 16
- mov [r0 + r1 * 2], r2w
- shr r2, 16
- mov [r0 + r3], r2w
-.quit:
- RET
-
-cglobal intra_pred_ang4_11, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 25
- mov r3d, 16
- cmove r3d, r4d
-
- movh m1, [r2 + r3 + 2] ; [x x x 4 3 2 1 0]
- movh m2, [r2 - 6]
- punpcklqdq m2, m1
- psrldq m2, 6
- punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
+cglobal intra_pred_ang4_11, 3,3,5
+ movh m0, [r2 + 18] ;[x x x 4 3 2 1 0]
+ movh m1, [r2 - 6]
+ punpcklqdq m1, m0
+ psrldq m1, 6
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 30, 28, 26, 24
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_12, 3,3,5
+ movh m0, [r2 + 18]
+ movh m1, [r2 - 6]
+ punpcklqdq m1, m0
+ psrldq m1, 6
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 27, 22, 17, 12
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_13, 3,3,5
+ movd m4, [r2 + 6]
+ movd m1, [r2 - 2]
+ movh m0, [r2 + 18]
+ punpcklwd m4, m1
+ punpcklqdq m4, m0
+ psrldq m4, 4
+ mova m1, m4
+ psrldq m1, 2
+ punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+
+ CALC_4x4 23, 14, 5, 28
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_14, 3,3,5
+ movd m4, [r2 + 2]
+ movd m1, [r2 - 2]
+ movh m0, [r2 + 18]
+ punpcklwd m4, m1
+ punpcklqdq m4, m0
+ psrldq m4, 4
+ mova m1, m4
+ psrldq m1, 2
+ punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m4
+
+ CALC_4x4 19, 6, 25, 12
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_15, 3,3,5
+ movd m3, [r2] ;[x x x A]
+ movh m4, [r2 + 4] ;[x C x B]
+ movh m0, [r2 + 18] ;[4 3 2 1]
+ pshuflw m4, m4, 0x22 ;[B C B C]
+ punpcklqdq m4, m3 ;[x x x A B C B C]
+ psrldq m4, 2 ;[x x x x A B C B]
+ punpcklqdq m4, m0
+ psrldq m4, 2
+ mova m1, m4
+ mova m2, m4
+ psrldq m1, 4
+ psrldq m2, 2
+ punpcklwd m4, m2 ;[2 1 1 0 0 x x y]
+ punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
mova m3, m2
- mova m4, m2
- mova m5, m2
-
- lea r3, [ang_table + 24 * 16]
- mova m0, [r3 + 6 * 16] ; [24]
- mova m1, [r3 + 4 * 16] ; [26]
- mova m6, [r3 + 2 * 16] ; [28]
- mova m7, [r3 + 0 * 16] ; [30]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_12, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 24
- mov r3d, 16
- cmove r3d, r4d
-
- movh m1, [r2 + r3 + 2]
- movh m2, [r2 - 6]
- punpcklqdq m2, m1
- psrldq m2, 6
- punpcklwd m2, m1 ; [4 3 3 2 2 1 1 0]
+
+ CALC_4x4 15, 30, 13, 28
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_16, 3,3,5
+ movd m3, [r2] ;[x x x A]
+ movd m4, [r2 + 4] ;[x x C B]
+ movh m0, [r2 + 18] ;[4 3 2 1]
+ punpcklwd m4, m3 ;[x C A B]
+ pshuflw m4, m4, 0x4A ;[A B C C]
+ punpcklqdq m4, m0 ;[4 3 2 1 A B C C]
+ psrldq m4, 2
+ mova m1, m4
+ mova m2, m4
+ psrldq m1, 4
+ psrldq m2, 2
+ punpcklwd m4, m2 ;[2 1 1 0 0 x x y]
+ punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
mova m3, m2
- mova m4, m2
- mova m5, m2
-
- lea r3, [ang_table + 20 * 16]
- mova m0, [r3 + 7 * 16] ; [27]
- mova m1, [r3 + 2 * 16] ; [22]
- mova m6, [r3 - 3 * 16] ; [17]
- mova m7, [r3 - 8 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_13, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 23
- mov r3d, 16
- jz .next
- xchg r3d, r4d
-.next:
- movd m5, [r2 + r3 + 6]
- movd m2, [r2 - 2]
- movh m0, [r2 + r4 + 2]
- punpcklwd m5, m2
- punpcklqdq m5, m0
- psrldq m5, 4
- mova m2, m5
- psrldq m2, 2
- punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m2
-
- lea r3, [ang_table + 21 * 16]
- mova m0, [r3 + 2 * 16] ; [23]
- mova m1, [r3 - 7 * 16] ; [14]
- mova m6, [r3 - 16 * 16] ; [ 5]
- mova m7, [r3 + 7 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_14, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 22
- mov r3d, 16
- jz .next
- xchg r3d, r4d
-.next:
- movd m5, [r2 + r3 + 2]
- movd m2, [r2 - 2]
- movh m0, [r2 + r4 + 2]
- punpcklwd m5, m2
- punpcklqdq m5, m0
- psrldq m5, 4
- mova m2, m5
- psrldq m2, 2
- punpcklwd m5, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m3, m2
- mova m4, m5
-
- lea r3, [ang_table + 19 * 16]
- mova m0, [r3 + 0 * 16] ; [19]
- mova m1, [r3 - 13 * 16] ; [ 6]
- mova m6, [r3 + 6 * 16] ; [25]
- mova m7, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_15, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 21
- mov r3d, 16
- jz .next
- xchg r3d, r4d
-.next:
- movd m4, [r2] ;[x x x A]
- movh m5, [r2 + r3 + 4] ;[x C x B]
- movh m0, [r2 + r4 + 2] ;[4 3 2 1]
- pshuflw m5, m5, 0x22 ;[B C B C]
- punpcklqdq m5, m4 ;[x x x A B C B C]
- psrldq m5, 2 ;[x x x x A B C B]
- punpcklqdq m5, m0
- psrldq m5, 2
- mova m2, m5
- mova m3, m5
- psrldq m2, 4
- psrldq m3, 2
- punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
- punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m4, m3
-
- lea r3, [ang_table + 23 * 16]
- mova m0, [r3 - 8 * 16] ; [15]
- mova m1, [r3 + 7 * 16] ; [30]
- mova m6, [r3 - 10 * 16] ; [13]
- mova m7, [r3 + 5 * 16] ; [28]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_16, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 20
- mov r3d, 16
- jz .next
- xchg r3d, r4d
-.next:
- movd m4, [r2] ;[x x x A]
- movd m5, [r2 + r3 + 4] ;[x x C B]
- movh m0, [r2 + r4 + 2] ;[4 3 2 1]
- punpcklwd m5, m4 ;[x C A B]
- pshuflw m5, m5, 0x4A ;[A B C C]
- punpcklqdq m5, m0 ;[4 3 2 1 A B C C]
- psrldq m5, 2
- mova m2, m5
- mova m3, m5
- psrldq m2, 4
- psrldq m3, 2
- punpcklwd m5, m3 ; [2 1 1 0 0 x x y]
- punpcklwd m3, m2 ; [3 2 2 1 1 0 0 x]
- punpcklwd m2, m0 ; [4 3 3 2 2 1 1 0]
- mova m4, m3
-
- lea r3, [ang_table + 19 * 16]
- mova m0, [r3 - 8 * 16] ; [11]
- mova m1, [r3 + 3 * 16] ; [22]
- mova m6, [r3 - 18 * 16] ; [ 1]
- mova m7, [r3 - 7 * 16] ; [12]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
-
-cglobal intra_pred_ang4_17, 3,5,8
- xor r4d, r4d
- cmp r3m, byte 19
- mov r3d, 16
- jz .next
- xchg r3d, r4d
-.next:
- movd m4, [r2]
- movh m5, [r2 + r3 + 2] ;[D x C B]
- pshuflw m5, m5, 0x1F ;[B C D D]
- punpcklqdq m5, m4 ;[x x x A B C D D]
- psrldq m5, 2 ;[x x x x A B C D]
- movhps m5, [r2 + r4 + 2]
-
- mova m4, m5
- psrldq m4, 2
- punpcklwd m5, m4
+
+ CALC_4x4 11, 22, 1, 12
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_17, 3,3,5
+ movd m3, [r2]
+ movh m4, [r2 + 2] ;[D x C B]
+ pshuflw m4, m4, 0x1F ;[B C D D]
+ punpcklqdq m4, m3 ;[x x x A B C D D]
+ psrldq m4, 2 ;[x x x x A B C D]
+ movhps m4, [r2 + 18]
+
mova m3, m4
psrldq m3, 2
punpcklwd m4, m3
@@ -1508,13 +1390,16 @@
mova m1, m2
psrldq m1, 2
punpcklwd m2, m1
-
- lea r3, [ang_table + 14 * 16]
- mova m0, [r3 - 8 * 16] ; [ 6]
- mova m1, [r3 - 2 * 16] ; [12]
- mova m6, [r3 + 4 * 16] ; [18]
- mova m7, [r3 + 10 * 16] ; [24]
- jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
+ mova m0, m1
+ psrldq m0, 2
+ punpcklwd m1, m0
+
+ CALC_4x4 6, 12, 18, 24
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
cglobal intra_pred_ang4_18, 3,3,1
movh m0, [r2 + 16]
@@ -1532,6 +1417,298 @@
movh [r0], m0
RET
+cglobal intra_pred_ang4_19, 3,3,5
+ movd m3, [r2]
+ movh m4, [r2 + 18] ;[D x C B]
+ pshuflw m4, m4, 0x1F ;[B C D D]
+ punpcklqdq m4, m3 ;[x x x A B C D D]
+ psrldq m4, 2 ;[x x x x A B C D]
+ movhps m4, [r2 + 2]
+
+ mova m3, m4
+ psrldq m3, 2
+ punpcklwd m4, m3
+ mova m2, m3
+ psrldq m2, 2
+ punpcklwd m3, m2
+ mova m1, m2
+ psrldq m1, 2
+ punpcklwd m2, m1
+ mova m0, m1
+ psrldq m0, 2
+ punpcklwd m1, m0
+
+ CALC_4x4 6, 12, 18, 24
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_20, 3,3,5
+ movd m3, [r2] ;[x x x A]
+ movd m4, [r2 + 20] ;[x x C B]
+ movh m0, [r2 + 2] ;[4 3 2 1]
+ punpcklwd m4, m3 ;[x C A B]
+ pshuflw m4, m4, 0x4A ;[A B C C]
+ punpcklqdq m4, m0 ;[4 3 2 1 A B C C]
+ psrldq m4, 2
+ mova m1, m4
+ mova m2, m4
+ psrldq m1, 4
+ psrldq m2, 2
+ punpcklwd m4, m2 ;[2 1 1 0 0 x x y]
+ punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m3, m2
+
+ CALC_4x4 11, 22, 1, 12
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_21, 3,3,5
+ movd m3, [r2] ;[x x x A]
+ movh m4, [r2 + 20] ;[x C x B]
+ movh m0, [r2 + 2] ;[4 3 2 1]
+ pshuflw m4, m4, 0x22 ;[B C B C]
+ punpcklqdq m4, m3 ;[x x x A B C B C]
+ psrldq m4, 2 ;[x x x x A B C B]
+ punpcklqdq m4, m0
+ psrldq m4, 2
+ mova m1, m4
+ mova m2, m4
+ psrldq m1, 4
+ psrldq m2, 2
+ punpcklwd m4, m2 ;[2 1 1 0 0 x x y]
+ punpcklwd m2, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m3, m2
+
+ CALC_4x4 15, 30, 13, 28
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_22, 3,3,5
+ movd m4, [r2 + 18]
+ movd m1, [r2 - 2]
+ movh m0, [r2 + 2]
+ punpcklwd m4, m1
+ punpcklqdq m4, m0
+ psrldq m4, 4
+ mova m1, m4
+ psrldq m1, 2
+ punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m4
+
+ CALC_4x4 19, 6, 25, 12
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_23, 3,3,5
+ movd m4, [r2 + 22]
+ movd m1, [r2 - 2]
+ movh m0, [r2 + 2]
+ punpcklwd m4, m1
+ punpcklqdq m4, m0
+ psrldq m4, 4
+ mova m1, m4
+ psrldq m1, 2
+ punpcklwd m4, m1 ;[3 2 2 1 1 0 0 x]
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+
+ CALC_4x4 23, 14, 5, 28
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_24, 3,3,5
+ movh m0, [r2 + 2]
+ movh m1, [r2 - 6]
+ punpcklqdq m1, m0
+ psrldq m1, 6
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 27, 22, 17, 12
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_25, 3,3,5
+ movh m0, [r2 + 2] ;[x x x 4 3 2 1 0]
+ movh m1, [r2 - 6]
+ punpcklqdq m1, m0
+ psrldq m1, 6
+ punpcklwd m1, m0 ;[4 3 3 2 2 1 1 0]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 30, 28, 26, 24
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_26, 3,3,3
+ movh m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ add r1d, r1d
+ ; store
+ movh [r0], m0
+ movh [r0 + r1], m0
+ movh [r0 + r1 * 2], m0
+ lea r3, [r1 * 3]
+ movh [r0 + r3], m0
+
+ ; filter
+ cmp r4m, byte 0
+ jz .quit
+
+ pshuflw m0, m0, 0x00
+ movd m2, [r2]
+ pshuflw m2, m2, 0x00
+ movh m1, [r2 + 18]
+ psubw m1, m2
+ psraw m1, 1
+ paddw m0, m1
+ pxor m1, m1
+ pmaxsw m0, m1
+ pminsw m0, [pw_1023]
+
+ movh r2, m0
+ mov [r0], r2w
+ shr r2, 16
+ mov [r0 + r1], r2w
+ shr r2, 16
+ mov [r0 + r1 * 2], r2w
+ shr r2, 16
+ mov [r0 + r3], r2w
+.quit:
+ RET
+
+cglobal intra_pred_ang4_27, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 2, 4, 6, 8
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_28, 3,3,5
+
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m1
+
+ CALC_4x4 5, 10, 15, 20
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_29, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m1
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ;[6 5 5 4 4 3 3 2]
+
+ CALC_4x4 9, 18, 27, 4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_30, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m1
+ mova m3, m0
+ psrldq m0, 2
+ punpcklwd m3, m0 ;[6 5 5 4 4 3 3 2]
+ mova m4, m3
+
+ CALC_4x4 13, 26, 7, 20
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_5, 3,3,5
+ movu m0, [r2 + 18] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
+ mova m3, m2
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3]
+
+ CALC_4x4 17, 2, 19, 4
+
+ TRANSPOSE_4x4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_31, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
+ mova m3, m2
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3]
+
+ CALC_4x4 17, 2, 19, 4
+
+ STORE_4x4
+ RET
+
+cglobal intra_pred_ang4_32, 3,3,5
+ movu m0, [r2 + 2] ;[8 7 6 5 4 3 2 1]
+ mova m1, m0
+ psrldq m0, 2
+ punpcklwd m1, m0 ;[5 4 4 3 3 2 2 1]
+ mova m2, m0
+ psrldq m0, 2
+ punpcklwd m2, m0 ;[6 5 5 4 4 3 3 2]
+ mova m3, m2
+ mova m4, m0
+ psrldq m0, 2
+ punpcklwd m4, m0 ;[7 6 6 5 5 4 4 3]
+
+ CALC_4x4 21, 10, 31, 20
+
+ STORE_4x4
+ RET
+
;-----------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter)
;-----------------------------------------------------------------------------------
More information about the x265-devel
mailing list