[x265] [PATCH] asm: removed some duplicate constants in intrapred16.asm 16bpp
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed May 20 09:23:30 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1432106399 -19800
# Wed May 20 12:49:59 2015 +0530
# Node ID 55945e7be95e8fa3006afe0c37894a045daccd4a
# Parent 9b31a8a7bd57efededcc3884ee7777c09f649394
asm: removed some duplicate constants in intrapred16.asm 16bpp
Also renamed pw_planar4_1, pw_planar8_1 & pd_planar32_1 to pw_3, pw_7 & pd_31 respectively, and moved them into the common const-a.asm file.
diff -r 9b31a8a7bd57 -r 55945e7be95e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue May 19 19:51:56 2015 -0500
+++ b/source/common/x86/const-a.asm Wed May 20 12:49:59 2015 +0530
@@ -63,6 +63,8 @@
const pw_1, times 16 dw 1
const pw_2, times 16 dw 2
+const pw_3, times 16 dw 3
+const pw_7, times 16 dw 7
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
@@ -112,6 +114,7 @@
const pd_4, times 4 dd 4
const pd_8, times 4 dd 8
const pd_16, times 4 dd 16
+const pd_31, times 4 dd 31
const pd_32, times 8 dd 32
const pd_64, times 4 dd 64
const pd_128, times 4 dd 128
diff -r 9b31a8a7bd57 -r 55945e7be95e source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue May 19 19:51:56 2015 -0500
+++ b/source/common/x86/intrapred16.asm Wed May 20 12:49:59 2015 +0530
@@ -44,7 +44,6 @@
const pw_punpcklwd, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
-const pw_unpackwdq, times 8 db 0,1
const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
@@ -58,16 +57,6 @@
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
-pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3
-pw_planar8_0: dw 7, 6, 5, 4, 3, 2, 1, 0
-pw_planar8_1: dw 7, 7, 7, 7, 7, 7, 7, 7
-pw_planar16_0: dw 15, 14, 13, 12, 11, 10, 9, 8
-pw_planar16_1: dw 15, 15, 15, 15, 15, 15, 15, 15
-pd_planar32_1: dd 31, 31, 31, 31
-
-pw_planar32_1: dw 31, 31, 31, 31, 31, 31, 31, 31
-pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24
-pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16
const planar32_table
%assign x 31
@@ -85,8 +74,11 @@
SECTION .text
+cextern pb_01
cextern pw_1
cextern pw_2
+cextern pw_3
+cextern pw_7
cextern pw_4
cextern pw_8
cextern pw_15
@@ -95,6 +87,7 @@
cextern pw_32
cextern pw_1023
cextern pd_16
+cextern pd_31
cextern pd_32
cextern pw_4096
cextern multiL
@@ -681,7 +674,7 @@
pshufd m4, m4, 0 ; v_bottomLeft
pmullw m3, [multiL] ; (x + 1) * topRight
- pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_8]
paddw m3, m4
paddw m3, m0
@@ -695,7 +688,7 @@
pshufhw m1, m2, 0x55 * (%1 - 4)
pshufd m1, m1, 0xAA
%endif
- pmullw m1, [pw_planar8_0]
+ pmullw m1, [pw_planar16_mul + mmsize]
paddw m1, m3
psraw m1, 4
movu [r0], m1
@@ -733,8 +726,8 @@
pmullw m4, m3, [multiH] ; (x + 1) * topRight
pmullw m3, [multiL] ; (x + 1) * topRight
- pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
- pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
+ pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
paddw m4, [pw_16]
paddw m3, [pw_16]
paddw m4, m6
@@ -770,8 +763,8 @@
paddw m4, m1
lea r0, [r0 + r1 * 2]
%endif
- pmullw m0, m5, [pw_planar8_0]
- pmullw m5, [pw_planar16_0]
+ pmullw m0, m5, [pw_planar16_mul + mmsize]
+ pmullw m5, [pw_planar16_mul]
paddw m0, m4
paddw m5, m3
psraw m5, 5
@@ -827,7 +820,7 @@
mova m9, m6
mova m10, m6
- mova m12, [pw_planar32_1]
+ mova m12, [pw_31]
movu m4, [r2 + 2]
psubw m8, m4
pmullw m4, m12
@@ -848,10 +841,10 @@
pmullw m5, m12
paddw m3, m5
- mova m12, [pw_planar32_L]
- mova m13, [pw_planar32_H]
- mova m14, [pw_planar16_0]
- mova m15, [pw_planar8_0]
+ mova m12, [pw_planar32_mul]
+ mova m13, [pw_planar32_mul + mmsize]
+ mova m14, [pw_planar16_mul]
+ mova m15, [pw_planar16_mul + mmsize]
add r1, r1
%macro PROCESS 1
@@ -1596,7 +1589,7 @@
pshufd m4, m4, 0xAA
pmullw m3, [multi_2Row] ; (x + 1) * topRight
- pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_4]
paddw m3, m4
@@ -1934,7 +1927,7 @@
pshufd m4, m4, 0xAA
pmullw m3, [multi_2Row] ; (x + 1) * topRight
- pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_4]
paddw m3, m4
@@ -1990,12 +1983,12 @@
pshufd m4, m4, 0 ; v_bottomLeft
pmullw m3, [multiL] ; (x + 1) * topRight
- pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) * above[x]
paddw m3, [pw_8]
paddw m3, m4
paddw m3, m0
psubw m4, m1
- mova m0, [pw_planar8_0]
+ mova m0, [pw_planar16_mul + mmsize]
%macro INTRA_PRED_PLANAR8 1
%if (%1 < 4)
@@ -2042,8 +2035,8 @@
pmullw m4, m3, [multiH] ; (x + 1) * topRight
pmullw m3, [multiL] ; (x + 1) * topRight
- pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
- pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x]
+ pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) * above[x]
+ pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) * above[x]
paddw m4, [pw_16]
paddw m3, [pw_16]
paddw m4, m6
@@ -2074,8 +2067,8 @@
%endif
%endif
%endif
- pmullw m0, m5, [pw_planar8_0]
- pmullw m5, [pw_planar16_0]
+ pmullw m0, m5, [pw_planar16_mul + mmsize]
+ pmullw m5, [pw_planar16_mul]
paddw m0, m4
paddw m5, m3
paddw m3, m6
@@ -2192,28 +2185,28 @@
; above[0-3] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 2]
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m0, m5
psubd m5, m6, m4
mova m8, m5
; above[4-7] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 10]
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m1, m5
psubd m5, m6, m4
mova m9, m5
; above[8-11] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 18]
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m2, m5
psubd m5, m6, m4
mova m10, m5
; above[12-15] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 26]
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m3, m5
psubd m5, m6, m4
mova m11, m5
@@ -2221,7 +2214,7 @@
; above[16-19] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 34]
mova m7, m12
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m7, m5
mova m12, m7
psubd m5, m6, m4
@@ -2230,7 +2223,7 @@
; above[20-23] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 42]
mova m7, m13
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m7, m5
mova m13, m7
psubd m5, m6, m4
@@ -2239,7 +2232,7 @@
; above[24-27] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 50]
mova m7, m14
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m7, m5
mova m14, m7
psubd m5, m6, m4
@@ -2248,7 +2241,7 @@
; above[28-31] * (blkSize - 1 - y)
pmovzxwd m4, [r2 + 58]
mova m7, m15
- pmulld m5, m4, [pd_planar32_1]
+ pmulld m5, m4, [pd_31]
paddd m7, m5
mova m15, m7
psubd m5, m6, m4
@@ -3766,33 +3759,33 @@
RET
cglobal intra_pred_ang8_10, 3,6,3
- movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
- pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
+ movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
+ pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
add r1, r1
lea r3, [r1 * 3]
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
+ pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
movu [r0 + r1], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
+ pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
movu [r0 + r1 * 2], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
+ pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
movu [r0 + r3], m2
lea r5, [r0 + r1 *4]
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
+ pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
movu [r5], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
+ pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
movu [r5 + r1], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
+ pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
movu [r5 + r1 * 2], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
+ pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
movu [r5 + r3], m2
cmp r4m, byte 0
@@ -3801,7 +3794,7 @@
; filter
movh m1, [r2] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
psubw m1, m2
psraw m1, 1
@@ -5671,9 +5664,9 @@
jz .quit
; filter
- pshufb m0, [pw_unpackwdq]
+ pshufb m0, [pb_01]
pinsrw m1, [r2], 0 ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
movu m1, [r2 + 2 + 32] ; [8 7 6 5 4 3 2 1]
psubw m1, m2
psraw m1, 1
@@ -10006,73 +9999,73 @@
mov r5d, r4m
movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
- pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
+ pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
add r1, r1
lea r4, [r1 * 3]
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
+ pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
movu [r0 + r1], m2
movu [r0 + r1 + 16], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
+ pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
movu [r0 + r1 * 2], m2
movu [r0 + r1 * 2 + 16], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
+ pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
movu [r0 + r4], m2
movu [r0 + r4 + 16], m2
lea r3, [r0 + r1 *4]
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
+ pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
movu [r3], m2
movu [r3 + 16], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
+ pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
movu [r3 + r1], m2
movu [r3 + r1 + 16], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
+ pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
movu [r3 + r1 * 2], m2
movu [r3 + r1 * 2 + 16], m2
psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
+ pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
movu [r3 + r4], m2
movu [r3 + r4 + 16], m2
lea r3, [r3 + r1 *4]
- pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
+ pshufb m2, m3, [pb_01] ; [9 9 9 9 9 9 9 9]
movu [r3], m2
movu [r3 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
+ pshufb m2, m3, [pb_01] ; [10 10 10 10 10 10 10 10]
movu [r3 + r1], m2
movu [r3 + r1 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
+ pshufb m2, m3, [pb_01] ; [11 11 11 11 11 11 11 11]
movu [r3 + r1 * 2], m2
movu [r3 + r1 * 2 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
+ pshufb m2, m3, [pb_01] ; [12 12 12 12 12 12 12 12]
movu [r3 + r4], m2
movu [r3 + r4 + 16], m2
lea r3, [r3 + r1 *4]
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
+ pshufb m2, m3, [pb_01] ; [13 13 13 13 13 13 13 13]
movu [r3], m2
movu [r3 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
+ pshufb m2, m3, [pb_01] ; [14 14 14 14 14 14 14 14]
movu [r3 + r1], m2
movu [r3 + r1 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
+ pshufb m2, m3, [pb_01] ; [15 15 15 15 15 15 15 15]
movu [r3 + r1 * 2], m2
movu [r3 + r1 * 2 + 16], m2
psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
+ pshufb m2, m3, [pb_01] ; [16 16 16 16 16 16 16 16]
movu [r3 + r4], m2
movu [r3 + r4 + 16], m2
mova m3, m0
@@ -10082,7 +10075,7 @@
; filter
pinsrw m1, [r2], 0 ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
psubw m1, m2
@@ -10152,9 +10145,9 @@
; filter
- pshufb m0, [pw_unpackwdq]
+ pshufb m0, [pb_01]
pinsrw m1, [r2], 0 ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
+ pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3 2 1]
movu m3, [r2 + 18 + 64] ; [16 15 14 13 12 11 10 9]
psubw m1, m2
More information about the x265-devel
mailing list