[x265] [PATCH] asm: fix sad[32xN] avx2 code for main12
rajesh at multicorewareinc.com
Tue Sep 29 10:37:01 CEST 2015
# HG changeset patch
# User Rajesh Paulraj <rajesh at multicorewareinc.com>
# Date 1443515427 -19800
# Tue Sep 29 14:00:27 2015 +0530
# Node ID d7cde5dbec838f53e87faceac989d6cd987bfc72
# Parent ff279fe6bcccbc5dbe384194a1332b1e96595b61
asm: fix sad[32xN] avx2 code for main12
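
At 12-bit depth the 32-wide AVX2 SAD kernels can no longer keep their
running totals in 16-bit word lanes: a ymm accumulator has 16 word lanes,
each collecting two absolute differences per row, and with |a - b| up to
4095 even the shortest block (32x8) reaches 2 * 8 * 4095 = 65520, past the
signed-word range that HADDW's final pmaddwd assumes. The patch widens the
partial word sums to dwords inside the loop (pmaddwd against pw_1, then
paddd), finishes with HADDD, and moves the X265_DEPTH <= 10 guard below the
32xN entries so these kernels are enabled for main12. A rough sketch of the
overflow arithmetic (illustrative C, not part of the patch):

    #include <stdio.h>

    /* Illustrative only: worst-case per-lane word sums for the 32-wide
     * AVX2 SAD.  A ymm register holds 16 word lanes, so each 32-pixel
     * row contributes 32/16 = 2 absolute differences per lane. */
    int main(void)
    {
        const int heights[] = { 8, 16, 24, 32, 64 };
        const int max_diff = (1 << 12) - 1;   /* 4095 at 12-bit depth */

        for (int i = 0; i < 5; i++) {
            long worst = 2L * max_diff * heights[i];
            printf("sad[32x%-2d]: worst per-lane sum %6ld (signed word max 32767)\n",
                   heights[i], worst);
        }
        return 0;
    }

After the change a word lane holds at most two rows of differences
(4 * 4095 = 16380) before being widened, which stays inside the signed
16-bit range even for main12.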
diff -r ff279fe6bccc -r d7cde5dbec83 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 29 12:07:46 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 29 14:00:27 2015 +0530
@@ -1625,12 +1625,12 @@
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
-#if X265_DEPTH <= 10
p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx2);
p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx2);
p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx2);
+#if X265_DEPTH <= 10
p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx2);
p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx2);
p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
diff -r ff279fe6bccc -r d7cde5dbec83 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Tue Sep 29 12:07:46 2015 +0530
+++ b/source/common/x86/sad16-a.asm Tue Sep 29 14:00:27 2015 +0530
@@ -449,13 +449,14 @@
RET
INIT_YMM avx2
-cglobal pixel_sad_32x8, 4,7,5
+cglobal pixel_sad_32x8, 4,7,7
pxor m0, m0
mov r4d, 8/4
+ mova m6, [pw_1]
add r3d, r3d
add r1d, r1d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
+ lea r5d, [r1 * 3]
+ lea r6d, [r3 * 3]
.loop:
movu m1, [r2]
movu m2, [r2 + 32]
@@ -471,8 +472,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -490,24 +490,28 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m1, m3
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
dec r4d
jg .loop
- HADDW m0, m1
+ HADDD m0, m1
movd eax, xm0
RET
INIT_YMM avx2
-cglobal pixel_sad_32x16, 4,7,5
+cglobal pixel_sad_32x16, 4,7,7
pxor m0, m0
mov r4d, 16/8
+ mova m6, [pw_1]
add r3d, r3d
add r1d, r1d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
+ lea r5d, [r1 * 3]
+ lea r6d, [r3 * 3]
.loop:
movu m1, [r2]
movu m2, [r2 + 32]
@@ -523,8 +527,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -542,8 +545,12 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m1, m3
+
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
movu m1, [r2]
movu m2, [r2 + 32]
@@ -559,8 +566,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -578,24 +584,28 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m1, m3
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
dec r4d
jg .loop
- HADDW m0, m1
+ HADDD m0, m1
movd eax, xm0
RET
INIT_YMM avx2
-cglobal pixel_sad_32x24, 4,7,5
+cglobal pixel_sad_32x24, 4,7,7
pxor m0, m0
mov r4d, 24/4
+ mova m6, [pw_1]
add r3d, r3d
add r1d, r1d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
+ lea r5d, [r1 * 3]
+ lea r6d, [r3 * 3]
.loop:
movu m1, [r2]
movu m2, [r2 + 32]
@@ -611,8 +621,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -628,29 +637,30 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
-
+ paddw m1, m3
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
dec r4d
jg .loop
- HADDUWD m0, m1
HADDD m0, m1
movd eax, xm0
RET
-
INIT_YMM avx2
-cglobal pixel_sad_32x32, 4,7,5
+cglobal pixel_sad_32x32, 4,7,7
pxor m0, m0
mov r4d, 32/4
+ mova m6, [pw_1]
add r3d, r3d
add r1d, r1d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
+ lea r5d, [r1 * 3]
+ lea r6d, [r3 * 3]
.loop:
movu m1, [r2]
movu m2, [r2 + 32]
@@ -666,8 +676,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -683,8 +692,12 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m0, m3
+ paddw m1, m3
+
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
@@ -692,20 +705,19 @@
dec r4d
jg .loop
- HADDUWD m0, m1
HADDD m0, m1
movd eax, xm0
RET
INIT_YMM avx2
-cglobal pixel_sad_32x64, 4,7,6
+cglobal pixel_sad_32x64, 4,7,7
pxor m0, m0
- pxor m5, m5
mov r4d, 64 / 4
+ mova m6, [pw_1]
add r3d, r3d
add r1d, r1d
- lea r5, [r1 * 3]
- lea r6, [r3 * 3]
+ lea r5d, [r1 * 3]
+ lea r6d, [r3 * 3]
.loop:
movu m1, [r2]
movu m2, [r2 + 32]
@@ -721,8 +733,7 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m5, m3
+ paddw m5, m1, m3
movu m1, [r2 + 2 * r3]
movu m2, [r2 + 2 * r3 + 32]
@@ -738,20 +749,20 @@
pabsw m4, m4
paddw m1, m2
paddw m3, m4
- paddw m0, m1
- paddw m5, m3
+ paddw m1, m3
+
+ pmaddwd m5, m6
+ paddd m0, m5
+ pmaddwd m1, m6
+ paddd m0, m1
+
lea r0, [r0 + 4 * r1]
lea r2, [r2 + 4 * r3]
- dec r4d
+ dec r4d
jg .loop
- HADDUWD m0, m1
- HADDUWD m5, m1
HADDD m0, m1
- HADDD m5, m1
- paddd m0, m5
-
movd eax, xm0
RET
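
For readers more at home with intrinsics, a minimal sketch of the widening
step the patch introduces (hypothetical helper, not the x265 kernel):
pmaddwd against a vector of ones sums adjacent signed words into 32-bit
lanes, so the running total moves from paddw to paddd.

    #include <immintrin.h>

    /* Hypothetical helper mirroring the patched inner loop: partial_w
     * holds word-sized absolute-difference sums (at most two rows'
     * worth); acc_d is the dword accumulator that replaces the old
     * word accumulator m0. */
    static inline __m256i sad_widen_accumulate(__m256i acc_d, __m256i partial_w)
    {
        const __m256i pw_1 = _mm256_set1_epi16(1);             /* mova m6, [pw_1] */
        __m256i widened = _mm256_madd_epi16(partial_w, pw_1);  /* pmaddwd m5, m6  */
        return _mm256_add_epi32(acc_d, widened);               /* paddd m0, m5    */
    }

At the end of each kernel, HADDD then reduces the eight dword lanes to the
final SAD in place of HADDW's word reduction.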