[x265-commits] [x265] asm: improve sad[32x32] 10% by unroll loop
Min Chen
chenm003 at 163.com
Tue Mar 17 02:01:47 CET 2015
details: http://hg.videolan.org/x265/rev/85943035ce6e
branches:
changeset: 9762:85943035ce6e
user: Min Chen <chenm003 at 163.com>
date: Mon Mar 16 14:00:36 2015 -0700
description:
asm: improve sad[32x32] 10% by unroll loop
Subject: [x265] tweak smoke test coverage
details: http://hg.videolan.org/x265/rev/ba495e2a54e5
branches:
changeset: 9763:ba495e2a54e5
user: Steve Borho <steve at borho.org>
date: Mon Mar 16 19:28:14 2015 -0500
description:
tweak smoke test coverage
Cover all presets except placebo in at least one test, and remove redundant tests.
Only measure slower and veryslow at lower resolutions, to keep the total
runtime under control while at the same time covering many more features.
diffstat:
source/common/x86/sad-a.asm | 27 +++++++++++++++++++--------
source/test/smoke-tests.txt | 35 +++++++++++++++--------------------
2 files changed, 34 insertions(+), 28 deletions(-)
diffs (92 lines):
diff -r 8a37ebcf9232 -r ba495e2a54e5 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Mon Mar 16 17:04:21 2015 -0500
+++ b/source/common/x86/sad-a.asm Mon Mar 16 19:28:14 2015 -0500
@@ -3898,9 +3898,11 @@ cglobal pixel_sad_x4_8x8, 7,7,5
RET
INIT_YMM avx2
-cglobal pixel_sad_32x32, 4,5,5
+cglobal pixel_sad_32x32, 4,7,5
xorps m0, m0
- mov r4d, 16
+ mov r4d, 32/4
+ lea r5, [r1 * 3]
+ lea r6, [r3 * 3]
.loop
movu m1, [r0] ; row 0 of pix0
@@ -3913,11 +3915,21 @@ cglobal pixel_sad_32x32, 4,5,5
paddd m0, m1
paddd m0, m3
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
-
- dec r4d
- jnz .loop
+ movu m1, [r0 + 2 * r1] ; row 2 of pix0
+ movu m2, [r2 + 2 * r3] ; row 2 of pix1
+ movu m3, [r0 + r5] ; row 3 of pix0
+ movu m4, [r2 + r6] ; row 3 of pix1
+
+ psadbw m1, m2
+ psadbw m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ lea r2, [r2 + 4 * r3]
+ lea r0, [r0 + 4 * r1]
+
+ dec r4d
+ jnz .loop
vextracti128 xm1, m0, 1
paddd xm0, xm1
@@ -3926,5 +3938,4 @@ cglobal pixel_sad_32x32, 4,5,5
movd eax, xm0
RET
-
%endif
diff -r 8a37ebcf9232 -r ba495e2a54e5 source/test/smoke-tests.txt
--- a/source/test/smoke-tests.txt Mon Mar 16 17:04:21 2015 -0500
+++ b/source/test/smoke-tests.txt Mon Mar 16 19:28:14 2015 -0500
@@ -1,23 +1,18 @@
# List of command lines to be run by smoke tests, see https://bitbucket.org/sborho/test-harness
big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
-big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000
-big_buck_bunny_360p24.y4m,--preset=slower
-washdc_422_ntsc.y4m,--preset=superfast
-washdc_422_ntsc.y4m,--preset=medium
-washdc_422_ntsc.y4m,--preset=slower
-old_town_cross_444_720p50.y4m,--preset=superfast
-old_town_cross_444_720p50.y4m,--preset=medium
-old_town_cross_444_720p50.y4m,--preset=slower
-crowd_run_1080p50.y4m,--preset=superfast
-crowd_run_1080p50.y4m,--preset=medium
-crowd_run_1080p50.y4m,--preset=slower
-RaceHorses_416x240_30_10bit.yuv,--preset=superfast
-RaceHorses_416x240_30_10bit.yuv,--preset=medium
-RaceHorses_416x240_30_10bit.yuv,--preset=slower
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=superfast
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=slower
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=superfast
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=medium
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=slower
+big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 --cu-lossless --scaling-list default
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme
+washdc_422_ntsc.y4m,--preset=faster --strong-intra-smoothing --keyint 1
+washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
+washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip
+old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp
+old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
+old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode
+RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 --rdoq-level 1
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=slow --keyint -1 --rdoq-level 0
+DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=veryfast --min-cu 16
+DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=fast --weightb --interlace bff
More information about the x265-commits
mailing list