[x265-commits] [x265] asm: improve sad[32x32] by 10% by unrolling the loop

Min Chen chenm003 at 163.com
Tue Mar 17 02:01:47 CET 2015


details:   http://hg.videolan.org/x265/rev/85943035ce6e
branches:  
changeset: 9762:85943035ce6e
user:      Min Chen <chenm003 at 163.com>
date:      Mon Mar 16 14:00:36 2015 -0700
description:
asm: improve sad[32x32] by 10% by unrolling the loop
Subject: [x265] tweak smoke test coverage

details:   http://hg.videolan.org/x265/rev/ba495e2a54e5
branches:  
changeset: 9763:ba495e2a54e5
user:      Steve Borho <steve at borho.org>
date:      Mon Mar 16 19:28:14 2015 -0500
description:
tweak smoke test coverage

Cover all presets except placebo in at least one test, and remove redundant tests.
Only measure slower and veryslow at lower resolutions, to keep the total
runtime under control while at the same time covering many more features.

diffstat:

 source/common/x86/sad-a.asm |  27 +++++++++++++++++++--------
 source/test/smoke-tests.txt |  35 +++++++++++++++--------------------
 2 files changed, 34 insertions(+), 28 deletions(-)

diffs (92 lines):

diff -r 8a37ebcf9232 -r ba495e2a54e5 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Mar 16 17:04:21 2015 -0500
+++ b/source/common/x86/sad-a.asm	Mon Mar 16 19:28:14 2015 -0500
@@ -3898,9 +3898,11 @@ cglobal pixel_sad_x4_8x8, 7,7,5
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_32x32, 4,5,5
+cglobal pixel_sad_32x32, 4,7,5
     xorps           m0, m0
-    mov             r4d, 16
+    mov             r4d, 32/4
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
 
 .loop
     movu           m1, [r0]               ; row 0 of pix0
@@ -3913,11 +3915,21 @@ cglobal pixel_sad_32x32, 4,5,5
     paddd          m0, m1
     paddd          m0, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
-
-    dec         r4d
-    jnz         .loop
+    movu           m1, [r0 + 2 * r1]      ; row 2 of pix0
+    movu           m2, [r2 + 2 * r3]      ; row 2 of pix1
+    movu           m3, [r0 + r5]          ; row 3 of pix0
+    movu           m4, [r2 + r6]          ; row 3 of pix1
+
+    psadbw         m1, m2
+    psadbw         m3, m4
+    paddd          m0, m1
+    paddd          m0, m3
+
+    lea            r2,     [r2 + 4 * r3]
+    lea            r0,     [r0 + 4 * r1]
+
+    dec            r4d
+    jnz           .loop
 
     vextracti128   xm1, m0, 1
     paddd          xm0, xm1
@@ -3926,5 +3938,4 @@ cglobal pixel_sad_32x32, 4,5,5
     movd            eax, xm0
     RET
 
-
 %endif
diff -r 8a37ebcf9232 -r ba495e2a54e5 source/test/smoke-tests.txt
--- a/source/test/smoke-tests.txt	Mon Mar 16 17:04:21 2015 -0500
+++ b/source/test/smoke-tests.txt	Mon Mar 16 19:28:14 2015 -0500
@@ -1,23 +1,18 @@
 # List of command lines to be run by smoke tests, see https://bitbucket.org/sborho/test-harness
 
 big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers
-big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000
-big_buck_bunny_360p24.y4m,--preset=slower
-washdc_422_ntsc.y4m,--preset=superfast
-washdc_422_ntsc.y4m,--preset=medium
-washdc_422_ntsc.y4m,--preset=slower
-old_town_cross_444_720p50.y4m,--preset=superfast
-old_town_cross_444_720p50.y4m,--preset=medium
-old_town_cross_444_720p50.y4m,--preset=slower
-crowd_run_1080p50.y4m,--preset=superfast
-crowd_run_1080p50.y4m,--preset=medium
-crowd_run_1080p50.y4m,--preset=slower
-RaceHorses_416x240_30_10bit.yuv,--preset=superfast
-RaceHorses_416x240_30_10bit.yuv,--preset=medium
-RaceHorses_416x240_30_10bit.yuv,--preset=slower
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=superfast
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium
-CrowdRun_1920x1080_50_10bit_444.yuv,--preset=slower
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=superfast
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=medium
-DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=slower
+big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 --cu-lossless --scaling-list default
+big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme
+washdc_422_ntsc.y4m,--preset=faster --strong-intra-smoothing  --keyint 1
+washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4
+washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip
+old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp
+old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16
+old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode
+RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8
+RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 --rdoq-level 1
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=slow --keyint -1 --rdoq-level 0
+DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=veryfast --min-cu 16
+DucksAndLegs_1920x1080_60_10bit_422.yuv, --preset=fast --weightb --interlace bff


More information about the x265-commits mailing list