[x265] [PATCH] saoCuOrgB0: asm code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Mon Jan 5 09:12:43 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1420445547 -19800
# Node ID cbdd7a2c3bf6d00a6b5379fedd0fb4de778e0a30
# Parent ede9aa255489fe410aced728880765a7b975b10e
saoCuOrgB0: asm code
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/loopfilter.cpp Mon Jan 05 13:42:27 2015 +0530
@@ -57,10 +57,31 @@
}
}
+// C reference for SAO band offset: adds offset[band] to every pixel of a ctuWidth x ctuHeight block, in place.
+// band = pixel >> (X265_DEPTH - 5); rows are 'stride' pixels apart; results are clamped to [0, (1 << X265_DEPTH) - 1].
+void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
+{
+    const int saoBoBits = 5; // bits selecting the band; a typed local stays out of the preprocessor namespace
+    const int boShift = X265_DEPTH - saoBoBits;
+    const int maxPixel = (1 << X265_DEPTH) - 1;
+
+    for (int y = 0; y < ctuHeight; y++)
+    {
+        for (int x = 0; x < ctuWidth; x++)
+        {
+            // computed as int: the signed offset can push the sum outside the pixel range
+            const int val = rec[x] + offset[rec[x] >> boShift];
+            rec[x] = (pixel)(val < 0 ? 0 : (val > maxPixel ? maxPixel : val));
+        }
+        rec += stride;
+    }
+}
+
namespace x265 {
void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
{
p.saoCuOrgE0 = processSaoCUE0;
+ p.saoCuOrgB0 = processSaoCUB0;
p.sign = calSign;
}
}
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/primitives.h
--- a/source/common/primitives.h Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/primitives.h Mon Jan 05 13:42:27 2015 +0530
@@ -191,6 +191,7 @@
typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); // SAO band offset: adds offsetBo[band] to each pixel of a ctuWidth x ctuHeight block, in place
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -243,6 +244,7 @@
sign_t sign;
saoCuOrgE0_t saoCuOrgE0;
+ saoCuOrgB0_t saoCuOrgB0;
downscale_t frameInitLowres;
cutree_propagate_cost propagateCost;
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 05 13:42:27 2015 +0530
@@ -1648,6 +1648,7 @@
{
p.sign = x265_calSign_sse4;
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
+ p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/loopfilter.asm Mon Jan 05 13:42:27 2015 +0530
@@ -28,7 +28,8 @@
%include "x86inc.asm"
SECTION_RODATA 32
-
+pb_31: times 16 db 31 ; byte mask that keeps a 5-bit band index (0..31) in every lane
+pb_15: times 16 db 15 ; band indices above 15 select the second 16-entry offset table
SECTION .text
cextern pb_1
@@ -84,6 +85,56 @@
jnz .loop
RET
+;=====================================================================================
+; void saoCuOrgB0(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride)
+;=====================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgB0, 5, 7, 8
+    ; r0 = rec, r1 = offsetBo, r2d = ctuWidth, r3d = ctuHeight, r4 = stride.
+    ; All five arguments are loaded by the prologue, so the intptr_t stride keeps its full
+    ; register width. 8-bit pixels only; ctuWidth must be a non-zero multiple of 16.
+
+    shr         r2d, 4            ; r2d = number of 16-pixel groups per row
+    movu        m3, [r1 + 0]      ; offset[0-15]
+    movu        m4, [r1 + 16]     ; offset[16-31]
+    pxor        m7, m7            ; m7 = [0]
+.loopH:
+    mov         r5d, r2d          ; r5d = groups left in this row
+    xor         r6, r6            ; r6 = x position inside the row
+
+.loopW:
+    movu        m2, [r0 + r6]     ; m2 = [rec]
+    psrlw       m1, m2, 3
+    pand        m1, [pb_31]       ; m1 = [index] = rec >> 3
+    pcmpgtb     m0, m1, [pb_15]   ; m0 = [mask], set where index > 15
+
+    pshufb      m6, m3, m1        ; lookup in offset[0-15]
+    pshufb      m5, m4, m1        ; lookup in offset[16-31]
+
+    pblendvb    m6, m6, m5, m0    ; m6 = [offset]; NOTE: don't use the 3 parameters style, the x264 macro has a bug!
+
+    pmovzxbw    m1, m2            ; rec, low 8 pixels as words
+    punpckhbw   m2, m7            ; rec, high 8 pixels as words
+
+    pmovsxbw    m0, m6            ; offset, low 8 sign-extended to words
+    punpckhbw   m6, m6
+    psraw       m6, 8             ; offset, high 8 sign-extended to words
+
+    paddw       m1, m0
+    paddw       m2, m6
+    packuswb    m1, m2            ; saturate to [0, 255] and repack to bytes
+
+    movu        [r0 + r6], m1
+    add         r6d, 16
+    dec         r5d
+    jnz         .loopW
+
+    lea         r0, [r0 + r4]     ; next row
+
+    dec         r3d
+    jnz         .loopH
+    RET
+
;============================================================================================================
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
;============================================================================================================
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/loopfilter.h Mon Jan 05 13:42:27 2015 +0530
@@ -2,6 +2,7 @@
* Copyright (C) 2013 x265 project
*
* Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -25,6 +26,7 @@
#define X265_LOOPFILTER_H
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); // SSE4 saoCuOrgB0; processes 16 pixels per step, so ctuWidth must be a multiple of 16
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
#endif // ifndef X265_LOOPFILTER_H
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Mon Jan 05 10:57:01 2015 +0530
+++ b/source/encoder/sao.cpp Mon Jan 05 13:42:27 2015 +0530
@@ -3,6 +3,7 @@
*
* Authors: Steve Borho <steve at borho.org>
* Min Chen <chenm003 at 163.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -73,7 +74,6 @@
m_param = NULL;
m_clipTable = NULL;
m_clipTableBase = NULL;
- m_offsetBo = NULL;
m_tmpU1[0] = NULL;
m_tmpU1[1] = NULL;
m_tmpU1[2] = NULL;
@@ -107,7 +107,6 @@
int numCtu = m_numCuInWidth * m_numCuInHeight;
CHECKED_MALLOC(m_clipTableBase, pixel, maxY + 2 * rangeExt);
- CHECKED_MALLOC(m_offsetBo, pixel, maxY + 2 * rangeExt);
CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
@@ -145,7 +144,6 @@
void SAO::destroy()
{
X265_FREE(m_clipTableBase);
- X265_FREE(m_offsetBo);
X265_FREE(m_tmpL1);
X265_FREE(m_tmpL2);
@@ -443,16 +441,31 @@
}
case SAO_BO:
{
- const pixel* offsetBo = m_offsetBo;
+ const int8_t* offsetBo = m_offsetBo;
- for (y = 0; y < ctuHeight; y++)
+ if (ctuWidth & 15)
{
- for (x = 0; x < ctuWidth; x++)
- rec[x] = offsetBo[rec[x]];
-
- rec += stride;
+ #define SAO_BO_BITS 5
+ const int boShift = X265_DEPTH - SAO_BO_BITS;
+ int x, y;
+ for (y = 0; y < ctuHeight; y++)
+ {
+ for (x = 0; x < ctuWidth; x++)
+ {
+ int val = rec[x] + offsetBo[rec[x] >> boShift];
+ if (val < 0)
+ val = 0;
+ else if (val > ((1 << X265_DEPTH) - 1))
+ val = ((1 << X265_DEPTH) - 1);
+ rec[x] = (pixel)val;
+ }
+ rec += stride;
+ }
}
-
+ else
+ {
+ primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
+ }
break;
}
default: break;
@@ -495,8 +508,6 @@
memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
- const int boShift = X265_DEPTH - SAO_BO_BITS;
-
for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
{
addr = idxY * m_numCuInWidth + idxX;
@@ -510,15 +521,10 @@
{
if (typeIdx == SAO_BO)
{
- pixel* offsetBo = m_offsetBo;
- int offset[SAO_NUM_BO_CLASSES];
- memset(offset, 0, sizeof(offset));
+ memset(m_offsetBo, 0, sizeof(m_offsetBo));
for (int i = 0; i < SAO_NUM_OFFSET; i++)
- offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
-
- for (int i = 0; i < (1 << X265_DEPTH); i++)
- offsetBo[i] = m_clipTable[i + offset[i >> boShift]];
+ m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
}
else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
{
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/encoder/sao.h
--- a/source/encoder/sao.h Mon Jan 05 10:57:01 2015 +0530
+++ b/source/encoder/sao.h Mon Jan 05 13:42:27 2015 +0530
@@ -3,6 +3,7 @@
*
* Authors: Steve Borho <steve at borho.org>
* Min Chen <chenm003 at 163.com>
+ * Praveen Kumar Tiwari <praveen at multicorewareinc.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -77,7 +78,7 @@
PerPlane* m_offsetOrgPreDblk;
double m_depthSaoRate[2][4];
- pixel* m_offsetBo;
+ int8_t m_offsetBo[SAO_NUM_BO_CLASSES];
int8_t m_offsetEo[NUM_EDGETYPE];
int m_numCuInWidth;
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 05 10:57:01 2015 +0530
+++ b/source/test/pixelharness.cpp Mon Jan 05 13:42:27 2015 +0530
@@ -997,6 +997,36 @@
return true;
}
+// Compares the optimized saoCuOrgB0 with the reference on random pixels: widths 16..64 (step 16), heights 1..64, strides width..64.
+bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    // random pixels so that many different bands are exercised; 0..255 is a valid pixel for 8-bit builds and above
+    for (int k = 0; k < 64 * 64; k++)
+        ref_dest[k] = opt_dest[k] = (pixel)(rand() & 0xFF);
+
+    int j = 0;
+    for (int i = 0; i < ITERS; i++)
+    {
+        int width = 16 * (rand() % 4 + 1);
+        int height = rand() % 64 + 1;
+        // stride in [width, 64]: rows never overlap and the last row still ends inside the 64x64 buffer
+        int stride = width + rand() % (64 - width + 1);
+
+        ref(ref_dest, psbuf1 + j, width, height, stride);
+        checked(opt, opt_dest, psbuf1 + j, width, height, stride);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+        reportfail();
+        j += INCR;
+    }
+    return true;
+}
+
+
bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.satd[part])
@@ -1431,6 +1461,15 @@
}
}
+ if (opt.saoCuOrgB0)
+ {
+ if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
+ {
+ printf("SAO_BO_0 failed\n");
+ return false;
+ }
+ }
+
if (opt.planecopy_sp)
{
if (!check_planecopy_sp(ref.planecopy_sp, opt.planecopy_sp))
@@ -1760,6 +1799,12 @@
REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
}
+ if (opt.saoCuOrgB0)
+ {
+ HEADER0("SAO_BO_0");
+ REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
+ }
+
if (opt.planecopy_sp)
{
HEADER0("planecopy_sp");
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jan 05 10:57:01 2015 +0530
+++ b/source/test/pixelharness.h Mon Jan 05 13:42:27 2015 +0530
@@ -90,6 +90,7 @@
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
bool check_addAvg(addAvg_t, addAvg_t);
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
+ bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
More information about the x265-devel
mailing list