[x265-commits] [x265] asm: interp_8tap_hv_pp_8x8() for Interpolate_HV_8x8

Tue Oct 29 08:44:04 CET 2013

details:   http://hg.videolan.org/x265/rev/31dfc1580bf2
branches:  
changeset: 4712:31dfc1580bf2
user:      Min Chen <chenm003 at 163.com>
date:      Mon Oct 28 22:22:11 2013 +0800
description:
asm: interp_8tap_hv_pp_8x8() for Interpolate_HV_8x8
Subject: [x265] disable interpolate horizontal merge

details:   http://hg.videolan.org/x265/rev/a36a2e39f983
branches:  
changeset: 4713:a36a2e39f983
user:      Min Chen <chenm003 at 163.com>
date:      Mon Oct 28 22:23:13 2013 +0800
description:
disable interpolate horizontal merge

The asm code needs the width to be a multiple of 4; handling other widths with maskmovq is very expensive.
Subject: [x265] replace pointer to coeff by coeffIdx in ipfilter_sp

details:   http://hg.videolan.org/x265/rev/f0eea23735a6
branches:  
changeset: 4714:f0eea23735a6
user:      Min Chen <chenm003 at 163.com>
date:      Mon Oct 28 22:23:29 2013 +0800
description:
replace pointer to coeff by coeffIdx in ipfilter_sp
Subject: [x265] asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]

details:   http://hg.videolan.org/x265/rev/44c38df44532
branches:  
changeset: 4715:44c38df44532
user:      Min Chen <chenm003 at 163.com>
date:      Tue Oct 29 12:48:02 2013 +0800
description:
asm: interp_8tap_v_sp for ipfilter_sp[FILTER_V_S_P_8]
Subject: [x265] use correct _WIN32 build guard

details:   http://hg.videolan.org/x265/rev/560bd09eb4bb
branches:  
changeset: 4716:560bd09eb4bb
user:      Steve Borho <steve at borho.org>
date:      Tue Oct 29 00:35:48 2013 -0500
description:
use correct _WIN32 build guard
Subject: [x265] vec: fix VC9 build with ASM disabled but intrinsics enabled

details:   http://hg.videolan.org/x265/rev/8846f5cf6d8d
branches:  
changeset: 4717:8846f5cf6d8d
user:      Steve Borho <steve at borho.org>
date:      Tue Oct 29 01:05:47 2013 -0500
description:
vec: fix VC9 build with ASM disabled but intrinsics enabled
Subject: [x265] asm: assembly code for pixel_sad_16x32

details:   http://hg.videolan.org/x265/rev/f44cc9f976cc
branches:  
changeset: 4718:f44cc9f976cc
user:      Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date:      Tue Oct 29 11:27:02 2013 +0530
description:
asm: assembly code for pixel_sad_16x32
Subject: [x265] asm: assembly code for pixel_sad_16x64

details:   http://hg.videolan.org/x265/rev/3c0b386fe799
branches:  
changeset: 4719:3c0b386fe799
user:      Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
date:      Tue Oct 29 12:16:50 2013 +0530
description:
asm: assembly code for pixel_sad_16x64
Subject: [x265] primitives: fix ordering of LUMA_4x16 LUMA_16x4 to match other AMP partitions

details:   http://hg.videolan.org/x265/rev/4db0aec57138
branches:  
changeset: 4720:4db0aec57138
user:      Steve Borho <steve at borho.org>
date:      Tue Oct 29 02:33:03 2013 -0500
description:
primitives: fix ordering of LUMA_4x16 LUMA_16x4 to match other AMP partitions

This also fixes the testbench to properly represent 16x4 and 4x16 partition
primitives
Subject: [x265] Cleanups: Replacing UShort with uint16_t

details:   http://hg.videolan.org/x265/rev/4c618e33c25f
branches:  
changeset: 4721:4c618e33c25f
user:      Murugan Vairavel <murugan at multicorewareinc.com>
date:      Tue Oct 29 11:16:32 2013 +0530
description:
Cleanups: Replacing UShort with uint16_t

diffstat:

 source/Lib/TLibCommon/TComPrediction.cpp |    6 +-
 source/Lib/TLibCommon/TComTrQuant.cpp    |   26 +-
 source/Lib/TLibCommon/TComTrQuant.h      |   10 +-
 source/Lib/TLibCommon/TypeDef.h          |    3 +-
 source/common/ipfilter.cpp               |   18 +-
 source/common/primitives.h               |    6 +-
 source/common/threadpool.cpp             |    6 +-
 source/common/vec/ipfilter-sse41.cpp     |    6 +-
 source/common/vec/vec-primitives.cpp     |   20 ++
 source/common/x86/asm-primitives.cpp     |    6 +
 source/common/x86/ipfilter8.asm          |  279 ++++++++++++++++++++++++++++++-
 source/common/x86/ipfilter8.h            |    3 +
 source/common/x86/sad-a.asm              |   96 ++++++++++
 source/encoder/motion.cpp                |   42 ++-
 source/input/y4m.cpp                     |    4 +-
 source/input/yuv.cpp                     |    4 +-
 source/test/ipfilterharness.cpp          |   74 +++++++-
 source/test/ipfilterharness.h            |    1 +
 18 files changed, 548 insertions(+), 62 deletions(-)

diffs (truncated from 1031 to 300 lines):

diff -r 0666d56aaa42 -r 4c618e33c25f source/Lib/TLibCommon/TComPrediction.cpp

--- a/source/Lib/TLibCommon/TComPrediction.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -499,7 +499,7 @@ void TComPrediction::xPredInterLumaBlk(T
         int filterSize = NTAPS_LUMA;
         int halfFilterSize = (filterSize >> 1);
         primitives.ipfilter_ps[FILTER_H_P_S_8](src - (halfFilterSize - 1) * srcStride,  srcStride, m_immedVals, tmpStride, width, height + filterSize - 1, g_lumaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, g_lumaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_8](m_immedVals + (halfFilterSize - 1) * tmpStride, tmpStride, dst, dstStride, width, height, yFrac);
     }
 }
 
@@ -590,10 +590,10 @@ void TComPrediction::xPredInterChromaBlk
         int halfFilterSize = (filterSize >> 1);
 
         primitives.ipfilter_ps[FILTER_H_P_S_4](refCb - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, cxWidth, cxHeight, yFrac);
 
         primitives.ipfilter_ps[FILTER_H_P_S_4](refCr - (halfFilterSize - 1) * refStride, refStride, m_immedVals, extStride, cxWidth, cxHeight + filterSize - 1, g_chromaFilter[xFrac]);
-        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, g_chromaFilter[yFrac]);
+        primitives.ipfilter_sp[FILTER_V_S_P_4](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, cxWidth, cxHeight, yFrac);
     }
 }
 
diff -r 0666d56aaa42 -r 4c618e33c25f source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -640,7 +640,7 @@ uint32_t TComTrQuant::xRateDistOptQuant(
                 {
                     uint32_t   posY   = blkPos >> log2BlkSize;
                     uint32_t   posX   = blkPos - (posY << log2BlkSize);
-                    UShort ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
+                    uint16_t ctxSig = getSigCtxInc(patternSigCtx, scanIdx, posX, posY, log2BlkSize, ttype);
                     level         = xGetCodedLevel(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos],
                                                    levelDouble, maxAbsLevel, ctxSig, oneCtx, absCtx, goRiceParam,
                                                    c1Idx, c2Idx, qbits, scaleFactor, 0);
@@ -1149,10 +1149,10 @@ inline uint32_t TComTrQuant::xGetCodedLe
                                         double& codedCostSig,
                                         int     levelDouble,
                                         uint32_t    maxAbsLevel,
-                                        UShort  ctxNumSig,
-                                        UShort  ctxNumOne,
-                                        UShort  ctxNumAbs,
-                                        UShort  absGoRice,
+                                        uint16_t  ctxNumSig,
+                                        uint16_t  ctxNumOne,
+                                        uint16_t  ctxNumAbs,
+                                        uint16_t  absGoRice,
                                         uint32_t    c1Idx,
                                         uint32_t    c2Idx,
                                         int     qbits,
@@ -1207,9 +1207,9 @@ inline uint32_t TComTrQuant::xGetCodedLe
  * \returns cost of given absolute transform level
  */
 inline double TComTrQuant::xGetICRateCost(uint32_t   absLevel,
-                                          UShort ctxNumOne,
-                                          UShort ctxNumAbs,
-                                          UShort absGoRice,
+                                          uint16_t ctxNumOne,
+                                          uint16_t ctxNumAbs,
+                                          uint16_t absGoRice,
                                           uint32_t   c1Idx,
                                           uint32_t   c2Idx) const
 {
@@ -1263,9 +1263,9 @@ inline double TComTrQuant::xGetICRateCos
 }
 
 inline int TComTrQuant::xGetICRate(uint32_t   absLevel,
-                                   UShort ctxNumOne,
-                                   UShort ctxNumAbs,
-                                   UShort absGoRice,
+                                   uint16_t ctxNumOne,
+                                   uint16_t ctxNumAbs,
+                                   uint16_t absGoRice,
                                    uint32_t   c1Idx,
                                    uint32_t   c2Idx) const
 {
@@ -1290,8 +1290,8 @@ inline int TComTrQuant::xGetICRate(uint3
             symbol = std::min<uint32_t>(symbol, (maxVlc + 1));
         }
 
-        UShort prefLen = UShort(symbol >> absGoRice) + 1;
-        UShort numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
+        uint16_t prefLen = uint16_t(symbol >> absGoRice) + 1;
+        uint16_t numBins = std::min<uint32_t>(prefLen, g_goRicePrefixLen[absGoRice]) + absGoRice;
 
         rate += numBins << 15;
 
diff -r 0666d56aaa42 -r 4c618e33c25f source/Lib/TLibCommon/TComTrQuant.h
--- a/source/Lib/TLibCommon/TComTrQuant.h	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/Lib/TLibCommon/TComTrQuant.h	Tue Oct 29 11:16:32 2013 +0530
@@ -200,18 +200,18 @@ private:
     uint32_t xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, TCoeff* dstCoeff, uint32_t width, uint32_t height, TextType ttype, uint32_t absPartIdx, int32_t *lastPos);
 
     inline uint32_t xGetCodedLevel(double& codedCost, double& codedCost0, double& codedCostSig, int levelDouble,
-                               uint32_t maxAbsLevel, UShort ctxNumSig, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice,
+                               uint32_t maxAbsLevel, uint16_t ctxNumSig, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice,
                                uint32_t c1Idx, uint32_t c2Idx, int qbits, double scale, bool bLast) const;
 
-    inline double xGetICRateCost(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+    inline double xGetICRateCost(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
 
-    inline int    xGetICRate(uint32_t absLevel, UShort ctxNumOne, UShort ctxNumAbs, UShort absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
+    inline int    xGetICRate(uint32_t absLevel, uint16_t ctxNumOne, uint16_t ctxNumAbs, uint16_t absGoRice, uint32_t c1Idx, uint32_t c2Idx) const;
 
     inline double xGetRateLast(uint32_t posx, uint32_t posy) const;
 
-    inline double xGetRateSigCoeffGroup(UShort sigCoeffGroup, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
+    inline double xGetRateSigCoeffGroup(uint16_t sigCoeffGroup, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantCoeffGroupBits[ctxNumSig][sigCoeffGroup]; }
 
-    inline double xGetRateSigCoef(UShort sig, UShort ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
+    inline double xGetRateSigCoef(uint16_t sig, uint16_t ctxNumSig) const { return m_lambda * m_estBitsSbac->significantBits[ctxNumSig][sig]; }
 
     inline double xGetICost(double rage) const { return m_lambda * rage; } ///< Get the cost for a specific rate
 
diff -r 0666d56aaa42 -r 4c618e33c25f source/Lib/TLibCommon/TypeDef.h
--- a/source/Lib/TLibCommon/TypeDef.h	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/Lib/TLibCommon/TypeDef.h	Tue Oct 29 11:16:32 2013 +0530
@@ -52,7 +52,6 @@ namespace x265 {
 // ====================================================================================================================
 
 typedef unsigned char  UChar;
-typedef unsigned short UShort;
 
 // ====================================================================================================================
 // 64-bit integer type
@@ -71,7 +70,7 @@ typedef unsigned long long  UInt64;
 // ====================================================================================================================
 
 #if HIGH_BIT_DEPTH
-typedef UShort Pel;            // 16-bit pixel type
+typedef uint16_t Pel;            // 16-bit pixel type
 #define X265_DEPTH x265::g_bitDepth  // runtime configurable bit depth
 extern int g_bitDepth;
 #else
diff -r 0666d56aaa42 -r 4c618e33c25f source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/ipfilter.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -37,12 +37,14 @@ using namespace x265;
 
 namespace {
 template<int N>
-void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp_c(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
 {
     int headRoom = IF_INTERNAL_PREC - X265_DEPTH;
     int shift = IF_FILTER_PREC + headRoom;
     int offset = (1 << (shift - 1)) + (IF_INTERNAL_OFFS << IF_FILTER_PREC);
     int16_t maxVal = (1 << X265_DEPTH) - 1;
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
     src -= (N / 2 - 1) * srcStride;
 
     int row, col;
@@ -401,6 +403,17 @@ void interp_vert_pp_c(pixel *src, intptr
         dst += dstStride;
     }
 }
+typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, short *dst, intptr_t dstStride, int width, int height, const short *coeff);
+typedef void (*ipfilter_sp_t)(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const short *coeff);
+
+template<int N, int width, int height>
+void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+{
+    short m_immedVals[(64 + 8) * (64 + 8)];
+    filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]);
+    filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, idxY);
+}
+
 }
 
 namespace x265 {
@@ -412,7 +425,8 @@ namespace x265 {
 
 #define LUMA(W, H) \
     p.luma_hpp[LUMA_ ## W ## x ## H]     = interp_horiz_pp_c<8, W, H>;\
-    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>
+    p.luma_vpp[LUMA_ ## W ## x ## H]     = interp_vert_pp_c<8, W, H>; \
+    p.luma_hvpp[LUMA_ ## W ## x ## H]    = interp_hv_pp_c<8, W, H>;
 
 void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
 {
diff -r 0666d56aaa42 -r 4c618e33c25f source/common/primitives.h
--- a/source/common/primitives.h	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/primitives.h	Tue Oct 29 11:16:32 2013 +0530
@@ -66,7 +66,7 @@ enum LumaPartitions
 { // Square     Rectangular             Asymmetrical (0.75, 0.25)
     LUMA_4x4,
     LUMA_8x8,   LUMA_8x4,   LUMA_4x8,
-    LUMA_16x16, LUMA_16x8,  LUMA_8x16,  LUMA_16x12, LUMA_12x16, LUMA_4x16,  LUMA_16x4,
+    LUMA_16x16, LUMA_16x8,  LUMA_8x16,  LUMA_16x12, LUMA_12x16, LUMA_16x4,  LUMA_4x16,
     LUMA_32x32, LUMA_32x16, LUMA_16x32, LUMA_32x24, LUMA_24x32, LUMA_32x8,  LUMA_8x32,
     LUMA_64x64, LUMA_64x32, LUMA_32x64, LUMA_64x48, LUMA_48x64, LUMA_64x16, LUMA_16x64,
     NUM_LUMA_PARTITIONS
@@ -165,7 +165,7 @@ typedef void (*pixelcmp_x4_t)(pixel *fen
 typedef void (*pixelcmp_x3_t)(pixel *fenc, pixel *fref0, pixel *fref1, pixel *fref2, intptr_t frefstride, int32_t *res);
 typedef void (*ipfilter_pp_t)(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
-typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
+typedef void (*ipfilter_sp_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const int coeffIdx);
 typedef void (*ipfilter_ss_t)(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height, const int16_t *coeff);
 typedef void (*ipfilter_p2s_t)(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int width, int height);
 typedef void (*ipfilter_s2p_t)(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height);
@@ -209,6 +209,7 @@ typedef uint64_t (*var_t)(pixel *pix, in
 typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src,  intptr_t srcStride, int w, int h);
 
 typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
 
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
@@ -245,6 +246,7 @@ struct EncoderPrimitives
     filter_pp_t     luma_hpp[NUM_LUMA_PARTITIONS];
     filter_pp_t     chroma_vpp[NUM_CHROMA_PARTITIONS];
     filter_pp_t     luma_vpp[NUM_LUMA_PARTITIONS];
+    filter_hv_pp_t  luma_hvpp[NUM_LUMA_PARTITIONS];
 
     intra_dc_t      intra_pred_dc;
     intra_planar_t  intra_pred_planar;
diff -r 0666d56aaa42 -r 4c618e33c25f source/common/threadpool.cpp
--- a/source/common/threadpool.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/threadpool.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -369,7 +369,7 @@ void JobProvider::dequeue()
 
 static int get_cpu_count()
 {
-#if WIN32
+#if _WIN32
     SYSTEM_INFO sysinfo;
     GetSystemInfo(&sysinfo);
     return sysinfo.dwNumberOfProcessors;
@@ -393,8 +393,8 @@ static int get_cpu_count()
     }
 
     return count;
-#else // if WIN32
+#else // if _WIN32
     return 2; // default to 2 threads, everywhere else
-#endif // if WIN32
+#endif // if _WIN32
 }
 } // end namespace x265
diff -r 0666d56aaa42 -r 4c618e33c25f source/common/vec/ipfilter-sse41.cpp
--- a/source/common/vec/ipfilter-sse41.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/vec/ipfilter-sse41.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -34,6 +34,8 @@
 #include <assert.h>
 #include <string.h>
 
+using namespace x265;
+
 #if !HIGH_BIT_DEPTH
 namespace {
 ALIGN_VAR_32(const uint16_t, c_512[16]) =
@@ -42,8 +44,10 @@ ALIGN_VAR_32(const uint16_t, c_512[16]) 
 };
 
 template<int N>
-void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int16_t const *coeff)
+void filterVertical_sp(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int coeffIdx)
 {
+    const int16_t *coeff = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
+
     src -= (N / 2 - 1) * srcStride;
 
     int offset;
diff -r 0666d56aaa42 -r 4c618e33c25f source/common/vec/vec-primitives.cpp
--- a/source/common/vec/vec-primitives.cpp	Mon Oct 28 16:13:05 2013 +0530
+++ b/source/common/vec/vec-primitives.cpp	Tue Oct 29 11:16:32 2013 +0530
@@ -43,10 +43,30 @@ void x265_cpu_cpuid(uint32_t op, uint32_
     *edx = output[3];
 }
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4100)
+#endif
 void x265_cpu_xgetbv(uint32_t op, uint32_t *eax, uint32_t *edx)
 {
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1200)
+
+    // MSVC 2010 SP1 or later, or similar Intel release
     uint64_t out = _xgetbv(op);
 
+#elif defined(__GNUC__)    // use inline assembly, Gnu/AT&T syntax
+
+    uint32_t a, d;
+    __asm("xgetbv" : "=a" (a), "=d" (d) : "c" (ctr) :);
+    *eax = a;
+    *edx = d;
+    return;
+
+#elif defined(_WIN64)      // On x64 with older compilers, this is impossible
+
+    uint64_t out = 0;
+
+#endif
+
     *eax = (uint32_t)out;
     *edx = (uint32_t)(out >> 32);