Patchwork [v2,11/17] x86emul: support XOP insns

login
register
mail settings
Submitter Jan Beulich
Date Sept. 14, 2017, 3:18 p.m.
Message ID <59BAB9CF020000780017B3A5@prv-mh.provo.novell.com>
Download mbox | patch
Permalink /patch/338311/
State New
Headers show

Comments

Jan Beulich - Sept. 14, 2017, 3:18 p.m.
Signed-off-by: Jan Beulich <jbeulich@suse.com>

Patch

--- a/.gitignore
+++ b/.gitignore
@@ -230,6 +230,7 @@ 
 tools/tests/x86_emulator/sse*.[ch]
 tools/tests/x86_emulator/test_x86_emulator
 tools/tests/x86_emulator/x86_emulate
+tools/tests/x86_emulator/xop*.[ch]
 tools/tests/xen-access/xen-access
 tools/tests/xenstore/xs-test
 tools/tests/regression/installed/*
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -11,7 +11,7 @@  all: $(TARGET)
 run: $(TARGET)
 	./$(TARGET)
 
-SIMD := sse sse2 sse4 avx avx2
+SIMD := sse sse2 sse4 avx avx2 xop
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -44,6 +44,9 @@  avx2-sg-vecs := $(avx2-vecs)
 avx2-sg-idxs := 4 8
 avx2-sg-ints := 4 8
 avx2-sg-flts := 4 8
+xop-vecs := $(avx-vecs)
+xop-ints := 1 2 4 8
+xop-flts := $(avx-flts)
 
 # For AVX and later, have the compiler avoid XMM0 to widen coverage of
 # the VEX.vvvv checks in the emulator.
@@ -98,6 +101,8 @@  $(addsuffix .c,$(SG)):
 
 $(addsuffix .o,$(SIMD) $(FMA) $(SG)): simd.h
 
+xop.o: simd-fma.c
+
 $(TARGET): x86_emulate.o test_x86_emulator.o
 	$(HOSTCC) -o $@ $^
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -471,6 +471,86 @@  static inline bool _to_bool(byte_vec_t b
 #  endif
 # endif
 #endif
+#ifdef __XOP__
+# undef select
+# if VEC_SIZE == 16
+#  if INT_SIZE == 2 || INT_SIZE == 4
+#   include "simd-fma.c"
+#  endif
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if INT_SIZE == 1 || UINT_SIZE == 1
+#   define swap2(x) ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), (vqi_t)inv - 1))
+#  elif INT_SIZE == 2 || UINT_SIZE == 2
+#   define swap2(x) \
+    ((vec_t)__builtin_ia32_vpperm((vqi_t)(x), (vqi_t)(x), \
+                                  (vqi_t)(__builtin_ia32_vprotwi(2 * (vhi_t)inv - 1, 8) | \
+                                          (2 * inv - 2))))
+#  elif FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2ps((vec_t){}, x, __builtin_ia32_cvtps2dq(inv) + 3, 0) */ \
+    vec_t t_; \
+    asm ( "vpermil2ps $0, %3, %2, %1, %0" : \
+          "=x" (t_) : \
+          "x" ((vec_t){}), "m" (x), "x" (__builtin_ia32_cvtps2dq(inv) + 3) ); \
+    t_; \
+})
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd(x)
+#   undef swap2
+#   define swap2(x) ({ \
+    /* Buggy in gcc 7.1.0 and earlier. */ \
+    /* __builtin_ia32_vpermil2pd((vec_t){}, x, */ \
+    /*                            __builtin_ia32_pmovsxdq128( */ \
+    /*                                __builtin_ia32_cvtpd2dq(inv) + 1) << 1, 0) */ \
+    vdi_t s_ = __builtin_ia32_pmovsxdq128( \
+                   __builtin_ia32_cvtpd2dq(inv) + 1) << 1; \
+    vec_t t_; \
+    asm ( "vpermil2pd $0, %3, %2, %1, %0" : \
+          "=x" (t_) : "x" ((vec_t){}), "x" (x), "m" (s_) ); \
+    t_; \
+})
+#  endif
+#  if INT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphaddbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddbw((vqi_t)(y))))
+#   define hsub(x, y) ((vec_t)__builtin_ia32_packsswb128(__builtin_ia32_vphsubbw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphsubbw((vqi_t)(y))))
+#  elif UINT_SIZE == 1
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packuswb128(__builtin_ia32_vphaddubw((vqi_t)(x)), \
+                                                         __builtin_ia32_vphaddubw((vqi_t)(y))))
+#  elif INT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphaddwd(x), \
+                                                 __builtin_ia32_vphaddwd(y))
+#   undef hsub
+#   define hsub(x, y) __builtin_ia32_packssdw128(__builtin_ia32_vphsubwd(x), \
+                                                 __builtin_ia32_vphsubwd(y))
+#  elif UINT_SIZE == 2
+#   undef hadd
+#   define hadd(x, y) ((vec_t)__builtin_ia32_packusdw128(__builtin_ia32_vphadduwd((vhi_t)(x)), \
+                                                         __builtin_ia32_vphadduwd((vhi_t)(y))))
+#   undef hsub
+#  endif
+# elif VEC_SIZE == 32
+#  define select(d, x, y, m) \
+    (*(d) = (vec_t)__builtin_ia32_vpcmov256((vdi_t)(x), (vdi_t)(y), (vdi_t)(m)))
+#  if FLOAT_SIZE == 4
+#   define frac(x) __builtin_ia32_vfrczps256(x)
+#  elif FLOAT_SIZE == 8
+#   define frac(x) __builtin_ia32_vfrczpd256(x)
+#  endif
+# elif VEC_SIZE == FLOAT_SIZE
+#  if VEC_SIZE == 4
+#   define frac(x) scalar_1op(x, "vfrczss %[in], %[out]")
+#  elif VEC_SIZE == 8
+#   define frac(x) scalar_1op(x, "vfrczsd %[in], %[out]")
+#  endif
+# endif
+#endif
 
 int simd_test(void)
 {
@@ -576,6 +656,29 @@  int simd_test(void)
     if ( !to_bool(y == z) ) return __LINE__;
 # endif
 
+# ifdef frac
+    touch(src);
+    x = frac(src);
+    touch(src);
+    if ( !to_bool(x == 0) ) return __LINE__;
+
+    x = 1 / (src + 1);
+    touch(x);
+    y = frac(x);
+    touch(x);
+    if ( !to_bool(x == y) ) return __LINE__;
+# endif
+
+# if defined(trunc) && defined(frac)
+    x = src / 4;
+    touch(x);
+    y = trunc(x);
+    touch(x);
+    z = frac(x);
+    touch(x);
+    if ( !to_bool(x == y + z) ) return __LINE__;
+# endif
+
 #else
 
 # if ELEM_SIZE > 1
@@ -677,7 +780,7 @@  int simd_test(void)
     y = z << sh;
     if ( !to_bool(x == y + y) ) return __LINE__;
 
-#  if defined(__AVX2__) && ELEM_SIZE >= 4
+#  if (defined(__AVX2__) && ELEM_SIZE >= 4) || defined(__XOP__)
     touch(sh);
     x = y >> sh;
     if ( !to_bool(x == z) ) return __LINE__;
@@ -871,6 +974,8 @@  int simd_test(void)
 #endif
 
 #ifdef hadd
+# if (!defined(INT_SIZE) || INT_SIZE > 1 || ELEM_COUNT < 16) && \
+     (!defined(UINT_SIZE) || UINT_SIZE > 1 || ELEM_COUNT <= 16)
     x = src;
     for ( i = ELEM_COUNT; i >>= 1; )
     {
@@ -878,6 +983,7 @@  int simd_test(void)
         x = hadd((vec_t){}, x);
     }
     if ( x[ELEM_COUNT - 1] != (ELEM_COUNT * (ELEM_COUNT + 1)) / 2 ) return __LINE__;
+# endif
 
 # ifdef hsub
     touch(src);
@@ -889,6 +995,9 @@  int simd_test(void)
 # endif
 #endif
 
+#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+    return -fma_test();
+#endif
 
     return 0;
 }
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,6 +1,8 @@ 
 #include "simd.h"
 
+#ifndef __XOP__
 ENTRY(fma_test);
+#endif
 
 #if VEC_SIZE < 16
 # define to_bool(cmp) (!~(cmp)[0])
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -13,6 +13,7 @@ 
 #include "fma.h"
 #include "avx2.h"
 #include "avx2-sg.h"
+#include "xop.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -63,6 +64,11 @@  static bool simd_check_avx2(void)
 }
 #define simd_check_avx2_sg simd_check_avx2
 
+static bool simd_check_xop(void)
+{
+    return cpu_has_xop;
+}
+
 static void simd_set_regs(struct cpu_user_regs *regs)
 {
     if ( cpu_has_mmx )
@@ -191,6 +197,22 @@  static const struct {
     SIMD(AVX2 S/G i64[4x32],  avx2_sg,    32x4i8),
     SIMD(AVX2 S/G i32[4x64],  avx2_sg,    32x8i4),
     SIMD(AVX2 S/G i64[4x64],  avx2_sg,    32x8i8),
+    SIMD(XOP 128bit single,       xop,      16f4),
+    SIMD(XOP 256bit single,       xop,      32f4),
+    SIMD(XOP 128bit double,       xop,      16f8),
+    SIMD(XOP 256bit double,       xop,      32f8),
+    SIMD(XOP s8x16,               xop,      16i1),
+    SIMD(XOP u8x16,               xop,      16u1),
+    SIMD(XOP s16x8,               xop,      16i2),
+    SIMD(XOP u16x8,               xop,      16u2),
+    SIMD(XOP s32x4,               xop,      16i4),
+    SIMD(XOP u32x4,               xop,      16u4),
+    SIMD(XOP s64x2,               xop,      16i8),
+    SIMD(XOP u64x2,               xop,      16u8),
+    SIMD(XOP i8x32,               xop,      32i1),
+    SIMD(XOP i16x16,              xop,      32i2),
+    SIMD(XOP i32x8,               xop,      32i4),
+    SIMD(XOP i64x4,               xop,      32i8),
 #undef SIMD_
 #undef SIMD
 };
--- a/tools/tests/x86_emulator/x86_emulate.h
+++ b/tools/tests/x86_emulator/x86_emulate.h
@@ -172,6 +172,16 @@  static inline uint64_t xgetbv(uint32_t x
     (res.c & (1U << 6)) != 0; \
 })
 
+#define cpu_has_xop ({ \
+    struct cpuid_leaf res; \
+    emul_test_cpuid(1, 0, &res, NULL); \
+    if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+        res.c = 0; \
+    else \
+        emul_test_cpuid(0x80000001, 0, &res, NULL); \
+    (res.c & (1U << 11)) != 0; \
+})
+
 #define cpu_has_fma4 ({ \
     struct cpuid_leaf res; \
     emul_test_cpuid(1, 0, &res, NULL); \
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -435,6 +435,7 @@  static const struct {
     [0x42] = { .simd_size = simd_packed_int },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
+    [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4a ... 0x4b] = { .simd_size = simd_packed_fp, .four_op = 1 },
     [0x4c] = { .simd_size = simd_packed_int, .four_op = 1 },
     [0x5c ... 0x5f] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -463,6 +464,17 @@  static const struct {
     uint8_t two_op:1;
     uint8_t four_op:1;
 } ext8f08_table[256] = {
+    [0xa2] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x85 ... 0x87] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x8e ... 0x8f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x95 ... 0x97] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0x9e ... 0x9f] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa3] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xa6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xb6] = { .simd_size = simd_packed_int, .four_op = 1 },
+    [0xc0 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcc ... 0xcf] = { .simd_size = simd_packed_int },
+    [0xec ... 0xef] = { .simd_size = simd_packed_int },
 };
 
 static const struct {
@@ -470,6 +482,16 @@  static const struct {
     uint8_t two_op:1;
 } ext8f09_table[256] = {
     [0x01 ... 0x02] = { .two_op = 1 },
+    [0x80 ... 0x81] = { .simd_size = simd_packed_fp, .two_op = 1 },
+    [0x82 ... 0x83] = { .simd_size = simd_scalar_fp, .two_op = 1 },
+    [0x90 ... 0x9b] = { .simd_size = simd_packed_int },
+    [0xc1 ... 0xc3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xc6 ... 0xc7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xcb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd1 ... 0xd3] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xd6 ... 0xd7] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
+    [0xe1 ... 0xe3] = { .simd_size = simd_packed_int, .two_op = 1 },
 };
 
 #define REX_PREFIX 0x40
@@ -528,7 +550,7 @@  union vex {
 #define copy_VEX(ptr, vex) ({ \
     if ( !mode_64bit() ) \
         (vex).reg |= 8; \
-    (ptr)[0 - PFX_BYTES] = 0xc4; \
+    (ptr)[0 - PFX_BYTES] = ext < ext_8f08 ? 0xc4 : 0x8f; \
     (ptr)[1 - PFX_BYTES] = (vex).raw[0]; \
     (ptr)[2 - PFX_BYTES] = (vex).raw[1]; \
     container_of((ptr) + 1 - PFX_BYTES, typeof(vex), raw[0]); \
@@ -1653,6 +1675,7 @@  static bool vcpu_has(
 #define vcpu_has_lzcnt()       vcpu_has(0x80000001, ECX,  5, ctxt, ops)
 #define vcpu_has_sse4a()       vcpu_has(0x80000001, ECX,  6, ctxt, ops)
 #define vcpu_has_misalignsse() vcpu_has(0x80000001, ECX,  7, ctxt, ops)
+#define vcpu_has_xop()         vcpu_has(0x80000001, ECX, 12, ctxt, ops)
 #define vcpu_has_fma4()        vcpu_has(0x80000001, ECX, 16, ctxt, ops)
 #define vcpu_has_tbm()         vcpu_has(0x80000001, ECX, 21, ctxt, ops)
 #define vcpu_has_bmi1()        vcpu_has(         7, EBX,  3, ctxt, ops)
@@ -2993,9 +3016,19 @@  x86_decode(
     case simd_packed_int:
         switch ( vex.pfx )
         {
-        case vex_none: op_bytes = 8;           break;
-        case vex_66:   op_bytes = 16 << vex.l; break;
-        default:       op_bytes = 0;           break;
+        case vex_none:
+            if ( !vex.opcx )
+            {
+                op_bytes = 8;
+                break;
+            }
+            /* fall through */
+        case vex_66:
+            op_bytes = 16 << vex.l;
+            break;
+        default:
+            op_bytes = 0;
+            break;
         }
         break;
 
@@ -8018,6 +8051,13 @@  x86_emulate(
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_imm8_avx;
 
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x48): /* vpermil2ps $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2ps $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+    case X86EMUL_OPC_VEX_66(0x0f3a, 0x49): /* vpermil2pd $imm,{x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                           /* vpermil2pd $imm,{x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_VEX_66(0x0f3a, 0x4c): /* vpblendvb {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
         generate_exception_if(vex.w, EXC_UD);
         goto simd_0f_int_imm8;
@@ -8155,6 +8195,41 @@  x86_emulate(
             asm ( "rorl %b1,%k0" : "=g" (dst.val) : "c" (imm1), "0" (src.val) );
         break;
 
+    case X86EMUL_OPC_XOP(08, 0x85): /* vpmacssww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x86): /* vpmacsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x87): /* vpmacssdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8e): /* vpmacssdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x8f): /* vpmacssdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x95): /* vpmacsww xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x96): /* vpmacswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x97): /* vpmacsdql xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9e): /* vpmacsdd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0x9f): /* vpmacsdqh xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xa6): /* vpmadcsswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xb6): /* vpmadcswd xmm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc0): /* vprotb $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc1): /* vprotw $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc2): /* vprotd $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xc3): /* vprotq $imm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcc): /* vpcomb $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcd): /* vpcomw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xce): /* vpcomd $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xcf): /* vpcomq $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xec): /* vpcomub $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xed): /* vpcomuw $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xee): /* vpcomud $imm,xmm/m128,xmm,xmm */
+    case X86EMUL_OPC_XOP(08, 0xef): /* vpcomuq $imm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa3): /* vpperm xmm/m128,xmm,xmm,xmm */
+                                    /* vpperm xmm,xmm/m128,xmm,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(08, 0xa2): /* vpcmov {x,y}mm/mem,{x,y}mm,{x,y}mm,{x,y}mm */
+                                    /* vpcmov {x,y}mm,{x,y}mm/mem,{x,y}mm,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_imm8_ymm;
+
     case X86EMUL_OPC_XOP(09, 0x01): /* XOP Grp1 */
         switch ( modrm_reg & 7 )
         {
@@ -8204,6 +8279,61 @@  x86_emulate(
         }
         goto cannot_emulate;
 
+    case X86EMUL_OPC_XOP(09, 0x82): /* vfrczss xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x83): /* vfrczsd xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x80): /* vfrczps {x,y}mm/mem,{x,y}mm */
+    case X86EMUL_OPC_XOP(09, 0x81): /* vfrczpd {x,y}mm/mem,{x,y}mm */
+        host_and_vcpu_must_have(xop);
+        generate_exception_if(vex.w, EXC_UD);
+        goto simd_0f_ymm;
+
+    case X86EMUL_OPC_XOP(09, 0xc1): /* vphaddbw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc2): /* vphaddbd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc3): /* vphaddbq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc6): /* vphaddwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xc7): /* vphaddwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xcb): /* vphadddq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd1): /* vphaddubw xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd2): /* vphaddubd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd3): /* vphaddubq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd6): /* vphadduwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xd7): /* vphadduwq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xdb): /* vphaddudq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe2): /* vphsubwd xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe3): /* vphsubdq xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0xe1): /* vphsubbw xmm/m128,xmm */
+        generate_exception_if(vex.w, EXC_UD);
+        /* fall through */
+    case X86EMUL_OPC_XOP(09, 0x90): /* vprotb xmm/m128,xmm,xmm */
+                                    /* vprotb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x91): /* vprotw xmm/m128,xmm,xmm */
+                                    /* vprotw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x92): /* vprotd xmm/m128,xmm,xmm */
+                                    /* vprotd xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x93): /* vprotq xmm/m128,xmm,xmm */
+                                    /* vprotq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x94): /* vpshlb xmm/m128,xmm,xmm */
+                                    /* vpshlb xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x95): /* vpshlw xmm/m128,xmm,xmm */
+                                    /* vpshlw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x96): /* vpshld xmm/m128,xmm,xmm */
+                                    /* vpshld xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x97): /* vpshlq xmm/m128,xmm,xmm */
+                                    /* vpshlq xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x98): /* vpshab xmm/m128,xmm,xmm */
+                                    /* vpshab xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x99): /* vpshaw xmm/m128,xmm,xmm */
+                                    /* vpshaw xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9a): /* vpshad xmm/m128,xmm,xmm */
+                                    /* vpshad xmm,xmm/m128,xmm */
+    case X86EMUL_OPC_XOP(09, 0x9b): /* vpshaq xmm/m128,xmm,xmm */
+                                    /* vpshaq xmm,xmm/m128,xmm */
+        generate_exception_if(vex.l, EXC_UD);
+        host_and_vcpu_must_have(xop);
+        goto simd_0f_ymm;
+
     case X86EMUL_OPC_XOP(0a, 0x10): /* bextr imm,r/m,r */
     {
         uint8_t *buf = get_stub(stub);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -76,6 +76,7 @@ 
 #define cpu_has_cmp_legacy      boot_cpu_has(X86_FEATURE_CMP_LEGACY)
 #define cpu_has_svm             boot_cpu_has(X86_FEATURE_SVM)
 #define cpu_has_sse4a           boot_cpu_has(X86_FEATURE_SSE4A)
+#define cpu_has_xop             boot_cpu_has(X86_FEATURE_XOP)
 #define cpu_has_lwp             boot_cpu_has(X86_FEATURE_LWP)
 #define cpu_has_fma4            boot_cpu_has(X86_FEATURE_FMA4)
 #define cpu_has_tbm             boot_cpu_has(X86_FEATURE_TBM)