KVM: emulate: avoid per-byte copying in instruction fetches
[pandora-kernel.git] / arch / x86 / kvm / emulate.c
index e4e833d..02c668a 100644 (file)
 #define NoWrite     ((u64)1 << 45)  /* No writeback */
 #define SrcWrite    ((u64)1 << 46)  /* Write back src operand */
 #define NoMod      ((u64)1 << 47)  /* Mod field is ignored */
+#define Intercept   ((u64)1 << 48)  /* Has valid intercept field */
+#define CheckPerm   ((u64)1 << 49)  /* Has valid check_perm field */
 
 #define DstXacc     (DstAccLo | SrcAccHi | SrcWrite)
 
@@ -426,6 +428,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
                .modrm_reg  = ctxt->modrm_reg,
                .modrm_rm   = ctxt->modrm_rm,
                .src_val    = ctxt->src.val64,
+               .dst_val    = ctxt->dst.val64,
                .src_bytes  = ctxt->src.bytes,
                .dst_bytes  = ctxt->dst.bytes,
                .ad_bytes   = ctxt->ad_bytes,
@@ -511,12 +514,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
        return desc->g ? (limit << 12) | 0xfff : limit;
 }
 
-static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
-{
-       ctxt->has_seg_override = true;
-       ctxt->seg_override = seg;
-}
-
 static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
 {
        if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
@@ -525,14 +522,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
        return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
-{
-       if (!ctxt->has_seg_override)
-               return 0;
-
-       return ctxt->seg_override;
-}
-
 static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
                             u32 error, bool valid)
 {
@@ -716,68 +705,74 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 }
 
 /*
- * Fetch the next byte of the instruction being emulated which is pointed to
- * by ctxt->_eip, then increment ctxt->_eip.
- *
- * Also prefetch the remaining bytes of the instruction without crossing page
+ * Prefetch the remaining bytes of the instruction without crossing page
  * boundary if they are not in fetch_cache yet.
  */
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
+static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
 {
        struct fetch_cache *fc = &ctxt->fetch;
        int rc;
        int size, cur_size;
+       unsigned long linear;
 
-       if (ctxt->_eip == fc->end) {
-               unsigned long linear;
-               struct segmented_address addr = { .seg = VCPU_SREG_CS,
-                                                 .ea  = ctxt->_eip };
-               cur_size = fc->end - fc->start;
-               size = min(15UL - cur_size,
-                          PAGE_SIZE - offset_in_page(ctxt->_eip));
-               rc = __linearize(ctxt, addr, size, false, true, &linear);
-               if (unlikely(rc != X86EMUL_CONTINUE))
-                       return rc;
-               rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
-                                     size, &ctxt->exception);
-               if (unlikely(rc != X86EMUL_CONTINUE))
-                       return rc;
-               fc->end += size;
-       }
-       *dest = fc->data[ctxt->_eip - fc->start];
-       ctxt->_eip++;
+       struct segmented_address addr = { .seg = VCPU_SREG_CS,
+                                         .ea  = fc->end };
+       cur_size = fc->end - fc->start;
+       size = min(15UL - cur_size,
+                  PAGE_SIZE - offset_in_page(fc->end));
+
+       /*
+        * One instruction can only straddle two pages,
+        * and one has been loaded at the beginning of
+        * x86_decode_insn.  So, if not enough bytes
+        * still, we must have hit the 15-byte boundary.
+        */
+       if (unlikely(size < op_size))
+               return X86EMUL_UNHANDLEABLE;
+       rc = __linearize(ctxt, addr, size, false, true, &linear);
+       if (unlikely(rc != X86EMUL_CONTINUE))
+               return rc;
+       rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+                             size, &ctxt->exception);
+       if (unlikely(rc != X86EMUL_CONTINUE))
+               return rc;
+       fc->end += size;
        return X86EMUL_CONTINUE;
 }
 
-static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-                        void *dest, unsigned size)
+static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
+                                              unsigned size)
 {
-       int rc;
-
-       /* x86 instructions are limited to 15 bytes. */
-       if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
-               return X86EMUL_UNHANDLEABLE;
-       while (size--) {
-               rc = do_insn_fetch_byte(ctxt, dest++);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
-       }
-       return X86EMUL_CONTINUE;
+       /* We have to be careful about overflow! */
+       if (unlikely(ctxt->_eip > ctxt->fetch.end - size))
+               return __do_insn_fetch_bytes(ctxt, size);
+       else
+               return X86EMUL_CONTINUE;
 }
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _ctxt)                                       \
-({     unsigned long _x;                                               \
-       rc = do_insn_fetch(_ctxt, &_x, sizeof(_type));                  \
+({     _type _x;                                                       \
+       struct fetch_cache *_fc;                                        \
+                                                                       \
+       rc = do_insn_fetch_bytes(_ctxt, sizeof(_type));                 \
        if (rc != X86EMUL_CONTINUE)                                     \
                goto done;                                              \
-       (_type)_x;                                                      \
+       _fc = &ctxt->fetch;                                             \
+       _x = *(_type __aligned(1) *) &_fc->data[ctxt->_eip - _fc->start]; \
+       ctxt->_eip += sizeof(_type);                                    \
+       _x;                                                             \
 })
 
 #define insn_fetch_arr(_arr, _size, _ctxt)                             \
-({     rc = do_insn_fetch(_ctxt, _arr, (_size));                       \
+({                                                                     \
+       struct fetch_cache *_fc;                                        \
+       rc = do_insn_fetch_bytes(_ctxt, _size);                         \
        if (rc != X86EMUL_CONTINUE)                                     \
                goto done;                                              \
+       _fc = &ctxt->fetch;                                             \
+       memcpy(_arr, &_fc->data[ctxt->_eip - _fc->start], _size);       \
+       ctxt->_eip += (_size);                                          \
 })
 
 /*
@@ -1063,19 +1058,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                        struct operand *op)
 {
        u8 sib;
-       int index_reg = 0, base_reg = 0, scale;
+       int index_reg, base_reg, scale;
        int rc = X86EMUL_CONTINUE;
        ulong modrm_ea = 0;
 
-       if (ctxt->rex_prefix) {
-               ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1;  /* REX.R */
-               index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
-               ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
-       }
+       ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */
+       index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */
+       base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */
 
-       ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+       ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6;
        ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
-       ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+       ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07);
        ctxt->modrm_seg = VCPU_SREG_DS;
 
        if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) {
@@ -1093,7 +1086,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                if (ctxt->d & Mmx) {
                        op->type = OP_MM;
                        op->bytes = 8;
-                       op->addr.xmm = ctxt->modrm_rm & 7;
+                       op->addr.mm = ctxt->modrm_rm & 7;
                        return rc;
                }
                fetch_register_operand(op);
@@ -1190,6 +1183,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
                }
        }
        op->addr.mem.ea = modrm_ea;
+       if (ctxt->ad_bytes != 8)
+               ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+
 done:
        return rc;
 }
@@ -1220,12 +1216,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
        long sv = 0, mask;
 
        if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
-               mask = ~(ctxt->dst.bytes * 8 - 1);
+               mask = ~((long)ctxt->dst.bytes * 8 - 1);
 
                if (ctxt->src.bytes == 2)
                        sv = (s16)ctxt->src.val & (s16)mask;
                else if (ctxt->src.bytes == 4)
                        sv = (s32)ctxt->src.val & (s32)mask;
+               else
+                       sv = (s64)ctxt->src.val & (s64)mask;
 
                ctxt->dst.addr.mem.ea += (sv >> 3);
        }
@@ -1358,17 +1356,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
                                     u16 selector, struct desc_ptr *dt)
 {
        const struct x86_emulate_ops *ops = ctxt->ops;
+       u32 base3 = 0;
 
        if (selector & 1 << 2) {
                struct desc_struct desc;
                u16 sel;
 
                memset (dt, 0, sizeof *dt);
-               if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
+               if (!ops->get_segment(ctxt, &sel, &desc, &base3,
+                                     VCPU_SREG_LDTR))
                        return;
 
                dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
-               dt->address = get_desc_base(&desc);
+               dt->address = get_desc_base(&desc) | ((u64)base3 << 32);
        } else
                ops->get_gdt(ctxt, dt);
 }
@@ -1422,6 +1422,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
        ulong desc_addr;
        int ret;
        u16 dummy;
+       u32 base3 = 0;
 
        memset(&seg_desc, 0, sizeof seg_desc);
 
@@ -1538,9 +1539,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
                ret = write_segment_descriptor(ctxt, selector, &seg_desc);
                if (ret != X86EMUL_CONTINUE)
                        return ret;
+       } else if (ctxt->mode == X86EMUL_MODE_PROT64) {
+               ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
+                               sizeof(base3), &ctxt->exception);
+               if (ret != X86EMUL_CONTINUE)
+                       return ret;
        }
 load:
-       ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+       ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
        return X86EMUL_CONTINUE;
 exception:
        emulate_exception(ctxt, err_vec, err_code, true);
@@ -1575,34 +1581,28 @@ static void write_register_operand(struct operand *op)
 
 static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
 {
-       int rc;
-
        switch (op->type) {
        case OP_REG:
                write_register_operand(op);
                break;
        case OP_MEM:
                if (ctxt->lock_prefix)
-                       rc = segmented_cmpxchg(ctxt,
+                       return segmented_cmpxchg(ctxt,
+                                                op->addr.mem,
+                                                &op->orig_val,
+                                                &op->val,
+                                                op->bytes);
+               else
+                       return segmented_write(ctxt,
                                               op->addr.mem,
-                                              &op->orig_val,
                                               &op->val,
                                               op->bytes);
-               else
-                       rc = segmented_write(ctxt,
-                                            op->addr.mem,
-                                            &op->val,
-                                            op->bytes);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
                break;
        case OP_MEM_STR:
-               rc = segmented_write(ctxt,
-                               op->addr.mem,
-                               op->data,
-                               op->bytes * op->count);
-               if (rc != X86EMUL_CONTINUE)
-                       return rc;
+               return segmented_write(ctxt,
+                                      op->addr.mem,
+                                      op->data,
+                                      op->bytes * op->count);
                break;
        case OP_XMM:
                write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
@@ -1754,6 +1754,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
        if (rc != X86EMUL_CONTINUE)
                return rc;
 
+       if (ctxt->modrm_reg == VCPU_SREG_SS)
+               ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
        rc = load_segment_descriptor(ctxt, (u16)selector, seg);
        return rc;
 }
@@ -1991,6 +1994,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
        u64 old = ctxt->dst.orig_val64;
 
+       if (ctxt->dst.bytes == 16)
+               return X86EMUL_UNHANDLEABLE;
+
        if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
            ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
                *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
@@ -2017,6 +2023,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
        int rc;
        unsigned long cs;
+       int cpl = ctxt->ops->cpl(ctxt);
 
        rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
@@ -2026,6 +2033,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
        rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
        if (rc != X86EMUL_CONTINUE)
                return rc;
+       /* Outer-privilege level return is not implemented */
+       if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
+               return X86EMUL_UNHANDLEABLE;
        rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
        return rc;
 }
@@ -2044,8 +2054,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt)
 static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 {
        /* Save real source value, then compare EAX against destination. */
+       ctxt->dst.orig_val = ctxt->dst.val;
+       ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX);
        ctxt->src.orig_val = ctxt->src.val;
-       ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX);
+       ctxt->src.val = ctxt->dst.orig_val;
        fastop(ctxt, em_cmp);
 
        if (ctxt->eflags & EFLG_ZF) {
@@ -2055,6 +2067,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
                /* Failure: write the value we saw to EAX. */
                ctxt->dst.type = OP_REG;
                ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+               ctxt->dst.val = ctxt->dst.orig_val;
        }
        return X86EMUL_CONTINUE;
 }
@@ -2964,7 +2977,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-       memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
+       memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr));
        return X86EMUL_CONTINUE;
 }
 
@@ -3221,7 +3234,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
-       ctxt->dst.bytes = 2;
+       if (ctxt->dst.type == OP_MEM)
+               ctxt->dst.bytes = 2;
        ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
        return X86EMUL_CONTINUE;
 }
@@ -3496,7 +3510,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
        u64 rcx = reg_read(ctxt, VCPU_REGS_RCX);
 
        if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
-           (rcx > 3))
+           ctxt->ops->check_pmc(ctxt, rcx))
                return emulate_gp(ctxt, 0);
 
        return X86EMUL_CONTINUE;
@@ -3521,9 +3535,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 }
 
 #define D(_y) { .flags = (_y) }
-#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
-#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
-                     .check_perm = (_p) }
+#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i }
+#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \
+                     .intercept = x86_intercept_##_i, .check_perm = (_p) }
 #define N    D(NotImpl)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
@@ -3532,10 +3546,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
 #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
 #define II(_f, _e, _i) \
-       { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
+       { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i }
 #define IIP(_f, _e, _i, _p) \
-       { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
-         .check_perm = (_p) }
+       { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \
+         .intercept = x86_intercept_##_i, .check_perm = (_p) }
 #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
 
 #define D2bv(_f)      D((_f) | ByteOp), D(_f)
@@ -3634,8 +3648,8 @@ static const struct opcode group6[] = {
 };
 
 static const struct group_dual group7 = { {
-       II(Mov | DstMem | Priv,                 em_sgdt, sgdt),
-       II(Mov | DstMem | Priv,                 em_sidt, sidt),
+       II(Mov | DstMem,                        em_sgdt, sgdt),
+       II(Mov | DstMem,                        em_sidt, sidt),
        II(SrcMem | Priv,                       em_lgdt, lgdt),
        II(SrcMem | Priv,                       em_lidt, lidt),
        II(SrcNone | DstMem | Mov,              em_smsw, smsw), N,
@@ -3899,7 +3913,7 @@ static const struct opcode twobyte_table[256] = {
        N, N,
        N, N, N, N, N, N, N, N,
        /* 0x40 - 0x4F */
-       X16(D(DstReg | SrcMem | ModRM | Mov)),
+       X16(D(DstReg | SrcMem | ModRM)),
        /* 0x50 - 0x5F */
        N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
        /* 0x60 - 0x6F */
@@ -4061,12 +4075,12 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
        mem_common:
                *op = ctxt->memop;
                ctxt->memopp = op;
-               if ((ctxt->d & BitOp) && op == &ctxt->dst)
+               if (ctxt->d & BitOp)
                        fetch_bit_operand(ctxt);
                op->orig_val = op->val;
                break;
        case OpMem64:
-               ctxt->memop.bytes = 8;
+               ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8;
                goto mem_common;
        case OpAcc:
                op->type = OP_REG;
@@ -4150,7 +4164,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
                op->addr.mem.ea =
                        register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
-               op->addr.mem.seg = seg_override(ctxt);
+               op->addr.mem.seg = ctxt->seg_override;
                op->val = 0;
                op->count = 1;
                break;
@@ -4161,7 +4175,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                        register_address(ctxt,
                                reg_read(ctxt, VCPU_REGS_RBX) +
                                (reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
-               op->addr.mem.seg = seg_override(ctxt);
+               op->addr.mem.seg = ctxt->seg_override;
                op->val = 0;
                break;
        case OpImmFAddr:
@@ -4208,6 +4222,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
        int mode = ctxt->mode;
        int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
        bool op_prefix = false;
+       bool has_seg_override = false;
        struct opcode opcode;
 
        ctxt->memop.type = OP_NONE;
@@ -4218,6 +4233,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
        ctxt->opcode_len = 1;
        if (insn_len > 0)
                memcpy(ctxt->fetch.data, insn, insn_len);
+       else {
+               rc = __do_insn_fetch_bytes(ctxt, 1);
+               if (rc != X86EMUL_CONTINUE)
+                       return rc;
+       }
 
        switch (mode) {
        case X86EMUL_MODE_REAL:
@@ -4261,11 +4281,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
                case 0x2e:      /* CS override */
                case 0x36:      /* SS override */
                case 0x3e:      /* DS override */
-                       set_seg_override(ctxt, (ctxt->b >> 3) & 3);
+                       has_seg_override = true;
+                       ctxt->seg_override = (ctxt->b >> 3) & 3;
                        break;
                case 0x64:      /* FS override */
                case 0x65:      /* GS override */
-                       set_seg_override(ctxt, ctxt->b & 7);
+                       has_seg_override = true;
+                       ctxt->seg_override = ctxt->b & 7;
                        break;
                case 0x40 ... 0x4f: /* REX */
                        if (mode != X86EMUL_MODE_PROT64)
@@ -4314,6 +4336,13 @@ done_prefixes:
        if (ctxt->d & ModRM)
                ctxt->modrm = insn_fetch(u8, ctxt);
 
+       /* vex-prefix instructions are not implemented */
+       if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
+           (mode == X86EMUL_MODE_PROT64 ||
+           (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+               ctxt->d = NotImpl;
+       }
+
        while (ctxt->d & GroupMask) {
                switch (ctxt->d & GroupMask) {
                case Group:
@@ -4356,49 +4385,59 @@ done_prefixes:
                ctxt->d |= opcode.flags;
        }
 
-       ctxt->execute = opcode.u.execute;
-       ctxt->check_perm = opcode.check_perm;
-       ctxt->intercept = opcode.intercept;
-
        /* Unrecognised? */
-       if (ctxt->d == 0 || (ctxt->d & NotImpl))
+       if (ctxt->d == 0)
                return EMULATION_FAILED;
 
-       if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
-               return EMULATION_FAILED;
+       ctxt->execute = opcode.u.execute;
 
-       if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
-               ctxt->op_bytes = 8;
+       if (unlikely(ctxt->d &
+                    (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+               /*
+                * These are copied unconditionally here, and checked unconditionally
+                * in x86_emulate_insn.
+                */
+               ctxt->check_perm = opcode.check_perm;
+               ctxt->intercept = opcode.intercept;
+
+               if (ctxt->d & NotImpl)
+                       return EMULATION_FAILED;
+
+               if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
+                       return EMULATION_FAILED;
 
-       if (ctxt->d & Op3264) {
-               if (mode == X86EMUL_MODE_PROT64)
+               if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
                        ctxt->op_bytes = 8;
-               else
-                       ctxt->op_bytes = 4;
-       }
 
-       if (ctxt->d & Sse)
-               ctxt->op_bytes = 16;
-       else if (ctxt->d & Mmx)
-               ctxt->op_bytes = 8;
+               if (ctxt->d & Op3264) {
+                       if (mode == X86EMUL_MODE_PROT64)
+                               ctxt->op_bytes = 8;
+                       else
+                               ctxt->op_bytes = 4;
+               }
+
+               if (ctxt->d & Sse)
+                       ctxt->op_bytes = 16;
+               else if (ctxt->d & Mmx)
+                       ctxt->op_bytes = 8;
+       }
 
        /* ModRM and SIB bytes. */
        if (ctxt->d & ModRM) {
                rc = decode_modrm(ctxt, &ctxt->memop);
-               if (!ctxt->has_seg_override)
-                       set_seg_override(ctxt, ctxt->modrm_seg);
+               if (!has_seg_override) {
+                       has_seg_override = true;
+                       ctxt->seg_override = ctxt->modrm_seg;
+               }
        } else if (ctxt->d & MemAbs)
                rc = decode_abs(ctxt, &ctxt->memop);
        if (rc != X86EMUL_CONTINUE)
                goto done;
 
-       if (!ctxt->has_seg_override)
-               set_seg_override(ctxt, VCPU_SREG_DS);
-
-       ctxt->memop.addr.mem.seg = seg_override(ctxt);
+       if (!has_seg_override)
+               ctxt->seg_override = VCPU_SREG_DS;
 
-       if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
-               ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
+       ctxt->memop.addr.mem.seg = ctxt->seg_override;
 
        /*
         * Decode and fetch the source operand: register, memory
@@ -4420,7 +4459,7 @@ done_prefixes:
        rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
 
 done:
-       if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+       if (ctxt->rip_relative)
                ctxt->memopp->addr.mem.ea += ctxt->_eip;
 
        return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4495,6 +4534,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
        return X86EMUL_CONTINUE;
 }
 
+void init_decode_cache(struct x86_emulate_ctxt *ctxt)
+{
+       memset(&ctxt->rip_relative, 0,
+              (void *)&ctxt->modrm - (void *)&ctxt->rip_relative);
+
+       ctxt->io_read.pos = 0;
+       ctxt->io_read.end = 0;
+       ctxt->mem_read.end = 0;
+}
+
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
        const struct x86_emulate_ops *ops = ctxt->ops;
@@ -4503,12 +4552,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
        ctxt->mem_read.pos = 0;
 
-       if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
-                       (ctxt->d & Undefined)) {
-               rc = emulate_ud(ctxt);
-               goto done;
-       }
-
        /* LOCK prefix is allowed only with some instructions */
        if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
                rc = emulate_ud(ctxt);
@@ -4520,69 +4563,78 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
                goto done;
        }
 
-       if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
-           || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
-               rc = emulate_ud(ctxt);
-               goto done;
-       }
-
-       if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
-               rc = emulate_nm(ctxt);
-               goto done;
-       }
+       if (unlikely(ctxt->d &
+                    (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
+               if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+                               (ctxt->d & Undefined)) {
+                       rc = emulate_ud(ctxt);
+                       goto done;
+               }
 
-       if (ctxt->d & Mmx) {
-               rc = flush_pending_x87_faults(ctxt);
-               if (rc != X86EMUL_CONTINUE)
+               if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+                   || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+                       rc = emulate_ud(ctxt);
                        goto done;
-               /*
-                * Now that we know the fpu is exception safe, we can fetch
-                * operands from it.
-                */
-               fetch_possible_mmx_operand(ctxt, &ctxt->src);
-               fetch_possible_mmx_operand(ctxt, &ctxt->src2);
-               if (!(ctxt->d & Mov))
-                       fetch_possible_mmx_operand(ctxt, &ctxt->dst);
-       }
+               }
 
-       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
-               rc = emulator_check_intercept(ctxt, ctxt->intercept,
-                                             X86_ICPT_PRE_EXCEPT);
-               if (rc != X86EMUL_CONTINUE)
+               if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+                       rc = emulate_nm(ctxt);
                        goto done;
-       }
+               }
 
-       /* Privileged instruction can be executed only in CPL=0 */
-       if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
-               rc = emulate_gp(ctxt, 0);
-               goto done;
-       }
+               if (ctxt->d & Mmx) {
+                       rc = flush_pending_x87_faults(ctxt);
+                       if (rc != X86EMUL_CONTINUE)
+                               goto done;
+                       /*
+                        * Now that we know the fpu is exception safe, we can fetch
+                        * operands from it.
+                        */
+                       fetch_possible_mmx_operand(ctxt, &ctxt->src);
+                       fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+                       if (!(ctxt->d & Mov))
+                               fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+               }
 
-       /* Instruction can only be executed in protected mode */
-       if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
-               rc = emulate_ud(ctxt);
-               goto done;
-       }
+               if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+                       rc = emulator_check_intercept(ctxt, ctxt->intercept,
+                                                     X86_ICPT_PRE_EXCEPT);
+                       if (rc != X86EMUL_CONTINUE)
+                               goto done;
+               }
 
-       /* Do instruction specific permission checks */
-       if (ctxt->check_perm) {
-               rc = ctxt->check_perm(ctxt);
-               if (rc != X86EMUL_CONTINUE)
+               /* Privileged instruction can be executed only in CPL=0 */
+               if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
+                       rc = emulate_gp(ctxt, 0);
                        goto done;
-       }
+               }
 
-       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
-               rc = emulator_check_intercept(ctxt, ctxt->intercept,
-                                             X86_ICPT_POST_EXCEPT);
-               if (rc != X86EMUL_CONTINUE)
+               /* Instruction can only be executed in protected mode */
+               if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+                       rc = emulate_ud(ctxt);
                        goto done;
-       }
+               }
 
-       if (ctxt->rep_prefix && (ctxt->d & String)) {
-               /* All REP prefixes have the same first termination condition */
-               if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
-                       ctxt->eip = ctxt->_eip;
-                       goto done;
+               /* Do instruction specific permission checks */
+               if (ctxt->d & CheckPerm) {
+                       rc = ctxt->check_perm(ctxt);
+                       if (rc != X86EMUL_CONTINUE)
+                               goto done;
+               }
+
+               if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+                       rc = emulator_check_intercept(ctxt, ctxt->intercept,
+                                                     X86_ICPT_POST_EXCEPT);
+                       if (rc != X86EMUL_CONTINUE)
+                               goto done;
+               }
+
+               if (ctxt->rep_prefix && (ctxt->d & String)) {
+                       /* All REP prefixes have the same first termination condition */
+                       if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+                               ctxt->eip = ctxt->_eip;
+                               goto done;
+                       }
                }
        }
 
@@ -4616,7 +4668,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 special_insn:
 
-       if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+       if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
                rc = emulator_check_intercept(ctxt, ctxt->intercept,
                                              X86_ICPT_POST_MEMACCESS);
                if (rc != X86EMUL_CONTINUE)
@@ -4657,8 +4709,9 @@ special_insn:
                break;
        case 0x90 ... 0x97: /* nop / xchg reg, rax */
                if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX))
-                       break;
-               rc = em_xchg(ctxt);
+                       ctxt->dst.type = OP_NONE;
+               else
+                       rc = em_xchg(ctxt);
                break;
        case 0x98: /* cbw/cwde/cdqe */
                switch (ctxt->op_bytes) {
@@ -4709,17 +4762,17 @@ special_insn:
                goto done;
 
 writeback:
-       if (!(ctxt->d & NoWrite)) {
-               rc = writeback(ctxt, &ctxt->dst);
-               if (rc != X86EMUL_CONTINUE)
-                       goto done;
-       }
        if (ctxt->d & SrcWrite) {
                BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR);
                rc = writeback(ctxt, &ctxt->src);
                if (rc != X86EMUL_CONTINUE)
                        goto done;
        }
+       if (!(ctxt->d & NoWrite)) {
+               rc = writeback(ctxt, &ctxt->dst);
+               if (rc != X86EMUL_CONTINUE)
+                       goto done;
+       }
 
        /*
         * restore dst type in case the decoding will be reused
@@ -4793,8 +4846,10 @@ twobyte_insn:
                ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
                break;
        case 0x40 ... 0x4f:     /* cmov */
-               ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
-               if (!test_cc(ctxt->b, ctxt->eflags))
+               if (test_cc(ctxt->b, ctxt->eflags))
+                       ctxt->dst.val = ctxt->src.val;
+               else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
+                        ctxt->op_bytes != 4)
                        ctxt->dst.type = OP_NONE; /* no writeback */
                break;
        case 0x80 ... 0x8f: /* jnz rel, etc*/
@@ -4818,8 +4873,8 @@ twobyte_insn:
                break;
        case 0xc3:              /* movnti */
                ctxt->dst.bytes = ctxt->op_bytes;
-               ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
-                                                       (u64) ctxt->src.val;
+               ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
+                                                       (u32) ctxt->src.val;
                break;
        default:
                goto cannot_emulate;