Merge branch 'drm-next' of git://people.freedesktop.org/~airlied/linux
[pandora-kernel.git] / drivers / gpu / drm / radeon / evergreen_cs.c
index ee4cff5..99fb132 100644 (file)
@@ -36,9 +36,6 @@
 
 int r600_dma_cs_next_reloc(struct radeon_cs_parser *p,
                           struct radeon_cs_reloc **cs_reloc);
-static int evergreen_cs_packet_next_reloc(struct radeon_cs_parser *p,
-                                         struct radeon_cs_reloc **cs_reloc);
-
 struct evergreen_cs_track {
        u32                     group_size;
        u32                     nbanks;
@@ -1009,223 +1006,35 @@ static int evergreen_cs_track_check(struct radeon_cs_parser *p)
 }
 
 /**
- * evergreen_cs_packet_parse() - parse cp packet and point ib index to next packet
- * @parser:    parser structure holding parsing context.
- * @pkt:       where to store packet informations
- *
- * Assume that chunk_ib_index is properly set. Will return -EINVAL
- * if packet is bigger than remaining ib size. or if packets is unknown.
- **/
-static int evergreen_cs_packet_parse(struct radeon_cs_parser *p,
-                             struct radeon_cs_packet *pkt,
-                             unsigned idx)
-{
-       struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
-       uint32_t header;
-
-       if (idx >= ib_chunk->length_dw) {
-               DRM_ERROR("Can not parse packet at %d after CS end %d !\n",
-                         idx, ib_chunk->length_dw);
-               return -EINVAL;
-       }
-       header = radeon_get_ib_value(p, idx);
-       pkt->idx = idx;
-       pkt->type = CP_PACKET_GET_TYPE(header);
-       pkt->count = CP_PACKET_GET_COUNT(header);
-       pkt->one_reg_wr = 0;
-       switch (pkt->type) {
-       case PACKET_TYPE0:
-               pkt->reg = CP_PACKET0_GET_REG(header);
-               break;
-       case PACKET_TYPE3:
-               pkt->opcode = CP_PACKET3_GET_OPCODE(header);
-               break;
-       case PACKET_TYPE2:
-               pkt->count = -1;
-               break;
-       default:
-               DRM_ERROR("Unknown packet type %d at %d !\n", pkt->type, idx);
-               return -EINVAL;
-       }
-       if ((pkt->count + 1 + pkt->idx) >= ib_chunk->length_dw) {
-               DRM_ERROR("Packet (%d:%d:%d) end after CS buffer (%d) !\n",
-                         pkt->idx, pkt->type, pkt->count, ib_chunk->length_dw);
-               return -EINVAL;
-       }
-       return 0;
-}
-
-/**
- * evergreen_cs_packet_next_reloc() - parse next packet which should be reloc packet3
+ * evergreen_cs_packet_parse_vline() - parse userspace VLINE packet
  * @parser:            parser structure holding parsing context.
- * @data:              pointer to relocation data
- * @offset_start:      starting offset
- * @offset_mask:       offset mask (to align start offset on)
- * @reloc:             reloc informations
  *
- * Check next packet is relocation packet3, do bo validation and compute
- * GPU offset using the provided start.
- **/
-static int evergreen_cs_packet_next_reloc(struct radeon_cs_parser *p,
-                                         struct radeon_cs_reloc **cs_reloc)
-{
-       struct radeon_cs_chunk *relocs_chunk;
-       struct radeon_cs_packet p3reloc;
-       unsigned idx;
-       int r;
-
-       if (p->chunk_relocs_idx == -1) {
-               DRM_ERROR("No relocation chunk !\n");
-               return -EINVAL;
-       }
-       *cs_reloc = NULL;
-       relocs_chunk = &p->chunks[p->chunk_relocs_idx];
-       r = evergreen_cs_packet_parse(p, &p3reloc, p->idx);
-       if (r) {
-               return r;
-       }
-       p->idx += p3reloc.count + 2;
-       if (p3reloc.type != PACKET_TYPE3 || p3reloc.opcode != PACKET3_NOP) {
-               DRM_ERROR("No packet3 for relocation for packet at %d.\n",
-                         p3reloc.idx);
-               return -EINVAL;
-       }
-       idx = radeon_get_ib_value(p, p3reloc.idx + 1);
-       if (idx >= relocs_chunk->length_dw) {
-               DRM_ERROR("Relocs at %d after relocations chunk end %d !\n",
-                         idx, relocs_chunk->length_dw);
-               return -EINVAL;
-       }
-       /* FIXME: we assume reloc size is 4 dwords */
-       *cs_reloc = p->relocs_ptr[(idx / 4)];
-       return 0;
-}
-
-/**
- * evergreen_cs_packet_next_is_pkt3_nop() - test if the next packet is NOP
- * @p:         structure holding the parser context.
- *
- * Check if the next packet is a relocation packet3.
- **/
-static bool evergreen_cs_packet_next_is_pkt3_nop(struct radeon_cs_parser *p)
-{
-       struct radeon_cs_packet p3reloc;
-       int r;
-
-       r = evergreen_cs_packet_parse(p, &p3reloc, p->idx);
-       if (r) {
-               return false;
-       }
-       if (p3reloc.type != PACKET_TYPE3 || p3reloc.opcode != PACKET3_NOP) {
-               return false;
-       }
-       return true;
-}
-
-/**
- * evergreen_cs_packet_next_vline() - parse userspace VLINE packet
- * @parser:            parser structure holding parsing context.
- *
- * Userspace sends a special sequence for VLINE waits.
- * PACKET0 - VLINE_START_END + value
- * PACKET3 - WAIT_REG_MEM poll vline status reg
- * RELOC (P3) - crtc_id in reloc.
- *
- * This function parses this and relocates the VLINE START END
- * and WAIT_REG_MEM packets to the correct crtc.
- * It also detects a switched off crtc and nulls out the
- * wait in that case.
+ * This is an Evergreen(+)-specific function for parsing VLINE packets.
+ * The real work is done by r600_cs_common_vline_parse().
+ * Here we just set up the ASIC-specific register tables and call
+ * the common implementation function.
  */
 static int evergreen_cs_packet_parse_vline(struct radeon_cs_parser *p)
 {
-       struct drm_mode_object *obj;
-       struct drm_crtc *crtc;
-       struct radeon_crtc *radeon_crtc;
-       struct radeon_cs_packet p3reloc, wait_reg_mem;
-       int crtc_id;
-       int r;
-       uint32_t header, h_idx, reg, wait_reg_mem_info;
-       volatile uint32_t *ib;
-
-       ib = p->ib.ptr;
-
-       /* parse the WAIT_REG_MEM */
-       r = evergreen_cs_packet_parse(p, &wait_reg_mem, p->idx);
-       if (r)
-               return r;
-
-       /* check its a WAIT_REG_MEM */
-       if (wait_reg_mem.type != PACKET_TYPE3 ||
-           wait_reg_mem.opcode != PACKET3_WAIT_REG_MEM) {
-               DRM_ERROR("vline wait missing WAIT_REG_MEM segment\n");
-               return -EINVAL;
-       }
-
-       wait_reg_mem_info = radeon_get_ib_value(p, wait_reg_mem.idx + 1);
-       /* bit 4 is reg (0) or mem (1) */
-       if (wait_reg_mem_info & 0x10) {
-               DRM_ERROR("vline WAIT_REG_MEM waiting on MEM rather than REG\n");
-               return -EINVAL;
-       }
-       /* waiting for value to be equal */
-       if ((wait_reg_mem_info & 0x7) != 0x3) {
-               DRM_ERROR("vline WAIT_REG_MEM function not equal\n");
-               return -EINVAL;
-       }
-       if ((radeon_get_ib_value(p, wait_reg_mem.idx + 2) << 2) != EVERGREEN_VLINE_STATUS) {
-               DRM_ERROR("vline WAIT_REG_MEM bad reg\n");
-               return -EINVAL;
-       }
-
-       if (radeon_get_ib_value(p, wait_reg_mem.idx + 5) != EVERGREEN_VLINE_STAT) {
-               DRM_ERROR("vline WAIT_REG_MEM bad bit mask\n");
-               return -EINVAL;
-       }
-
-       /* jump over the NOP */
-       r = evergreen_cs_packet_parse(p, &p3reloc, p->idx + wait_reg_mem.count + 2);
-       if (r)
-               return r;
-
-       h_idx = p->idx - 2;
-       p->idx += wait_reg_mem.count + 2;
-       p->idx += p3reloc.count + 2;
 
-       header = radeon_get_ib_value(p, h_idx);
-       crtc_id = radeon_get_ib_value(p, h_idx + 2 + 7 + 1);
-       reg = CP_PACKET0_GET_REG(header);
-       obj = drm_mode_object_find(p->rdev->ddev, crtc_id, DRM_MODE_OBJECT_CRTC);
-       if (!obj) {
-               DRM_ERROR("cannot find crtc %d\n", crtc_id);
-               return -EINVAL;
-       }
-       crtc = obj_to_crtc(obj);
-       radeon_crtc = to_radeon_crtc(crtc);
-       crtc_id = radeon_crtc->crtc_id;
-
-       if (!crtc->enabled) {
-               /* if the CRTC isn't enabled - we need to nop out the WAIT_REG_MEM */
-               ib[h_idx + 2] = PACKET2(0);
-               ib[h_idx + 3] = PACKET2(0);
-               ib[h_idx + 4] = PACKET2(0);
-               ib[h_idx + 5] = PACKET2(0);
-               ib[h_idx + 6] = PACKET2(0);
-               ib[h_idx + 7] = PACKET2(0);
-               ib[h_idx + 8] = PACKET2(0);
-       } else {
-               switch (reg) {
-               case EVERGREEN_VLINE_START_END:
-                       header &= ~R600_CP_PACKET0_REG_MASK;
-                       header |= (EVERGREEN_VLINE_START_END + radeon_crtc->crtc_offset) >> 2;
-                       ib[h_idx] = header;
-                       ib[h_idx + 4] = (EVERGREEN_VLINE_STATUS + radeon_crtc->crtc_offset) >> 2;
-                       break;
-               default:
-                       DRM_ERROR("unknown crtc reloc\n");
-                       return -EINVAL;
-               }
-       }
-       return 0;
+       static uint32_t vline_start_end[6] = { /* VLINE_START_END, CRTC0..CRTC5 */
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC0_REGISTER_OFFSET,
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC1_REGISTER_OFFSET,
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC2_REGISTER_OFFSET,
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC3_REGISTER_OFFSET,
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC4_REGISTER_OFFSET,
+               EVERGREEN_VLINE_START_END + EVERGREEN_CRTC5_REGISTER_OFFSET
+       };
+       static uint32_t vline_status[6] = { /* VLINE_STATUS, CRTC0..CRTC5 */
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC0_REGISTER_OFFSET,
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC1_REGISTER_OFFSET,
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC2_REGISTER_OFFSET,
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC3_REGISTER_OFFSET,
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC4_REGISTER_OFFSET,
+               EVERGREEN_VLINE_STATUS + EVERGREEN_CRTC5_REGISTER_OFFSET
+       };
+
+       return r600_cs_common_vline_parse(p, vline_start_end, vline_status);
 }
 
 static int evergreen_packet0_check(struct radeon_cs_parser *p,
@@ -1347,7 +1156,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case SQ_LSTMP_RING_BASE:
        case SQ_PSTMP_RING_BASE:
        case SQ_VSTMP_RING_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1376,7 +1185,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case DB_Z_INFO:
                track->db_z_info = radeon_get_ib_value(p, idx);
                if (!(p->cs_flags & RADEON_CS_KEEP_TILING_FLAGS)) {
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                                "0x%04X\n", reg);
@@ -1418,7 +1227,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->db_dirty = true;
                break;
        case DB_Z_READ_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1430,7 +1239,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->db_dirty = true;
                break;
        case DB_Z_WRITE_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1442,7 +1251,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->db_dirty = true;
                break;
        case DB_STENCIL_READ_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1454,7 +1263,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->db_dirty = true;
                break;
        case DB_STENCIL_WRITE_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1477,7 +1286,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case VGT_STRMOUT_BUFFER_BASE_1:
        case VGT_STRMOUT_BUFFER_BASE_2:
        case VGT_STRMOUT_BUFFER_BASE_3:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1499,7 +1308,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->streamout_dirty = true;
                break;
        case CP_COHER_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "missing reloc for CP_COHER_BASE "
                                        "0x%04X\n", reg);
@@ -1563,7 +1372,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                tmp = (reg - CB_COLOR0_INFO) / 0x3c;
                track->cb_color_info[tmp] = radeon_get_ib_value(p, idx);
                if (!(p->cs_flags & RADEON_CS_KEEP_TILING_FLAGS)) {
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                                "0x%04X\n", reg);
@@ -1581,7 +1390,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                tmp = ((reg - CB_COLOR8_INFO) / 0x1c) + 8;
                track->cb_color_info[tmp] = radeon_get_ib_value(p, idx);
                if (!(p->cs_flags & RADEON_CS_KEEP_TILING_FLAGS)) {
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                                "0x%04X\n", reg);
@@ -1642,7 +1451,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR5_ATTRIB:
        case CB_COLOR6_ATTRIB:
        case CB_COLOR7_ATTRIB:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1670,7 +1479,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR9_ATTRIB:
        case CB_COLOR10_ATTRIB:
        case CB_COLOR11_ATTRIB:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1703,7 +1512,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR6_FMASK:
        case CB_COLOR7_FMASK:
                tmp = (reg - CB_COLOR0_FMASK) / 0x3c;
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_err(p->dev, "bad SET_CONTEXT_REG 0x%04X\n", reg);
                        return -EINVAL;
@@ -1720,7 +1529,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR6_CMASK:
        case CB_COLOR7_CMASK:
                tmp = (reg - CB_COLOR0_CMASK) / 0x3c;
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_err(p->dev, "bad SET_CONTEXT_REG 0x%04X\n", reg);
                        return -EINVAL;
@@ -1758,7 +1567,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR5_BASE:
        case CB_COLOR6_BASE:
        case CB_COLOR7_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1774,7 +1583,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case CB_COLOR9_BASE:
        case CB_COLOR10_BASE:
        case CB_COLOR11_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1787,7 +1596,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                track->cb_dirty = true;
                break;
        case DB_HTILE_DATA_BASE:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1905,7 +1714,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
        case SQ_ALU_CONST_CACHE_LS_13:
        case SQ_ALU_CONST_CACHE_LS_14:
        case SQ_ALU_CONST_CACHE_LS_15:
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -1919,7 +1728,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                                 "0x%04X\n", reg);
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONFIG_REG "
                                        "0x%04X\n", reg);
@@ -1933,7 +1742,7 @@ static int evergreen_cs_check_reg(struct radeon_cs_parser *p, u32 reg, u32 idx)
                                 "0x%04X\n", reg);
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        dev_warn(p->dev, "bad SET_CONTEXT_REG "
                                        "0x%04X\n", reg);
@@ -2018,7 +1827,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        return -EINVAL;
                }
 
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad SET PREDICATION\n");
                        return -EINVAL;
@@ -2064,7 +1873,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad INDEX_BASE\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad INDEX_BASE\n");
                        return -EINVAL;
@@ -2091,7 +1900,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad DRAW_INDEX\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad DRAW_INDEX\n");
                        return -EINVAL;
@@ -2119,7 +1928,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad DRAW_INDEX_2\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad DRAW_INDEX_2\n");
                        return -EINVAL;
@@ -2210,7 +2019,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad DISPATCH_INDIRECT\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad DISPATCH_INDIRECT\n");
                        return -EINVAL;
@@ -2231,7 +2040,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                if (idx_value & 0x10) {
                        uint64_t offset;
 
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad WAIT_REG_MEM\n");
                                return -EINVAL;
@@ -2243,6 +2052,9 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
 
                        ib[idx+1] = (ib[idx+1] & 0x3) | (offset & 0xfffffffc);
                        ib[idx+2] = upper_32_bits(offset) & 0xff;
+               } else if (idx_value & 0x100) {
+                       DRM_ERROR("cannot use PFP on REG wait\n");
+                       return -EINVAL;
                }
                break;
        case PACKET3_CP_DMA:
@@ -2282,7 +2094,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        }
                        /* src address space is memory */
                        if (((info & 0x60000000) >> 29) == 0) {
-                               r = evergreen_cs_packet_next_reloc(p, &reloc);
+                               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                                if (r) {
                                        DRM_ERROR("bad CP DMA SRC\n");
                                        return -EINVAL;
@@ -2320,7 +2132,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                                return -EINVAL;
                        }
                        if (((info & 0x00300000) >> 20) == 0) {
-                               r = evergreen_cs_packet_next_reloc(p, &reloc);
+                               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                                if (r) {
                                        DRM_ERROR("bad CP DMA DST\n");
                                        return -EINVAL;
@@ -2354,7 +2166,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                /* 0xffffffff/0x0 is flush all cache flag */
                if (radeon_get_ib_value(p, idx + 1) != 0xffffffff ||
                    radeon_get_ib_value(p, idx + 2) != 0) {
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad SURFACE_SYNC\n");
                                return -EINVAL;
@@ -2370,7 +2182,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                if (pkt->count) {
                        uint64_t offset;
 
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad EVENT_WRITE\n");
                                return -EINVAL;
@@ -2391,7 +2203,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad EVENT_WRITE_EOP\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad EVENT_WRITE_EOP\n");
                        return -EINVAL;
@@ -2413,7 +2225,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad EVENT_WRITE_EOS\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad EVENT_WRITE_EOS\n");
                        return -EINVAL;
@@ -2480,7 +2292,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        switch (G__SQ_CONSTANT_TYPE(radeon_get_ib_value(p, idx+1+(i*8)+7))) {
                        case SQ_TEX_VTX_VALID_TEXTURE:
                                /* tex base */
-                               r = evergreen_cs_packet_next_reloc(p, &reloc);
+                               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                                if (r) {
                                        DRM_ERROR("bad SET_RESOURCE (tex)\n");
                                        return -EINVAL;
@@ -2511,13 +2323,13 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
 
                                if ((tex_dim == SQ_TEX_DIM_2D_MSAA || tex_dim == SQ_TEX_DIM_2D_ARRAY_MSAA) &&
                                    !mip_address &&
-                                   !evergreen_cs_packet_next_is_pkt3_nop(p)) {
+                                   !radeon_cs_packet_next_is_pkt3_nop(p)) {
                                        /* MIP_ADDRESS should point to FMASK for an MSAA texture.
                                         * It should be 0 if FMASK is disabled. */
                                        moffset = 0;
                                        mipmap = NULL;
                                } else {
-                                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                                        if (r) {
                                                DRM_ERROR("bad SET_RESOURCE (tex)\n");
                                                return -EINVAL;
@@ -2536,7 +2348,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        {
                                uint64_t offset64;
                                /* vtx base */
-                               r = evergreen_cs_packet_next_reloc(p, &reloc);
+                               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                                if (r) {
                                        DRM_ERROR("bad SET_RESOURCE (vtx)\n");
                                        return -EINVAL;
@@ -2618,7 +2430,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                /* Updating memory at DST_ADDRESS. */
                if (idx_value & 0x1) {
                        u64 offset;
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad STRMOUT_BUFFER_UPDATE (missing dst reloc)\n");
                                return -EINVAL;
@@ -2637,7 +2449,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                /* Reading data from SRC_ADDRESS. */
                if (((idx_value >> 1) & 0x3) == 2) {
                        u64 offset;
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad STRMOUT_BUFFER_UPDATE (missing src reloc)\n");
                                return -EINVAL;
@@ -2662,7 +2474,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                        DRM_ERROR("bad MEM_WRITE (invalid count)\n");
                        return -EINVAL;
                }
-               r = evergreen_cs_packet_next_reloc(p, &reloc);
+               r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                if (r) {
                        DRM_ERROR("bad MEM_WRITE (missing reloc)\n");
                        return -EINVAL;
@@ -2691,7 +2503,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                if (idx_value & 0x1) {
                        u64 offset;
                        /* SRC is memory. */
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad COPY_DW (missing src reloc)\n");
                                return -EINVAL;
@@ -2715,7 +2527,7 @@ static int evergreen_packet3_check(struct radeon_cs_parser *p,
                if (idx_value & 0x2) {
                        u64 offset;
                        /* DST is memory. */
-                       r = evergreen_cs_packet_next_reloc(p, &reloc);
+                       r = radeon_cs_packet_next_reloc(p, &reloc, 0);
                        if (r) {
                                DRM_ERROR("bad COPY_DW (missing dst reloc)\n");
                                return -EINVAL;
@@ -2819,7 +2631,7 @@ int evergreen_cs_parse(struct radeon_cs_parser *p)
                p->track = track;
        }
        do {
-               r = evergreen_cs_packet_parse(p, &pkt, p->idx);
+               r = radeon_cs_packet_parse(p, &pkt, p->idx);
                if (r) {
                        kfree(p->track);
                        p->track = NULL;
@@ -2827,12 +2639,12 @@ int evergreen_cs_parse(struct radeon_cs_parser *p)
                }
                p->idx += pkt.count + 2;
                switch (pkt.type) {
-               case PACKET_TYPE0:
+               case RADEON_PACKET_TYPE0:
                        r = evergreen_cs_parse_packet0(p, &pkt);
                        break;
-               case PACKET_TYPE2:
+               case RADEON_PACKET_TYPE2:
                        break;
-               case PACKET_TYPE3:
+               case RADEON_PACKET_TYPE3:
                        r = evergreen_packet3_check(p, &pkt);
                        break;
                default:
@@ -2858,16 +2670,6 @@ int evergreen_cs_parse(struct radeon_cs_parser *p)
        return 0;
 }
 
-/*
- *  DMA
- */
-
-#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28)
-#define GET_DMA_COUNT(h) ((h) & 0x000fffff)
-#define GET_DMA_T(h) (((h) & 0x00800000) >> 23)
-#define GET_DMA_NEW(h) (((h) & 0x04000000) >> 26)
-#define GET_DMA_MISC(h) (((h) & 0x0700000) >> 20)
-
 /**
  * evergreen_dma_cs_parse() - parse the DMA IB
  * @p:         parser structure holding parsing context.
@@ -2881,9 +2683,9 @@ int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
 {
        struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
        struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc;
-       u32 header, cmd, count, tiled, new_cmd, misc;
+       u32 header, cmd, count, sub_cmd;
        volatile u32 *ib = p->ib.ptr;
-       u32 idx, idx_value;
+       u32 idx;
        u64 src_offset, dst_offset, dst2_offset;
        int r;
 
@@ -2897,9 +2699,7 @@ int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
                header = radeon_get_ib_value(p, idx);
                cmd = GET_DMA_CMD(header);
                count = GET_DMA_COUNT(header);
-               tiled = GET_DMA_T(header);
-               new_cmd = GET_DMA_NEW(header);
-               misc = GET_DMA_MISC(header);
+               sub_cmd = GET_DMA_SUB_CMD(header);
 
                switch (cmd) {
                case DMA_PACKET_WRITE:
@@ -2908,19 +2708,27 @@ int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
                                DRM_ERROR("bad DMA_PACKET_WRITE\n");
                                return -EINVAL;
                        }
-                       if (tiled) {
+                       switch (sub_cmd) {
+                       /* tiled */
+                       case 8:
                                dst_offset = radeon_get_ib_value(p, idx+1);
                                dst_offset <<= 8;
 
                                ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
                                p->idx += count + 7;
-                       } else {
+                               break;
+                       /* linear */
+                       case 0:
                                dst_offset = radeon_get_ib_value(p, idx+1);
                                dst_offset |= ((u64)(radeon_get_ib_value(p, idx+2) & 0xff)) << 32;
 
                                ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
                                ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
                                p->idx += count + 3;
+                               break;
+                       default:
+                               DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, header);
+                               return -EINVAL;
                        }
                        if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
                                dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n",
@@ -2939,338 +2747,330 @@ int evergreen_dma_cs_parse(struct radeon_cs_parser *p)
                                DRM_ERROR("bad DMA_PACKET_COPY\n");
                                return -EINVAL;
                        }
-                       if (tiled) {
-                               idx_value = radeon_get_ib_value(p, idx + 2);
-                               if (new_cmd) {
-                                       switch (misc) {
-                                       case 0:
-                                               /* L2T, frame to fields */
-                                               if (idx_value & (1 << 31)) {
-                                                       DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-                                               if (r) {
-                                                       DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               dst_offset = radeon_get_ib_value(p, idx+1);
-                                               dst_offset <<= 8;
-                                               dst2_offset = radeon_get_ib_value(p, idx+2);
-                                               dst2_offset <<= 8;
-                                               src_offset = radeon_get_ib_value(p, idx+8);
-                                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
-                                                                dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               p->idx += 10;
-                                               break;
-                                       case 1:
-                                               /* L2T, T2L partial */
-                                               if (p->family < CHIP_CAYMAN) {
-                                                       DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-                                                       return -EINVAL;
-                                               }
-                                               /* detile bit */
-                                               if (idx_value & (1 << 31)) {
-                                                       /* tiled src, linear dst */
-                                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-                                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                               } else {
-                                                       /* linear src, tiled dst */
-                                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-                                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               }
-                                               p->idx += 12;
-                                               break;
-                                       case 3:
-                                               /* L2T, broadcast */
-                                               if (idx_value & (1 << 31)) {
-                                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-                                               if (r) {
-                                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               dst_offset = radeon_get_ib_value(p, idx+1);
-                                               dst_offset <<= 8;
-                                               dst2_offset = radeon_get_ib_value(p, idx+2);
-                                               dst2_offset <<= 8;
-                                               src_offset = radeon_get_ib_value(p, idx+8);
-                                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-                                                                dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               p->idx += 10;
-                                               break;
-                                       case 4:
-                                               /* L2T, T2L */
-                                               /* detile bit */
-                                               if (idx_value & (1 << 31)) {
-                                                       /* tiled src, linear dst */
-                                                       src_offset = radeon_get_ib_value(p, idx+1);
-                                                       src_offset <<= 8;
-                                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-                                                       dst_offset = radeon_get_ib_value(p, idx+7);
-                                                       dst_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
-                                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                               } else {
-                                                       /* linear src, tiled dst */
-                                                       src_offset = radeon_get_ib_value(p, idx+7);
-                                                       src_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
-                                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-                                                       dst_offset = radeon_get_ib_value(p, idx+1);
-                                                       dst_offset <<= 8;
-                                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               }
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               p->idx += 9;
-                                               break;
-                                       case 5:
-                                               /* T2T partial */
-                                               if (p->family < CHIP_CAYMAN) {
-                                                       DRM_ERROR("L2T, T2L Partial is cayman only !\n");
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               p->idx += 13;
-                                               break;
-                                       case 7:
-                                               /* L2T, broadcast */
-                                               if (idx_value & (1 << 31)) {
-                                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-                                               if (r) {
-                                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               dst_offset = radeon_get_ib_value(p, idx+1);
-                                               dst_offset <<= 8;
-                                               dst2_offset = radeon_get_ib_value(p, idx+2);
-                                               dst2_offset <<= 8;
-                                               src_offset = radeon_get_ib_value(p, idx+8);
-                                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
-                                                                dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
-                                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               p->idx += 10;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
+                       switch (sub_cmd) {
+                       /* Copy L2L, DW aligned */
+                       case 0x00:
+                               /* L2L, dw */
+                               src_offset = radeon_get_ib_value(p, idx+2);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset |= ((u64)(radeon_get_ib_value(p, idx+3) & 0xff)) << 32;
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+                               ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 5;
+                               break;
+                       /* Copy L2T/T2L */
+                       case 0x08:
+                               /* detile bit */
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       /* tiled src, linear dst */
+                                       src_offset = radeon_get_ib_value(p, idx+1);
+                                       src_offset <<= 8;
+                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+                                       dst_offset = radeon_get_ib_value(p, idx + 7);
+                                       dst_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
+                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
                                } else {
-                                       switch (misc) {
-                                       case 0:
-                                               /* detile bit */
-                                               if (idx_value & (1 << 31)) {
-                                                       /* tiled src, linear dst */
-                                                       src_offset = radeon_get_ib_value(p, idx+1);
-                                                       src_offset <<= 8;
-                                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
-
-                                                       dst_offset = radeon_get_ib_value(p, idx+7);
-                                                       dst_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
-                                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                               } else {
-                                                       /* linear src, tiled dst */
-                                                       src_offset = radeon_get_ib_value(p, idx+7);
-                                                       src_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
-                                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-
-                                                       dst_offset = radeon_get_ib_value(p, idx+1);
-                                                       dst_offset <<= 8;
-                                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
-                                               }
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               p->idx += 9;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
+                                       /* linear src, tiled dst */
+                                       src_offset = radeon_get_ib_value(p, idx+7);
+                                       src_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
+                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+                                       dst_offset = radeon_get_ib_value(p, idx+1);
+                                       dst_offset <<= 8;
+                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
                                }
-                       } else {
-                               if (new_cmd) {
-                                       switch (misc) {
-                                       case 0:
-                                               /* L2L, byte */
-                                               src_offset = radeon_get_ib_value(p, idx+2);
-                                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
-                                               dst_offset = radeon_get_ib_value(p, idx+1);
-                                               dst_offset |= ((u64)(radeon_get_ib_value(p, idx+3) & 0xff)) << 32;
-                                               if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
-                                                                src_offset + count, radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + count, radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-                                               ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-                                               ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                               ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               p->idx += 5;
-                                               break;
-                                       case 1:
-                                               /* L2L, partial */
-                                               if (p->family < CHIP_CAYMAN) {
-                                                       DRM_ERROR("L2L Partial is cayman only !\n");
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
-                                               ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
-                                               ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-
-                                               p->idx += 9;
-                                               break;
-                                       case 4:
-                                               /* L2L, dw, broadcast */
-                                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
-                                               if (r) {
-                                                       DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
-                                                       return -EINVAL;
-                                               }
-                                               dst_offset = radeon_get_ib_value(p, idx+1);
-                                               dst_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
-                                               dst2_offset = radeon_get_ib_value(p, idx+2);
-                                               dst2_offset |= ((u64)(radeon_get_ib_value(p, idx+5) & 0xff)) << 32;
-                                               src_offset = radeon_get_ib_value(p, idx+3);
-                                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+6) & 0xff)) << 32;
-                                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
-                                                                src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
-                                                                dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
-                                                       dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
-                                                                dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
-                                                       return -EINVAL;
-                                               }
-                                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                               ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                               ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
-                                               ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                               p->idx += 7;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               p->idx += 9;
+                               break;
+                       /* Copy L2L, byte aligned */
+                       case 0x40:
+                               /* L2L, byte */
+                               src_offset = radeon_get_ib_value(p, idx+2);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset |= ((u64)(radeon_get_ib_value(p, idx+3) & 0xff)) << 32;
+                               if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n",
+                                                       src_offset + count, radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + count, radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+                               ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+                               ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+                               ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 5;
+                               break;
+                       /* Copy L2L, partial */
+                       case 0x41:
+                               /* L2L, partial */
+                               if (p->family < CHIP_CAYMAN) {
+                                       DRM_ERROR("L2L Partial is cayman only !\n");
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff);
+                               ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff);
+                               ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+
+                               p->idx += 9;
+                               break;
+                       /* Copy L2L, DW aligned, broadcast */
+                       case 0x44:
+                               /* L2L, dw, broadcast */
+                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+                               if (r) {
+                                       DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+                               dst2_offset = radeon_get_ib_value(p, idx+2);
+                               dst2_offset |= ((u64)(radeon_get_ib_value(p, idx+5) & 0xff)) << 32;
+                               src_offset = radeon_get_ib_value(p, idx+3);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+6) & 0xff)) << 32;
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n",
+                                                       dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+                               ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff;
+                               ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 7;
+                               break;
+                       /* Copy L2T Frame to Field */
+                       case 0x48:
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+                               if (r) {
+                                       DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset <<= 8;
+                               dst2_offset = radeon_get_ib_value(p, idx+2);
+                               dst2_offset <<= 8;
+                               src_offset = radeon_get_ib_value(p, idx+8);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n",
+                                                       dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 10;
+                               break;
+                       /* Copy L2T/T2L, partial */
+                       case 0x49:
+                               /* L2T, T2L partial */
+                               if (p->family < CHIP_CAYMAN) {
+                                       DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+                                       return -EINVAL;
+                               }
+                               /* detile bit */
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       /* tiled src, linear dst */
+                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
+                               } else {
+                                       /* linear src, tiled dst */
+                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
+                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+                               }
+                               p->idx += 12;
+                               break;
+                       /* Copy L2T broadcast */
+                       case 0x4b:
+                               /* L2T, broadcast */
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+                               if (r) {
+                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset <<= 8;
+                               dst2_offset = radeon_get_ib_value(p, idx+2);
+                               dst2_offset <<= 8;
+                               src_offset = radeon_get_ib_value(p, idx+8);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+                                                       dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 10;
+                               break;
+                       /* Copy L2T/T2L (tile units) */
+                       case 0x4c:
+                               /* L2T, T2L */
+                               /* detile bit */
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       /* tiled src, linear dst */
+                                       src_offset = radeon_get_ib_value(p, idx+1);
+                                       src_offset <<= 8;
+                                       ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+
+                                       dst_offset = radeon_get_ib_value(p, idx+7);
+                                       dst_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
+                                       ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
                                } else {
-                                       /* L2L, dw */
-                                       src_offset = radeon_get_ib_value(p, idx+2);
-                                       src_offset |= ((u64)(radeon_get_ib_value(p, idx+4) & 0xff)) << 32;
+                                       /* linear src, tiled dst */
+                                       src_offset = radeon_get_ib_value(p, idx+7);
+                                       src_offset |= ((u64)(radeon_get_ib_value(p, idx+8) & 0xff)) << 32;
+                                       ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                                       ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+
                                        dst_offset = radeon_get_ib_value(p, idx+1);
-                                       dst_offset |= ((u64)(radeon_get_ib_value(p, idx+3) & 0xff)) << 32;
-                                       if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
-                                               dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n",
-                                                        src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
-                                               return -EINVAL;
-                                       }
-                                       if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
-                                               dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n",
-                                                        dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
-                                               return -EINVAL;
-                                       }
-                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
-                                       ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
-                                       ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
-                                       ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
-                                       p->idx += 5;
+                                       dst_offset <<= 8;
+                                       ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
                                }
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               p->idx += 9;
+                               break;
+                       /* Copy T2T, partial (tile units) */
+                       case 0x4d:
+                               /* T2T partial */
+                               if (p->family < CHIP_CAYMAN) {
+                                       DRM_ERROR("L2T, T2L Partial is cayman only !\n");
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+                               p->idx += 13;
+                               break;
+                       /* Copy L2T broadcast (tile units) */
+                       case 0x4f:
+                               /* L2T, broadcast */
+                               if (radeon_get_ib_value(p, idx + 2) & (1 << 31)) {
+                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               r = r600_dma_cs_next_reloc(p, &dst2_reloc);
+                               if (r) {
+                                       DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n");
+                                       return -EINVAL;
+                               }
+                               dst_offset = radeon_get_ib_value(p, idx+1);
+                               dst_offset <<= 8;
+                               dst2_offset = radeon_get_ib_value(p, idx+2);
+                               dst2_offset <<= 8;
+                               src_offset = radeon_get_ib_value(p, idx+8);
+                               src_offset |= ((u64)(radeon_get_ib_value(p, idx+9) & 0xff)) << 32;
+                               if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n",
+                                                       src_offset + (count * 4), radeon_bo_size(src_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n",
+                                                       dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) {
+                                       dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n",
+                                                       dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj));
+                                       return -EINVAL;
+                               }
+                               ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8);
+                               ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
+                               ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
+                               p->idx += 10;
+                               break;
+                       default:
+                               DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, header);
+                               return -EINVAL;
                        }
                        break;
                case DMA_PACKET_CONSTANT_FILL:
@@ -3583,19 +3383,19 @@ int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 
        do {
                pkt.idx = idx;
-               pkt.type = CP_PACKET_GET_TYPE(ib->ptr[idx]);
-               pkt.count = CP_PACKET_GET_COUNT(ib->ptr[idx]);
+               pkt.type = RADEON_CP_PACKET_GET_TYPE(ib->ptr[idx]);
+               pkt.count = RADEON_CP_PACKET_GET_COUNT(ib->ptr[idx]);
                pkt.one_reg_wr = 0;
                switch (pkt.type) {
-               case PACKET_TYPE0:
+               case RADEON_PACKET_TYPE0:
                        dev_err(rdev->dev, "Packet0 not allowed!\n");
                        ret = -EINVAL;
                        break;
-               case PACKET_TYPE2:
+               case RADEON_PACKET_TYPE2:
                        idx += 1;
                        break;
-               case PACKET_TYPE3:
-                       pkt.opcode = CP_PACKET3_GET_OPCODE(ib->ptr[idx]);
+               case RADEON_PACKET_TYPE3:
+                       pkt.opcode = RADEON_CP_PACKET3_GET_OPCODE(ib->ptr[idx]);
                        ret = evergreen_vm_packet3_check(rdev, ib->ptr, &pkt);
                        idx += pkt.count + 2;
                        break;
@@ -3623,88 +3423,79 @@ int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
 {
        u32 idx = 0;
-       u32 header, cmd, count, tiled, new_cmd, misc;
+       u32 header, cmd, count, sub_cmd;
 
        do {
                header = ib->ptr[idx];
                cmd = GET_DMA_CMD(header);
                count = GET_DMA_COUNT(header);
-               tiled = GET_DMA_T(header);
-               new_cmd = GET_DMA_NEW(header);
-               misc = GET_DMA_MISC(header);
+               sub_cmd = GET_DMA_SUB_CMD(header);
 
                switch (cmd) {
                case DMA_PACKET_WRITE:
-                       if (tiled)
+                       switch (sub_cmd) {
+                       /* tiled */
+                       case 8:
                                idx += count + 7;
-                       else
+                               break;
+                       /* linear */
+                       case 0:
                                idx += count + 3;
+                               break;
+                       default:
+                               DRM_ERROR("bad DMA_PACKET_WRITE [%6d] 0x%08x sub cmd is not 0 or 8\n", idx, ib->ptr[idx]);
+                               return -EINVAL;
+                       }
                        break;
                case DMA_PACKET_COPY:
-                       if (tiled) {
-                               if (new_cmd) {
-                                       switch (misc) {
-                                       case 0:
-                                               /* L2T, frame to fields */
-                                               idx += 10;
-                                               break;
-                                       case 1:
-                                               /* L2T, T2L partial */
-                                               idx += 12;
-                                               break;
-                                       case 3:
-                                               /* L2T, broadcast */
-                                               idx += 10;
-                                               break;
-                                       case 4:
-                                               /* L2T, T2L */
-                                               idx += 9;
-                                               break;
-                                       case 5:
-                                               /* T2T partial */
-                                               idx += 13;
-                                               break;
-                                       case 7:
-                                               /* L2T, broadcast */
-                                               idx += 10;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
-                               } else {
-                                       switch (misc) {
-                                       case 0:
-                                               idx += 9;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
-                               }
-                       } else {
-                               if (new_cmd) {
-                                       switch (misc) {
-                                       case 0:
-                                               /* L2L, byte */
-                                               idx += 5;
-                                               break;
-                                       case 1:
-                                               /* L2L, partial */
-                                               idx += 9;
-                                               break;
-                                       case 4:
-                                               /* L2L, dw, broadcast */
-                                               idx += 7;
-                                               break;
-                                       default:
-                                               DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc);
-                                               return -EINVAL;
-                                       }
-                               } else {
-                                       /* L2L, dw */
-                                       idx += 5;
-                               }
+                       switch (sub_cmd) {
+                       /* Copy L2L, DW aligned */
+                       case 0x00:
+                               idx += 5;
+                               break;
+                       /* Copy L2T/T2L */
+                       case 0x08:
+                               idx += 9;
+                               break;
+                       /* Copy L2L, byte aligned */
+                       case 0x40:
+                               idx += 5;
+                               break;
+                       /* Copy L2L, partial */
+                       case 0x41:
+                               idx += 9;
+                               break;
+                       /* Copy L2L, DW aligned, broadcast */
+                       case 0x44:
+                               idx += 7;
+                               break;
+                       /* Copy L2T Frame to Field */
+                       case 0x48:
+                               idx += 10;
+                               break;
+                       /* Copy L2T/T2L, partial */
+                       case 0x49:
+                               idx += 12;
+                               break;
+                       /* Copy L2T broadcast */
+                       case 0x4b:
+                               idx += 10;
+                               break;
+                       /* Copy L2T/T2L (tile units) */
+                       case 0x4c:
+                               idx += 9;
+                               break;
+                       /* Copy T2T, partial (tile units) */
+                       case 0x4d:
+                               idx += 13;
+                               break;
+                       /* Copy L2T broadcast (tile units) */
+                       case 0x4f:
+                               idx += 10;
+                               break;
+                       default:
+                               DRM_ERROR("bad DMA_PACKET_COPY [%6d] 0x%08x invalid sub cmd\n", idx, ib->ptr[idx]);
+                               return -EINVAL;
                        }
                        break;
                case DMA_PACKET_CONSTANT_FILL: