pixman git: add more NEON patches, bump SRCREV
author Koen Kooi <koen@openembedded.org>
Sun, 1 Nov 2009 19:01:27 +0000 (20:01 +0100)
committer Koen Kooi <koen@openembedded.org>
Sun, 1 Nov 2009 19:01:27 +0000 (20:01 +0100)
recipes/xorg-lib/pixman/neon-24bpp.patch [new file with mode: 0644]
recipes/xorg-lib/pixman/prefetch.patch [new file with mode: 0644]
recipes/xorg-lib/pixman_git.bb

diff --git a/recipes/xorg-lib/pixman/neon-24bpp.patch b/recipes/xorg-lib/pixman/neon-24bpp.patch
new file mode 100644 (file)
index 0000000..edfd367
--- /dev/null
@@ -0,0 +1,264 @@
+From b101c115102b83bb1fc4e28de6136dd4940796bc Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Fri, 30 Oct 2009 17:02:14 +0000
+Subject: ARM: initial 24bpp support
+
+---
+diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
+index 35e6a7e..7f91ced 100644
+--- a/pixman/pixman-arm-neon-asm.S
++++ b/pixman/pixman-arm-neon-asm.S
+@@ -977,3 +977,32 @@ generate_composite_function \
+     pixman_composite_over_8888_n_8888_process_pixblock_head, \
+     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+     pixman_composite_over_8888_n_8888_process_pixblock_tail_head
++
++/******************************************************************************/
++
++.macro pixman_composite_src_0888_0888_process_pixblock_head  /* empty: no per-pixel computation is done in this step */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail  /* empty as well, same as the head step */
++.endm
++
++.macro pixman_composite_src_0888_0888_process_pixblock_tail_head  /* the only step with instructions: store, then reload */
++    vst3.8 {d0, d1, d2}, [DST_W]!  /* write d0-d2 out as interleaved 3-byte elements, DST_W is post-incremented */
++    vld3.8 {d0, d1, d2}, [SRC]!    /* refill d0-d2 from SRC (de-interleaving load), SRC is post-incremented */
++    cache_preload 8, 8             /* std_increment = 8, boost_increment = 8 */
++.endm
++
++generate_composite_function \
++    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, /* symbol called from pixman-arm-neon.c; bpp values, presumably src/mask/dst - confirm against generate_composite_function */ \
++    FLAG_DST_WRITEONLY, /* presumably: destination is only written, never loaded - confirm */ \
++    8, /* number of pixels, processed in a single block */ \
++    10, /* prefetch distance */ \
++    default_init, \
++    default_cleanup, \
++    pixman_composite_src_0888_0888_process_pixblock_head, \
++    pixman_composite_src_0888_0888_process_pixblock_tail, \
++    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
++    0, /* dst_w_basereg */ \
++    0, /* dst_r_basereg */ \
++    0, /* src_basereg   */ \
++    0  /* mask_basereg  */
+diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
+index a2941ae..1653ef4 100644
+--- a/pixman/pixman-arm-neon-asm.h
++++ b/pixman/pixman-arm-neon-asm.h
+@@ -95,6 +95,14 @@
+     op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+ .endm
++.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand  /* emits op.elem_size on three whole D registers, with address writeback */
++    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
++.endm
++
++.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand  /* as pixldst3, but touches only lane idx of each D register */
++    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
++.endm
++
+ .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+ .if numbytes == 32
+     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+@@ -134,6 +142,18 @@
+ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                       %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8)
++    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4)
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2)
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1)
++    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+ .else
+     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+ .endif
+@@ -145,6 +165,18 @@
+ .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                       %(basereg+6), %(basereg+7), mem_operand, abits
++.elseif (bpp == 24) && (numpix == 8)
++    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
++.elseif (bpp == 24) && (numpix == 4)
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
++.elseif (bpp == 24) && (numpix == 2)
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
++.elseif (bpp == 24) && (numpix == 1)
++    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+ .else
+     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+ .endif
+@@ -334,6 +366,8 @@ fname:
+ .if src_bpp == 32
+     .set src_bpp_shift, 2
++.elseif src_bpp == 24
++    .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+     .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+@@ -345,6 +379,8 @@ fname:
+ .endif
+ .if mask_bpp == 32
+     .set mask_bpp_shift, 2
++.elseif mask_bpp == 24
++    .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+     .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+@@ -354,6 +390,8 @@ fname:
+ .endif
+ .if dst_w_bpp == 32
+     .set dst_bpp_shift, 2
++.elseif dst_w_bpp == 24
++    .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+     .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+@@ -398,6 +436,19 @@ fname:
+     PF mov      PF_CTL, H, lsl #4
+     PF add      PF_CTL, #(prefetch_distance - 0x10)
++.if src_bpp == 24
++    sub         SRC_STRIDE, SRC_STRIDE, W
++    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
++.endif
++.if mask_bpp == 24
++    sub         MASK_STRIDE, MASK_STRIDE, W
++    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
++.endif
++.if dst_w_bpp == 24
++    sub         DST_STRIDE, DST_STRIDE, W
++    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
++.endif
++
+     init
+ .if regs_shortage
+     push        {r0, r1}
+@@ -412,7 +463,8 @@ fname:
+     cmp         W, #(pixblock_size * 2)
+     blt         8f
+ 0:
+-    /* ensure 16 byte alignment of the destination buffer */
++    /* ensure 16 byte alignment of the destination buffer, except for 24bpp */
++.if dst_w_bpp != 24
+     tst         DST_R, #0xF
+     beq         2f
+@@ -454,6 +506,7 @@ fname:
+ .endif
+ .endr
+ 2:
++.endif
+     pixld_a     pixblock_size, dst_r_bpp, \
+                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+@@ -520,11 +573,13 @@ fname:
+ .if mask_bpp != 0
+     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+ .endif
++.if (dst_w_bpp != 24)
+     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+-.if src_bpp != 0
++.endif
++.if (src_bpp != 24) && (src_bpp != 0)
+     sub         SRC, SRC, W, lsl #src_bpp_shift
+ .endif
+-.if mask_bpp != 0
++.if (mask_bpp != 24) && (mask_bpp != 0)
+     sub         MASK, MASK, W, lsl #mask_bpp_shift
+ .endif
+     subs        H, H, #1
+@@ -539,7 +594,7 @@ fname:
+     cleanup
+     pop         {r4-r12, pc}  /* exit */
+-8: /* handle small rectangle, width up to 15 pixels */
++8: /* handle small rectangle, width up to (pixblock_size * 2 - 1) pixels */
+     tst         W, #pixblock_size
+     beq         1f
+     pixld       pixblock_size, dst_r_bpp, \
+@@ -592,11 +647,13 @@ fname:
+ .if mask_bpp != 0
+     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+ .endif
++.if (dst_w_bpp != 24)
+     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+-.if src_bpp != 0
++.endif
++.if (src_bpp != 24) && (src_bpp != 0)
+     sub         SRC, SRC, W, lsl #src_bpp_shift
+ .endif
+-.if mask_bpp != 0
++.if (mask_bpp != 24) && (mask_bpp != 0)
+     sub         MASK, MASK, W, lsl #mask_bpp_shift
+ .endif
+     subs        H, H, #1
+diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
+index 2811099..f3f38a9 100644
+--- a/pixman/pixman-arm-neon.c
++++ b/pixman/pixman-arm-neon.c
+@@ -2065,6 +2065,43 @@ neon_composite_src_8888_8888 (pixman_implementation_t *imp,
+ }
+ void
++pixman_composite_src_0888_0888_asm_neon (int32_t   w,
++                                         int32_t   h,
++                                         uint8_t  *dst,
++                                         int32_t   dst_stride,
++                                         uint8_t  *src,
++                                         int32_t   src_stride);  /* defined in pixman-arm-neon-asm.S via generate_composite_function */
++
++static void  /* fast path listed in arm_neon_fast_path_array for PIXMAN_OP_SRC, r8g8b8 -> r8g8b8 */
++neon_composite_src_0888_0888 (pixman_implementation_t *imp,
++                              pixman_op_t              op,
++                              pixman_image_t *         src_image,
++                              pixman_image_t *         mask_image,
++                              pixman_image_t *         dst_image,
++                              int32_t                  src_x,
++                              int32_t                  src_y,
++                              int32_t                  mask_x,
++                              int32_t                  mask_y,
++                              int32_t                  dest_x,
++                              int32_t                  dest_y,
++                              int32_t                  width,
++                              int32_t                  height)
++{   /* imp, op, mask_image, mask_x and mask_y are not referenced in this body */
++    uint8_t *dst_line;
++    uint8_t *src_line;
++    int32_t dst_stride, src_stride;
++    /* element type uint8_t, last argument 3: presumably 3 bytes per pixel - confirm in PIXMAN_IMAGE_GET_LINE */
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t,
++                           src_stride, src_line, 3);
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t,
++                           dst_stride, dst_line, 3);
++    /* a single call hands the whole width x height rectangle to the assembly routine */
++    pixman_composite_src_0888_0888_asm_neon (width, height,
++                                             dst_line, dst_stride,
++                                             src_line, src_stride);
++}
++
++void
+ pixman_composite_over_8888_8888_asm_neon (int32_t   w,
+                                           int32_t   h,
+                                           uint32_t *dst,
+@@ -2449,6 +2486,7 @@ static const pixman_fast_path_t arm_neon_fast_path_array[] =
+     { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_src_8888_8888,    0 },
+     { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_src_8888_8888,    0 },
+     { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_0565_0565,    0 },
++    { PIXMAN_OP_SRC,  PIXMAN_r8g8b8,   PIXMAN_null,     PIXMAN_r8g8b8,   neon_composite_src_0888_0888,    0 },
+     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
+     { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
+     { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
+--
+cgit v0.8.2
diff --git a/recipes/xorg-lib/pixman/prefetch.patch b/recipes/xorg-lib/pixman/prefetch.patch
new file mode 100644 (file)
index 0000000..c2e856e
--- /dev/null
@@ -0,0 +1,298 @@
+From d0044bfbd596f22ed1560579ea6537b39f3dc1af Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Thu, 29 Oct 2009 19:06:42 +0000
+Subject: ARM: Don't emit prefetch code if prefetch distance is set to 0
+
+Also it is now possible to disable prefetch globally with
+a configuration macro
+---
+diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
+index bca499a..35e6a7e 100644
+--- a/pixman/pixman-arm-neon-asm.S
++++ b/pixman/pixman-arm-neon-asm.S
+@@ -219,33 +219,33 @@
+     vshrn.u16   d7, q2, #3
+     vsli.u16    q2, q2, #5
+         vshll.u8    q14, d16, #8
+-                                    add PF_X, PF_X, #8
++                                    PF add PF_X, PF_X, #8
+         vshll.u8    q8, d19, #8
+-                                    tst PF_CTL, #0xF
++                                    PF tst PF_CTL, #0xF
+     vsri.u8     d6, d6, #5
+-                                    addne PF_X, PF_X, #8
++                                    PF addne PF_X, PF_X, #8
+     vmvn.8      d3, d3
+-                                    subne PF_CTL, PF_CTL, #1
++                                    PF subne PF_CTL, PF_CTL, #1
+     vsri.u8     d7, d7, #6
+     vshrn.u16   d30, q2, #2
+     vmull.u8    q10, d3, d6
+-                                    pld [PF_SRC, PF_X, lsl #src_bpp_shift]
++                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+     vmull.u8    q11, d3, d7
+     vmull.u8    q12, d3, d30
+-                                    pld [PF_DST, PF_X, lsl #dst_bpp_shift]
++                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+         vsri.u16    q14, q8, #5
+-                                    cmp PF_X, ORIG_W
++                                    PF cmp PF_X, ORIG_W
+         vshll.u8    q9, d18, #8
+     vrshr.u16   q13, q10, #8
+-                                    subge PF_X, PF_X, ORIG_W
++                                    PF subge PF_X, PF_X, ORIG_W
+     vrshr.u16   q3, q11, #8
+     vrshr.u16   q15, q12, #8
+-                                    subges PF_CTL, PF_CTL, #0x10
++                                    PF subges PF_CTL, PF_CTL, #0x10
+         vsri.u16    q14, q9, #11
+-                                    ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+     vraddhn.u16 d20, q10, q13
+     vraddhn.u16 d23, q11, q3
+-                                    ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+     vraddhn.u16 d22, q12, q15
+         vst1.16     {d28, d29}, [DST_W, :128]!
+ .endm
+@@ -323,20 +323,20 @@ generate_composite_function \
+ .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+         vsri.u16    q14, q8, #5
+-                                    add PF_X, PF_X, #8
+-                                    tst PF_CTL, #0xF
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
+     vld4.8      {d0, d1, d2, d3}, [SRC]!
+-                                    addne PF_X, PF_X, #8
+-                                    subne PF_CTL, PF_CTL, #1
++                                    PF addne PF_X, PF_X, #8
++                                    PF subne PF_CTL, PF_CTL, #1
+         vsri.u16    q14, q9, #11
+-                                    cmp PF_X, ORIG_W
+-                                    pld [PF_SRC, PF_X, lsl #src_bpp_shift]
++                                    PF cmp PF_X, ORIG_W
++                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+     vshll.u8    q8, d1, #8
+         vst1.16     {d28, d29}, [DST_W, :128]!
+-                                    subge PF_X, PF_X, ORIG_W
+-                                    subges PF_CTL, PF_CTL, #0x10
++                                    PF subge PF_X, PF_X, ORIG_W
++                                    PF subges PF_CTL, PF_CTL, #0x10
+     vshll.u8    q14, d2, #8
+-                                    ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+     vshll.u8    q9, d0, #8
+ .endm
+@@ -363,20 +363,20 @@ generate_composite_function \
+ .macro pixman_composite_add_8000_8000_process_pixblock_tail_head
+     vld1.8      {d0, d1, d2, d3}, [SRC]!
+-                                    add PF_X, PF_X, #32
+-                                    tst PF_CTL, #0xF
++                                    PF add PF_X, PF_X, #32
++                                    PF tst PF_CTL, #0xF
+     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+-                                    addne PF_X, PF_X, #32
+-                                    subne PF_CTL, PF_CTL, #1
++                                    PF addne PF_X, PF_X, #32
++                                    PF subne PF_CTL, PF_CTL, #1
+         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+-                                    cmp PF_X, ORIG_W
+-                                    pld [PF_SRC, PF_X, lsl #src_bpp_shift]
+-                                    pld [PF_DST, PF_X, lsl #dst_bpp_shift]
+-                                    subge PF_X, PF_X, ORIG_W
+-                                    subges PF_CTL, PF_CTL, #0x10
++                                    PF cmp PF_X, ORIG_W
++                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
++                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
++                                    PF subge PF_X, PF_X, ORIG_W
++                                    PF subges PF_CTL, PF_CTL, #0x10
+     vqadd.u8    q14, q0, q2
+-                                    ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+-                                    ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+     vqadd.u8    q15, q1, q3
+ .endm
+@@ -418,32 +418,32 @@ generate_composite_function \
+ .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+         vrshr.u16   q14, q8, #8
+-                                    add PF_X, PF_X, #8
+-                                    tst PF_CTL, #0xF
++                                    PF add PF_X, PF_X, #8
++                                    PF tst PF_CTL, #0xF
+         vrshr.u16   q15, q9, #8
+         vrshr.u16   q12, q10, #8
+         vrshr.u16   q13, q11, #8
+-                                    addne PF_X, PF_X, #8
+-                                    subne PF_CTL, PF_CTL, #1
++                                    PF addne PF_X, PF_X, #8
++                                    PF subne PF_CTL, PF_CTL, #1
+         vraddhn.u16 d28, q14, q8
+         vraddhn.u16 d29, q15, q9
+-                                    cmp PF_X, ORIG_W
++                                    PF cmp PF_X, ORIG_W
+         vraddhn.u16 d30, q12, q10
+         vraddhn.u16 d31, q13, q11
+         vqadd.u8    q14, q0, q14
+         vqadd.u8    q15, q1, q15
+     vld4.8      {d0, d1, d2, d3}, [SRC]!
+-                                    pld [PF_SRC, PF_X, lsl #src_bpp_shift]
++                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+     vmvn.8      d22, d3
+-                                    pld [PF_DST, PF_X, lsl #dst_bpp_shift]
++                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+-                                    subge PF_X, PF_X, ORIG_W
++                                    PF subge PF_X, PF_X, ORIG_W
+     vmull.u8    q8, d22, d4
+-                                    subges PF_CTL, PF_CTL, #0x10
++                                    PF subges PF_CTL, PF_CTL, #0x10
+     vmull.u8    q9, d22, d5
+-                                    ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+     vmull.u8    q10, d22, d6
+-                                    ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+     vmull.u8    q11, d22, d7
+ .endm
+diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
+index d276ab9..a2941ae 100644
+--- a/pixman/pixman-arm-neon-asm.h
++++ b/pixman/pixman-arm-neon-asm.h
+@@ -58,6 +58,11 @@
+ #define RESPECT_STRICT_ALIGNMENT 1
+ /*
++ * If set to a nonzero value, prefetch is globally disabled (PF emits nothing)
++ */
++#define PREFETCH_GLOBALLY_DISABLED 0
++
++/*
+  * Definitions of supplementary pixld/pixst macros (for partial load/store of
+  * pixel data)
+  */
+@@ -218,37 +223,43 @@
+  * pixels processing like simple copy. Anyway, having prefetch is a must
+  * when working with graphics data.
+  */
++.macro PF a, x:vararg  /* prefetch-only wrapper: a = mnemonic, x = the rest of the instruction */
++.if (ADVANCED_PREFETCH_ENABLED != 0) && (PREFETCH_GLOBALLY_DISABLED == 0)  /* first flag is 0 when prefetch_distance == 0 */
++    a x
++.endif
++.endm
++
+ .macro cache_preload std_increment, boost_increment
+ .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+ .if regs_shortage
+-    ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
++    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+ .endif
+ .if std_increment != 0
+-    add PF_X, PF_X, #std_increment
++    PF add PF_X, PF_X, #std_increment
+ .endif
+-    tst PF_CTL, #0xF
+-    addne PF_X, PF_X, #boost_increment
+-    subne PF_CTL, PF_CTL, #1
+-    cmp PF_X, ORIG_W
++    PF tst PF_CTL, #0xF
++    PF addne PF_X, PF_X, #boost_increment
++    PF subne PF_CTL, PF_CTL, #1
++    PF cmp PF_X, ORIG_W
+ .if src_bpp_shift >= 0
+-    pld [PF_SRC, PF_X, lsl #src_bpp_shift]
++    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+ .endif
+ .if dst_r_bpp != 0
+-    pld [PF_DST, PF_X, lsl #dst_bpp_shift]
++    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+ .endif
+ .if mask_bpp_shift >= 0
+-    pld [PF_MASK, PF_X, lsl #mask_bpp_shift]
++    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ .endif
+-    subge PF_X, PF_X, ORIG_W
+-    subges PF_CTL, PF_CTL, #0x10
++    PF subge PF_X, PF_X, ORIG_W
++    PF subges PF_CTL, PF_CTL, #0x10
+ .if src_bpp_shift >= 0
+-    ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
++    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+ .endif
+ .if dst_r_bpp != 0
+-    ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
++    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+ .endif
+ .if mask_bpp_shift >= 0
+-    ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
++    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ .endif
+ .endif
+ .endm
+@@ -297,6 +308,12 @@ fname:
+     PF_DST      .req        r12
+     PF_MASK     .req        r14
++.if prefetch_distance == 0
++    .set ADVANCED_PREFETCH_ENABLED, 0
++.else
++    .set ADVANCED_PREFETCH_ENABLED, 1
++.endif
++
+ .if mask_bpp == 0
+     ORIG_W      .req        r7      /* saved original width */
+     DUMMY       .req        r8      /* temporary register */
+@@ -374,12 +391,12 @@ fname:
+     ldr         MASK_STRIDE, [sp, #52]
+ .endif
+     mov         DST_R, DST_W
+-    mov         PF_SRC, SRC
+-    mov         PF_DST, DST_R
+-    mov         PF_MASK, MASK
+-    mov         PF_CTL, H, lsl #4
+-    /* pf_ctl = 10 | ((h - 1) << 4) */
+-    add         PF_CTL, #(prefetch_distance - 0x10)
++    PF mov      PF_SRC, SRC
++    PF mov      PF_DST, DST_R
++    PF mov      PF_MASK, MASK
++    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
++    PF mov      PF_CTL, H, lsl #4
++    PF add      PF_CTL, #(prefetch_distance - 0x10)
+     init
+ .if regs_shortage
+@@ -412,7 +429,7 @@ fname:
+ .else
+     add         DST_R, DST_R, #lowbit
+ .endif
+-    add         PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
++    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+     sub         W, W, #(lowbit * 8 / dst_w_bpp)
+ 1:
+ .endif
+@@ -444,7 +461,7 @@ fname:
+                 (src_basereg - pixblock_size * src_bpp / 64), SRC
+     pixld       pixblock_size, mask_bpp, \
+                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+-    add         PF_X, PF_X, #pixblock_size
++    PF add      PF_X, PF_X, #pixblock_size
+     process_pixblock_head
+     cache_preload 0, pixblock_size
+     subs        W, W, #(pixblock_size * 2)
+@@ -468,7 +485,7 @@ fname:
+     pixld       chunk_size, src_bpp, src_basereg, SRC
+     pixld       chunk_size, mask_bpp, mask_basereg, MASK
+     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+-    add         PF_X, PF_X, #chunk_size
++    PF add      PF_X, PF_X, #chunk_size
+ 1:
+ .endif
+ .endr
+--
+cgit v0.8.2
diff --git a/recipes/xorg-lib/pixman_git.bb b/recipes/xorg-lib/pixman_git.bb
index 8a0ee26..ffca7be 100644 (file)
@@ -4,10 +4,10 @@ DESCRIPTION = "Low-level pixel manipulation library."
 LICENSE = "X11"
 
 PV = "0.17.1"
-PR = "r2"
+PR = "r3"
 PR_append = "+gitr${SRCREV}"
 
-SRCREV = "dc46ad274a47d351bacf3c2167c359d23dbaf8b3"
+SRCREV = "67bf739187cd43b5fff754b25693f76bb788d1fa"
 
 DEFAULT_PREFERENCE = "-1"
 DEFAULT_PREFERENCE_angstrom = "1"
@@ -22,6 +22,8 @@ file://0007-ARM-Enabled-new-NEON-optimizations.patch;patch=1 \
            file://pixman-28986.patch;patch=1 \
            file://nearest-neighbour.patch;patch=1 \
            file://over-8888-0565.patch;patch=1 \
+file://prefetch.patch;patch=1 \
+file://neon-24bpp.patch;patch=1 \
 "
 
 S = "${WORKDIR}/git"