Btrfs: Optimize compressed writeback and reads
[pandora-kernel.git] / fs / btrfs / zlib.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  *
18  * Based on jffs2 zlib code:
19  * Copyright © 2001-2007 Red Hat, Inc.
20  * Created by David Woodhouse <dwmw2@infradead.org>
21  */
22
23 #include <linux/kernel.h>
24 #include <linux/slab.h>
25 #include <linux/zlib.h>
26 #include <linux/zutil.h>
27 #include <linux/vmalloc.h>
28 #include <linux/init.h>
29 #include <linux/err.h>
30 #include <linux/sched.h>
31 #include <linux/pagemap.h>
32 #include <linux/bio.h>
33
/*
 * Number of output bytes held back so a deflate stream can be closed.
 *
 * The plan this constant belongs to: call deflate() with
 * avail_in == *sourcelen, avail_out == *dstlen - 12 and flush == Z_FINISH.
 * If deflate() does not manage to finish, call it again with
 * avail_in == 0 and avail_out set to the remaining 12 bytes so that it
 * can clean up.
 *
 * Open question left by the original authors: is 12 bytes sufficient?
 *
 * NOTE(review): nothing in the visible part of this file references this
 * macro or follows the plan above; it looks like a leftover from the
 * jffs2 zlib code this file is based on -- confirm before relying on it.
 */
#define STREAM_END_SPACE 12
42
43 struct workspace {
44         z_stream inf_strm;
45         z_stream def_strm;
46         char *buf;
47         struct list_head list;
48 };
49
50 static LIST_HEAD(idle_workspace);
51 static DEFINE_SPINLOCK(workspace_lock);
52 static unsigned long num_workspace;
53 static atomic_t alloc_workspace = ATOMIC_INIT(0);
54 static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
55
56 /*
57  * this finds an available zlib workspace or allocates a new one
58  * NULL or an ERR_PTR is returned if things go bad.
59  */
60 static struct workspace *find_zlib_workspace(void)
61 {
62         struct workspace *workspace;
63         int ret;
64         int cpus = num_online_cpus();
65
66 again:
67         spin_lock(&workspace_lock);
68         if (!list_empty(&idle_workspace)) {
69                 workspace = list_entry(idle_workspace.next, struct workspace,
70                                        list);
71                 list_del(&workspace->list);
72                 num_workspace--;
73                 spin_unlock(&workspace_lock);
74                 return workspace;
75
76         }
77         spin_unlock(&workspace_lock);
78         if (atomic_read(&alloc_workspace) > cpus) {
79                 DEFINE_WAIT(wait);
80                 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
81                 if (atomic_read(&alloc_workspace) > cpus)
82                         schedule();
83                 finish_wait(&workspace_wait, &wait);
84                 goto again;
85         }
86         atomic_inc(&alloc_workspace);
87         workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
88         if (!workspace) {
89                 ret = -ENOMEM;
90                 goto fail;
91         }
92
93         workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
94         if (!workspace->def_strm.workspace) {
95                 ret = -ENOMEM;
96                 goto fail;
97         }
98         workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
99         if (!workspace->inf_strm.workspace) {
100                 ret = -ENOMEM;
101                 goto fail_inflate;
102         }
103         workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
104         if (!workspace->buf) {
105                 ret = -ENOMEM;
106                 goto fail_kmalloc;
107         }
108         return workspace;
109
110 fail_kmalloc:
111         vfree(workspace->inf_strm.workspace);
112 fail_inflate:
113         vfree(workspace->def_strm.workspace);
114 fail:
115         kfree(workspace);
116         atomic_dec(&alloc_workspace);
117         wake_up(&workspace_wait);
118         return ERR_PTR(ret);
119 }
120
121 /*
122  * put a workspace struct back on the list or free it if we have enough
123  * idle ones sitting around
124  */
125 static int free_workspace(struct workspace *workspace)
126 {
127         spin_lock(&workspace_lock);
128         if (num_workspace < num_online_cpus()) {
129                 list_add_tail(&workspace->list, &idle_workspace);
130                 num_workspace++;
131                 spin_unlock(&workspace_lock);
132                 if (waitqueue_active(&workspace_wait))
133                         wake_up(&workspace_wait);
134                 return 0;
135         }
136         spin_unlock(&workspace_lock);
137         vfree(workspace->def_strm.workspace);
138         vfree(workspace->inf_strm.workspace);
139         kfree(workspace->buf);
140         kfree(workspace);
141
142         atomic_dec(&alloc_workspace);
143         if (waitqueue_active(&workspace_wait))
144                 wake_up(&workspace_wait);
145         return 0;
146 }
147
148 /*
149  * cleanup function for module exit
150  */
151 static void free_workspaces(void)
152 {
153         struct workspace *workspace;
154         while(!list_empty(&idle_workspace)) {
155                 workspace = list_entry(idle_workspace.next, struct workspace,
156                                        list);
157                 list_del(&workspace->list);
158                 vfree(workspace->def_strm.workspace);
159                 vfree(workspace->inf_strm.workspace);
160                 kfree(workspace->buf);
161                 kfree(workspace);
162                 atomic_dec(&alloc_workspace);
163         }
164 }
165
166 /*
167  * given an address space and start/len, compress the bytes.
168  *
169  * pages are allocated to hold the compressed result and stored
170  * in 'pages'
171  *
172  * out_pages is used to return the number of pages allocated.  There
173  * may be pages allocated even if we return an error
174  *
175  * total_in is used to return the number of bytes actually read.  It
176  * may be smaller then len if we had to exit early because we
177  * ran out of room in the pages array or because we cross the
178  * max_out threshold.
179  *
180  * total_out is used to return the total number of compressed bytes
181  *
182  * max_out tells us the max number of bytes that we're allowed to
183  * stuff into pages
184  */
185 int btrfs_zlib_compress_pages(struct address_space *mapping,
186                               u64 start, unsigned long len,
187                               struct page **pages,
188                               unsigned long nr_dest_pages,
189                               unsigned long *out_pages,
190                               unsigned long *total_in,
191                               unsigned long *total_out,
192                               unsigned long max_out)
193 {
194         int ret;
195         struct workspace *workspace;
196         char *data_in;
197         char *cpage_out;
198         int nr_pages = 0;
199         struct page *in_page = NULL;
200         struct page *out_page = NULL;
201         int out_written = 0;
202         int in_read = 0;
203         unsigned long bytes_left;
204
205         *out_pages = 0;
206         *total_out = 0;
207         *total_in = 0;
208
209         workspace = find_zlib_workspace();
210         if (!workspace)
211                 return -1;
212
213         if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
214                 printk(KERN_WARNING "deflateInit failed\n");
215                 ret = -1;
216                 goto out;
217         }
218
219         workspace->def_strm.total_in = 0;
220         workspace->def_strm.total_out = 0;
221
222         in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
223         data_in = kmap(in_page);
224
225         out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
226         cpage_out = kmap(out_page);
227         pages[0] = out_page;
228         nr_pages = 1;
229
230         workspace->def_strm.next_in = data_in;
231         workspace->def_strm.next_out = cpage_out;
232         workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
233         workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
234
235         out_written = 0;
236         in_read = 0;
237
238         while (workspace->def_strm.total_in < len) {
239                 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
240                 if (ret != Z_OK) {
241                         printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
242                                ret);
243                         zlib_deflateEnd(&workspace->def_strm);
244                         ret = -1;
245                         goto out;
246                 }
247
248                 /* we're making it bigger, give up */
249                 if (workspace->def_strm.total_in > 8192 &&
250                     workspace->def_strm.total_in <
251                     workspace->def_strm.total_out) {
252                         ret = -1;
253                         goto out;
254                 }
255                 /* we need another page for writing out.  Test this
256                  * before the total_in so we will pull in a new page for
257                  * the stream end if required
258                  */
259                 if (workspace->def_strm.avail_out == 0) {
260                         kunmap(out_page);
261                         if (nr_pages == nr_dest_pages) {
262                                 out_page = NULL;
263                                 ret = -1;
264                                 goto out;
265                         }
266                         out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
267                         cpage_out = kmap(out_page);
268                         pages[nr_pages] = out_page;
269                         nr_pages++;
270                         workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
271                         workspace->def_strm.next_out = cpage_out;
272                 }
273                 /* we're all done */
274                 if (workspace->def_strm.total_in >= len)
275                         break;
276
277                 /* we've read in a full page, get a new one */
278                 if (workspace->def_strm.avail_in == 0) {
279                         if (workspace->def_strm.total_out > max_out)
280                                 break;
281
282                         bytes_left = len - workspace->def_strm.total_in;
283                         kunmap(in_page);
284                         page_cache_release(in_page);
285
286                         start += PAGE_CACHE_SIZE;
287                         in_page = find_get_page(mapping,
288                                                 start >> PAGE_CACHE_SHIFT);
289                         data_in = kmap(in_page);
290                         workspace->def_strm.avail_in = min(bytes_left,
291                                                            PAGE_CACHE_SIZE);
292                         workspace->def_strm.next_in = data_in;
293                 }
294         }
295         workspace->def_strm.avail_in = 0;
296         ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
297         zlib_deflateEnd(&workspace->def_strm);
298
299         if (ret != Z_STREAM_END) {
300                 ret = -1;
301                 goto out;
302         }
303
304         if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
305                 ret = -1;
306                 goto out;
307         }
308
309         ret = 0;
310         *total_out = workspace->def_strm.total_out;
311         *total_in = workspace->def_strm.total_in;
312 out:
313         *out_pages = nr_pages;
314         if (out_page)
315                 kunmap(out_page);
316
317         if (in_page) {
318                 kunmap(in_page);
319                 page_cache_release(in_page);
320         }
321         free_workspace(workspace);
322         return ret;
323 }
324
325 /*
326  * pages_in is an array of pages with compressed data.
327  *
328  * disk_start is the starting logical offset of this array in the file
329  *
330  * bvec is a bio_vec of pages from the file that we want to decompress into
331  *
332  * vcnt is the count of pages in the biovec
333  *
334  * srclen is the number of bytes in pages_in
335  *
336  * The basic idea is that we have a bio that was created by readpages.
337  * The pages in the bio are for the uncompressed data, and they may not
338  * be contiguous.  They all correspond to the range of bytes covered by
339  * the compressed extent.
340  */
341 int btrfs_zlib_decompress_biovec(struct page **pages_in,
342                               u64 disk_start,
343                               struct bio_vec *bvec,
344                               int vcnt,
345                               size_t srclen)
346 {
347         int ret = 0;
348         int wbits = MAX_WBITS;
349         struct workspace *workspace;
350         char *data_in;
351         size_t total_out = 0;
352         unsigned long page_bytes_left;
353         unsigned long page_in_index = 0;
354         unsigned long page_out_index = 0;
355         struct page *page_out;
356         unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
357                                         PAGE_CACHE_SIZE;
358         unsigned long buf_start;
359         unsigned long buf_offset;
360         unsigned long bytes;
361         unsigned long working_bytes;
362         unsigned long pg_offset;
363         unsigned long start_byte;
364         unsigned long current_buf_start;
365         char *kaddr;
366
367         workspace = find_zlib_workspace();
368         if (!workspace)
369                 return -ENOMEM;
370
371         data_in = kmap(pages_in[page_in_index]);
372         workspace->inf_strm.next_in = data_in;
373         workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
374         workspace->inf_strm.total_in = 0;
375
376         workspace->inf_strm.total_out = 0;
377         workspace->inf_strm.next_out = workspace->buf;
378         workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
379         page_out = bvec[page_out_index].bv_page;
380         page_bytes_left = PAGE_CACHE_SIZE;
381         pg_offset = 0;
382
383         /* If it's deflate, and it's got no preset dictionary, then
384            we can tell zlib to skip the adler32 check. */
385         if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
386             ((data_in[0] & 0x0f) == Z_DEFLATED) &&
387             !(((data_in[0]<<8) + data_in[1]) % 31)) {
388
389                 wbits = -((data_in[0] >> 4) + 8);
390                 workspace->inf_strm.next_in += 2;
391                 workspace->inf_strm.avail_in -= 2;
392         }
393
394         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
395                 printk(KERN_WARNING "inflateInit failed\n");
396                 ret = -1;
397                 goto out;
398         }
399         while(workspace->inf_strm.total_in < srclen) {
400                 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
401                 if (ret != Z_OK && ret != Z_STREAM_END) {
402                         break;
403                 }
404
405                 /*
406                  * buf start is the byte offset we're of the start of
407                  * our workspace buffer
408                  */
409                 buf_start = total_out;
410
411                 /* total_out is the last byte of the workspace buffer */
412                 total_out = workspace->inf_strm.total_out;
413
414                 working_bytes = total_out - buf_start;
415
416                 /*
417                  * start byte is the first byte of the page we're currently
418                  * copying into relative to the start of the compressed data.
419                  */
420                 start_byte = page_offset(page_out) - disk_start;
421
422                 if (working_bytes == 0) {
423                         /* we didn't make progress in this inflate
424                          * call, we're done
425                          */
426                         if (ret != Z_STREAM_END) {
427                                 ret = -1;
428                         }
429                         break;
430                 }
431
432                 /* we haven't yet hit data corresponding to this page */
433                 if (total_out <= start_byte) {
434                         goto next;
435                 }
436
437                 /*
438                  * the start of the data we care about is offset into
439                  * the middle of our working buffer
440                  */
441                 if (total_out > start_byte && buf_start < start_byte) {
442                         buf_offset = start_byte - buf_start;
443                         working_bytes -= buf_offset;
444                 } else {
445                         buf_offset = 0;
446                 }
447                 current_buf_start = buf_start;
448
449                 /* copy bytes from the working buffer into the pages */
450                 while(working_bytes > 0) {
451                         bytes = min(PAGE_CACHE_SIZE - pg_offset,
452                                     PAGE_CACHE_SIZE - buf_offset);
453                         bytes = min(bytes, working_bytes);
454                         kaddr = kmap_atomic(page_out, KM_USER0);
455                         memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
456                                bytes);
457                         kunmap_atomic(kaddr, KM_USER0);
458                         flush_dcache_page(page_out);
459
460                         pg_offset += bytes;
461                         page_bytes_left -= bytes;
462                         buf_offset += bytes;
463                         working_bytes -= bytes;
464                         current_buf_start += bytes;
465
466                         /* check if we need to pick another page */
467                         if (page_bytes_left == 0) {
468                                 page_out_index++;
469                                 if (page_out_index >= vcnt) {
470                                         ret = 0;
471                                         goto done;
472                                 }
473                                 page_out = bvec[page_out_index].bv_page;
474                                 pg_offset = 0;
475                                 page_bytes_left = PAGE_CACHE_SIZE;
476                                 start_byte = page_offset(page_out) - disk_start;
477
478                                 /*
479                                  * make sure our new page is covered by this
480                                  * working buffer
481                                  */
482                                 if (total_out <= start_byte) {
483                                         goto next;
484                                 }
485
486                                 /* the next page in the biovec might not
487                                  * be adjacent to the last page, but it
488                                  * might still be found inside this working
489                                  * buffer.  bump our offset pointer
490                                  */
491                                 if (total_out > start_byte &&
492                                     current_buf_start < start_byte) {
493                                         buf_offset = start_byte - buf_start;
494                                         working_bytes = total_out - start_byte;
495                                         current_buf_start = buf_start +
496                                                 buf_offset;
497                                 }
498                         }
499                 }
500 next:
501                 workspace->inf_strm.next_out = workspace->buf;
502                 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
503
504                 if (workspace->inf_strm.avail_in == 0) {
505                         unsigned long tmp;
506                         kunmap(pages_in[page_in_index]);
507                         page_in_index++;
508                         if (page_in_index >= total_pages_in) {
509                                 data_in = NULL;
510                                 break;
511                         }
512                         data_in = kmap(pages_in[page_in_index]);
513                         workspace->inf_strm.next_in = data_in;
514                         tmp = srclen - workspace->inf_strm.total_in;
515                         workspace->inf_strm.avail_in = min(tmp,
516                                                            PAGE_CACHE_SIZE);
517                 }
518         }
519         if (ret != Z_STREAM_END) {
520                 ret = -1;
521         } else {
522                 ret = 0;
523         }
524 done:
525         zlib_inflateEnd(&workspace->inf_strm);
526         if (data_in)
527                 kunmap(pages_in[page_in_index]);
528 out:
529         free_workspace(workspace);
530         return ret;
531 }
532
533 /*
534  * a less complex decompression routine.  Our compressed data fits in a
535  * single page, and we want to read a single page out of it.
536  * start_byte tells us the offset into the compressed data we're interested in
537  */
538 int btrfs_zlib_decompress(unsigned char *data_in,
539                           struct page *dest_page,
540                           unsigned long start_byte,
541                           size_t srclen, size_t destlen)
542 {
543         int ret = 0;
544         int wbits = MAX_WBITS;
545         struct workspace *workspace;
546         unsigned long bytes_left = destlen;
547         unsigned long total_out = 0;
548         char *kaddr;
549
550         if (destlen > PAGE_CACHE_SIZE)
551                 return -ENOMEM;
552
553         workspace = find_zlib_workspace();
554         if (!workspace)
555                 return -ENOMEM;
556
557         workspace->inf_strm.next_in = data_in;
558         workspace->inf_strm.avail_in = srclen;
559         workspace->inf_strm.total_in = 0;
560
561         workspace->inf_strm.next_out = workspace->buf;
562         workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
563         workspace->inf_strm.total_out = 0;
564         /* If it's deflate, and it's got no preset dictionary, then
565            we can tell zlib to skip the adler32 check. */
566         if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
567             ((data_in[0] & 0x0f) == Z_DEFLATED) &&
568             !(((data_in[0]<<8) + data_in[1]) % 31)) {
569
570                 wbits = -((data_in[0] >> 4) + 8);
571                 workspace->inf_strm.next_in += 2;
572                 workspace->inf_strm.avail_in -= 2;
573         }
574
575         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
576                 printk(KERN_WARNING "inflateInit failed\n");
577                 ret = -1;
578                 goto out;
579         }
580
581         while(bytes_left > 0) {
582                 unsigned long buf_start;
583                 unsigned long buf_offset;
584                 unsigned long bytes;
585                 unsigned long pg_offset = 0;
586
587                 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
588                 if (ret != Z_OK && ret != Z_STREAM_END) {
589                         break;
590                 }
591
592                 buf_start = total_out;
593                 total_out = workspace->inf_strm.total_out;
594
595                 if (total_out == buf_start) {
596                         ret = -1;
597                         break;
598                 }
599
600                 if (total_out <= start_byte) {
601                         goto next;
602                 }
603
604                 if (total_out > start_byte && buf_start < start_byte) {
605                         buf_offset = start_byte - buf_start;
606                 } else {
607                         buf_offset = 0;
608                 }
609
610                 bytes = min(PAGE_CACHE_SIZE - pg_offset,
611                             PAGE_CACHE_SIZE - buf_offset);
612                 bytes = min(bytes, bytes_left);
613
614                 kaddr = kmap_atomic(dest_page, KM_USER0);
615                 memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
616                 kunmap_atomic(kaddr, KM_USER0);
617
618                 pg_offset += bytes;
619                 bytes_left -= bytes;
620 next:
621                 workspace->inf_strm.next_out = workspace->buf;
622                 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
623         }
624         if (ret != Z_STREAM_END && bytes_left != 0) {
625                 ret = -1;
626         } else {
627                 ret = 0;
628         }
629         zlib_inflateEnd(&workspace->inf_strm);
630 out:
631         free_workspace(workspace);
632         return ret;
633 }
634
/*
 * Tear down the zlib workspace pool by freeing every workspace that is
 * still on the idle list.
 */
void btrfs_zlib_exit(void)
{
	free_workspaces();
}