Merge branch 'linux-2.6.27.y' into pandora-27-omap1
[pandora-kernel.git] fs/ext4/inode.c
index aeebfc2..536e392 100644
@@ -1032,19 +1032,20 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
 
-       /* Account for allocated meta_blocks */
-       mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-       /* update fs free blocks counter for truncate case */
-       percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+       if (mdb_free) {
+               /* Account for allocated meta_blocks */
+               mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+               /* update fs dirty blocks counter */
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+               EXT4_I(inode)->i_allocated_meta_blocks = 0;
+               EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+       }
 
        /* update per-inode reservations */
        BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
        EXT4_I(inode)->i_reserved_data_blocks -= used;
 
-       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-       EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-       EXT4_I(inode)->i_allocated_meta_blocks = 0;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
        /*
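
The hunk above changes how a delalloc reservation is settled: instead of crediting unused metadata blocks back to s_freeblocks_counter, the reservation is tracked in s_dirtyblocks_counter from claim to allocation, and only the surplus is subtracted once the real allocation is known. A minimal userspace model of that claim/settle accounting (plain longs and a pthread mutex stand in for the kernel's percpu counters; sketch only):

    #include <pthread.h>
    #include <stdio.h>

    /* Userspace stand-ins for s_freeblocks_counter / s_dirtyblocks_counter. */
    static long free_blocks = 1024;   /* blocks not yet written to disk   */
    static long dirty_blocks;         /* reserved for delalloc, pending   */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Claim @nr blocks for a future (delayed) allocation. */
    static int claim_blocks(long nr)
    {
            int ret = -1;

            pthread_mutex_lock(&lock);
            if (free_blocks - dirty_blocks >= nr) {
                    dirty_blocks += nr;     /* promised, not yet allocated */
                    ret = 0;
            }
            pthread_mutex_unlock(&lock);
            return ret;
    }

    /* Allocation finished: @used blocks hit the disk, @unused go back. */
    static void update_reserve(long used, long unused)
    {
            pthread_mutex_lock(&lock);
            free_blocks  -= used;           /* really gone now           */
            dirty_blocks -= used + unused;  /* no longer merely promised */
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            if (claim_blocks(16) == 0)      /* data + worst-case metadata */
                    update_reserve(12, 4);  /* only 12 were really needed */
            printf("free=%ld dirty=%ld\n", free_blocks, dirty_blocks);
            return 0;
    }

The split counter matters because free_blocks alone over-states availability while delalloc writes are pending; free_blocks - dirty_blocks is the only safe measure of what may still be promised.
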
@@ -1548,6 +1549,7 @@ static int ext4_journalled_write_end(struct file *file,
 
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
+       int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        unsigned long md_needed, mdblocks, total = 0;
 
@@ -1556,6 +1558,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
         * in order to allocate nrblocks
         * worst case is one extent per block
         */
+repeat:
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
        mdblocks = ext4_calc_metadata_amount(inode, total);
@@ -1564,13 +1567,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
        total = md_needed + nrblocks;
 
-       if (ext4_has_free_blocks(sbi, total) < total) {
+       if (ext4_claim_free_blocks(sbi, total)) {
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+               if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+                       yield();
+                       goto repeat;
+               }
                return -ENOSPC;
        }
-       /* reduce fs free blocks counter */
-       percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
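
ext4_claim_free_blocks() checks and reserves in one step, closing the window where two writers could both pass the old ext4_has_free_blocks() test and then over-subtract the free-blocks counter; on failure the caller yields and retries, since a committing transaction may be about to release blocks. A rough sketch of that atomic claim-or-retry pattern (the retry bound of 3 and sched_yield() are stand-ins for ext4_should_retry_alloc() and yield()):

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_long free_blocks = 100;

    /* Atomically claim @nr blocks, or fail without touching the counter. */
    static int claim(long nr)
    {
            long cur = atomic_load(&free_blocks);

            while (cur >= nr) {
                    if (atomic_compare_exchange_weak(&free_blocks, &cur,
                                                     cur - nr))
                            return 0;       /* claimed */
            }
            return -1;                      /* "-ENOSPC" */
    }

    static int reserve_with_retry(long nr)
    {
            int retries = 0;

            while (claim(nr) != 0) {
                    if (++retries > 3)      /* give up: genuinely full */
                            return -1;
                    sched_yield();          /* let others free blocks  */
            }
            return 0;
    }

    int main(void)
    {
            printf("first  reserve of 60: %d\n", reserve_with_retry(60));
            printf("second reserve of 60: %d\n", reserve_with_retry(60));
            return 0;
    }
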
@@ -1612,8 +1616,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
        release = to_free + mdb_free;
 
-       /* update fs free blocks counter for truncate case */
-       percpu_counter_add(&sbi->s_freeblocks_counter, release);
+       /* update fs dirty blocks counter for truncate case */
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
 
        /* update per-inode reservations */
        BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -1657,6 +1661,7 @@ struct mpage_da_data {
        struct writeback_control *wbc;
        int io_done;
        long pages_written;
+       int retval;
 };
 
 /*
@@ -1694,23 +1699,33 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 
        pagevec_init(&pvec, 0);
        while (index <= end) {
-               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+               /*
+                * We can use the PAGECACHE_TAG_DIRTY lookup here because,
+                * even though we have cleared the dirty flag on the page,
+                * the page is still kept in the radix tree with the
+                * PAGECACHE_TAG_DIRTY tag; see clear_page_dirty_for_io().
+                * The tag is cleared in set_page_writeback(), which is
+                * called via the writepage callback below.
+                */
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                                       PAGECACHE_TAG_DIRTY,
+                                       min(end - index,
+                                       (pgoff_t)PAGEVEC_SIZE-1) + 1);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
-                       index = page->index;
-                       if (index > end)
-                               break;
-                       index++;
-
                        BUG_ON(!PageLocked(page));
                        BUG_ON(PageWriteback(page));
 
                        pages_skipped = mpd->wbc->pages_skipped;
                        err = mapping->a_ops->writepage(page, mpd->wbc);
-                       if (!err)
+                       if (!err && (pages_skipped == mpd->wbc->pages_skipped))
+                               /*
+                                * The page was written successfully
+                                * and was not skipped.
+                                */
                                mpd->pages_written++;
                        /*
                         * In error case, we have to continue because
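
Note the batch size passed above: min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1 requests at most PAGEVEC_SIZE pages while never requesting more than the end - index + 1 pages that remain, and pagevec_lookup_tag() advances index itself. The chunking arithmetic in isolation (BATCH stands in for PAGEVEC_SIZE; values are arbitrary):

    #include <stdio.h>

    #define BATCH 14UL      /* stands in for PAGEVEC_SIZE */

    int main(void)
    {
            unsigned long index = 3, end = 40;      /* inclusive page range */

            while (index <= end) {
                    /* at most BATCH pages, never past the inclusive end */
                    unsigned long want =
                            (end - index < BATCH - 1 ? end - index
                                                     : BATCH - 1) + 1;
                    printf("lookup from %lu, want %lu\n", index, want);
                    index += want;  /* the real lookup advances index */
            }
            return 0;
    }
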
@@ -1821,6 +1836,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
 }
 
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
+                                       sector_t logical, long blk_cnt)
+{
+       int nr_pages, i;
+       pgoff_t index, end;
+       struct pagevec pvec;
+       struct inode *inode = mpd->inode;
+       struct address_space *mapping = inode->i_mapping;
+
+       index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       end   = (logical + blk_cnt - 1) >>
+                               (PAGE_CACHE_SHIFT - inode->i_blkbits);
+       while (index <= end) {
+               nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+               if (nr_pages == 0)
+                       break;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       index = page->index;
+                       if (index > end)
+                               break;
+                       index++;
+
+                       BUG_ON(!PageLocked(page));
+                       BUG_ON(PageWriteback(page));
+                       block_invalidatepage(page, 0);
+                       ClearPageUptodate(page);
+                       unlock_page(page);
+               }
+       }
+       return;
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       printk(KERN_EMERG "Total free blocks count %lld\n",
+                       ext4_count_free_blocks(inode->i_sb));
+       printk(KERN_EMERG "Free/Dirty block details\n");
+       printk(KERN_EMERG "free_blocks=%lld\n",
+                       percpu_counter_sum(&sbi->s_freeblocks_counter));
+       printk(KERN_EMERG "dirty_blocks=%lld\n",
+                       percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+       printk(KERN_EMERG "Block reservation details\n");
+       printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+                       EXT4_I(inode)->i_reserved_data_blocks);
+       printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+                       EXT4_I(inode)->i_reserved_meta_blocks);
+       return;
+}
+
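
ext4_da_block_invalidatepages() above maps a logical block range onto the page range covering it: a page holds 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits) blocks, so shifting the first and last block numbers right by that difference gives the first and last page index. A standalone check of the conversion (4K pages and 1K blocks are assumed here):

    #include <stdio.h>

    #define PAGE_SHIFT 12   /* 4K pages  (assumed for the example) */
    #define BLKBITS    10   /* 1K blocks (assumed for the example) */

    int main(void)
    {
            unsigned long long logical = 5, blk_cnt = 9;    /* blocks 5..13 */
            unsigned long index, end;

            index = logical >> (PAGE_SHIFT - BLKBITS);
            end   = (logical + blk_cnt - 1) >> (PAGE_SHIFT - BLKBITS);

            /* 4 blocks per page, so blocks 5..13 live in pages 1..3 */
            printf("pages %lu..%lu\n", index, end);
            return 0;
    }
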
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -1830,32 +1896,69 @@ static inline void __unmap_underlying_blocks(struct inode *inode,
  * The function skips space we know is already mapped to disk blocks.
  *
  */
-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
        int err = 0;
-       struct buffer_head *lbh = &mpd->lbh;
-       sector_t next = lbh->b_blocknr;
        struct buffer_head new;
+       struct buffer_head *lbh = &mpd->lbh;
+       sector_t next;
 
        /*
         * We consider only non-mapped and non-allocated blocks
         */
        if (buffer_mapped(lbh) && !buffer_delay(lbh))
-               return;
-
+               return 0;
        new.b_state = lbh->b_state;
        new.b_blocknr = 0;
        new.b_size = lbh->b_size;
-
+       next = lbh->b_blocknr;
        /*
         * If we didn't accumulate anything
         * to write simply return
         */
        if (!new.b_size)
-               return;
+               return 0;
        err = mpd->get_block(mpd->inode, next, &new, 1);
-       if (err)
-               return;
+       if (err) {
+               /*
+                * If get_block returns an error we simply return:
+                * writepage will later redirty the page and
+                * writepages will find the dirty page again.
+                */
+               if (err == -EAGAIN)
+                       return 0;
+
+               if (err == -ENOSPC &&
+                               ext4_count_free_blocks(mpd->inode->i_sb)) {
+                       mpd->retval = err;
+                       return 0;
+               }
+
+               /*
+                * get block failure will cause us
+                * to loop in writepages. Because
+                * a_ops->writepage won't be able to
+                * make progress. The page will be redirtied
+                * by writepage and writepages will again
+                * try to write the same.
+                */
+               printk(KERN_EMERG "%s block allocation failed for inode %lu "
+                                 "at logical offset %llu with max blocks "
+                                 "%zd with error %d\n",
+                                 __func__, mpd->inode->i_ino,
+                                 (unsigned long long)next,
+                                 lbh->b_size >> mpd->inode->i_blkbits, err);
+               printk(KERN_EMERG "This should not happen!! "
+                                       "Data will be lost\n");
+               if (err == -ENOSPC) {
+                       ext4_print_free_blocks(mpd->inode);
+               }
+               /* invalidate all the pages */
+               ext4_da_block_invalidatepages(mpd, next,
+                               lbh->b_size >> mpd->inode->i_blkbits);
+               return err;
+       }
        BUG_ON(new.b_size == 0);
 
        if (buffer_new(&new))
@@ -1868,7 +1971,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
        if (buffer_delay(lbh) || buffer_unwritten(lbh))
                mpage_put_bnr_to_bhs(mpd, next, &new);
 
-       return;
+       return 0;
 }
 
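
The rewritten mpage_da_map_blocks() above reduces get_block failures to three policies: -EAGAIN is swallowed (the page stays dirty and will be revisited), -ENOSPC while the filesystem still reports free blocks is stashed in mpd->retval so the caller can force a journal commit and retry, and anything else is fatal, so the gathered pages are invalidated and the error is returned. The decision table restated as a sketch (fs_has_free_blocks() is a hypothetical stand-in for the ext4_count_free_blocks() test):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the ext4_count_free_blocks() test. */
    static bool fs_has_free_blocks(void) { return true; }

    static int map_blocks_policy(int err, int *retval)
    {
            if (err == 0)
                    return 0;       /* mapped: go on and submit the IO    */
            if (err == -EAGAIN)
                    return 0;       /* transient: page simply stays dirty */
            if (err == -ENOSPC && fs_has_free_blocks()) {
                    *retval = err;  /* caller commits journal and retries */
                    return 0;
            }
            return err;             /* fatal: invalidate pages, report it */
    }

    int main(void)
    {
            int retval = 0;
            int ret;

            ret = map_blocks_policy(-EAGAIN, &retval);
            printf("EAGAIN -> %d, retval %d\n", ret, retval);
            ret = map_blocks_policy(-ENOSPC, &retval);
            printf("ENOSPC -> %d, retval %d\n", ret, retval);
            ret = map_blocks_policy(-EIO, &retval);
            printf("EIO    -> %d, retval %d\n", ret, retval);
            return 0;
    }
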
 #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
@@ -1937,8 +2040,8 @@ flush_it:
         * We couldn't merge the block to our extent, so we
         * need to flush current  extent and start new one
         */
-       mpage_da_map_blocks(mpd);
-       mpage_da_submit_io(mpd);
+       if (mpage_da_map_blocks(mpd) == 0)
+               mpage_da_submit_io(mpd);
        mpd->io_done = 1;
        return;
 }
@@ -1960,17 +2063,6 @@ static int __mpage_da_writepage(struct page *page,
        struct buffer_head *bh, *head, fake;
        sector_t logical;
 
-       if (mpd->io_done) {
-               /*
-                * Rest of the page in the page_vec
-                * redirty then and skip then. We will
-                * try to to write them again after
-                * starting a new transaction
-                */
-               redirty_page_for_writepage(wbc, page);
-               unlock_page(page);
-               return MPAGE_DA_EXTENT_TAIL;
-       }
        /*
         * Can we merge this page to current extent?
         */
@@ -1980,8 +2072,8 @@ static int __mpage_da_writepage(struct page *page,
                 * and start IO on them using writepage()
                 */
                if (mpd->next_page != mpd->first_page) {
-                       mpage_da_map_blocks(mpd);
-                       mpage_da_submit_io(mpd);
+                       if (mpage_da_map_blocks(mpd) == 0)
+                               mpage_da_submit_io(mpd);
                        /*
                         * skip rest of the page in the page_vec
                         */
@@ -2060,6 +2152,137 @@ static int __mpage_da_writepage(struct page *page,
        return 0;
 }
 
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call __mpage_da_writepage() for each of them
+ * (which usually writes the pages).
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *     Range cyclic is ignored.
+ *     no_nrwrite_index_update is always presumed true.
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+                               struct writeback_control *wbc,
+                               struct mpage_da_data *mpd)
+{
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       int ret = 0;
+       int done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       long nr_to_write = wbc->nr_to_write;
+
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               wbc->encountered_congestion = 1;
+               return 0;
+       }
+
+       pagevec_init(&pvec, 0);
+       index = wbc->range_start >> PAGE_CACHE_SHIFT;
+       end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+       while (!done && (index <= end)) {
+               int i;
+
+               nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                             PAGECACHE_TAG_DIRTY,
+                             min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       /*
+                        * At this point, the page may be truncated or
+                        * invalidated (changing page->mapping to NULL), or
+                        * even swizzled back from swapper_space to tmpfs file
+                        * mapping. However, page->index will not change
+                        * because we have a reference on the page.
+                        */
+                       if (page->index > end) {
+                               done = 1;
+                               break;
+                       }
+
+                       lock_page(page);
+
+                       /*
+                        * Page truncated or invalidated. We can freely skip it
+                        * then, even for data integrity operations: the page
+                        * has disappeared concurrently, so there could be no
+                        * real expectation of this data integrity operation
+                        * even if there is now a new, dirty page at the same
+                        * pagecache address.
+                        */
+                       if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+                               unlock_page(page);
+                               continue;
+                       }
+
+                       if (!PageDirty(page)) {
+                               /* someone wrote it for us */
+                               goto continue_unlock;
+                       }
+
+                       if (PageWriteback(page)) {
+                               if (wbc->sync_mode != WB_SYNC_NONE)
+                                       wait_on_page_writeback(page);
+                               else
+                                       goto continue_unlock;
+                       }
+
+                       BUG_ON(PageWriteback(page));
+                       if (!clear_page_dirty_for_io(page))
+                               goto continue_unlock;
+
+                       ret = __mpage_da_writepage(page, wbc, mpd);
+
+                       if (unlikely(ret)) {
+                               if (ret == AOP_WRITEPAGE_ACTIVATE) {
+                                       unlock_page(page);
+                                       ret = 0;
+                               } else {
+                                       done = 1;
+                                       break;
+                               }
+                       }
+
+                       if (nr_to_write > 0) {
+                               nr_to_write--;
+                               if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+                                        * not doing integrity sync. In case of
+                                        * integrity sync we have to keep going
+                                        * because someone may be concurrently
+                                        * dirtying pages, and we might have
+                                        * synced a lot of newly appeared dirty
+                                        * pages, but have not synced all of the
+                                        * old dirty pages.
+                                        */
+                                       done = 1;
+                                       break;
+                               }
+                       }
+
+                       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+                               wbc->encountered_congestion = 1;
+                               done = 1;
+                               break;
+                       }
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       return ret;
+}
+
+
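
write_cache_pages_da() above follows the canonical writeback-walk shape: look up a batch of dirty-tagged pages, then for each one lock it, re-check that it still belongs to this mapping and is still dirty (both can change between lookup and lock), clear the dirty bit, write, and charge nr_to_write. A much-reduced userspace model of the lock-and-recheck loop (the toy page array and write_one() are inventions of this sketch):

    #include <stdbool.h>
    #include <stdio.h>

    struct page { bool dirty, truncated; }; /* toy page; sketch only */

    static int write_one(int idx)
    {
            printf("wrote page %d\n", idx);
            return 0;
    }

    static int walk_dirty_pages(struct page *pages, int npages,
                                long nr_to_write)
    {
            for (int i = 0; i < npages && nr_to_write > 0; i++) {
                    struct page *p = &pages[i];

                    if (!p->dirty)
                            continue;       /* lookup result went stale    */
                    /* lock_page() would go here; then re-check the state */
                    if (p->truncated)
                            continue;       /* left this mapping meanwhile */

                    p->dirty = false;       /* clear_page_dirty_for_io()   */
                    if (write_one(i))
                            return -1;
                    nr_to_write--;
            }
            return 0;
    }

    int main(void)
    {
            struct page pages[4] = {
                    { true, false }, { false, false },
                    { true, true }, { true, false },
            };

            return walk_dirty_pages(pages, 4, 8);   /* writes pages 0 and 3 */
    }

The double check after "locking" is the load-bearing part: the tag lookup ran without the page lock held, so truncation or a competing writer may already have dealt with the page.
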
 /*
  * mpage_da_writepages - walk the list of dirty pages of the given
  * address space, allocates non-allocated blocks, maps newly-allocated
@@ -2074,39 +2297,34 @@ static int __mpage_da_writepage(struct page *page,
  */
 static int mpage_da_writepages(struct address_space *mapping,
                               struct writeback_control *wbc,
-                              get_block_t get_block)
+                              struct mpage_da_data *mpd)
 {
-       struct mpage_da_data mpd;
-       long to_write;
        int ret;
 
-       if (!get_block)
+       if (!mpd->get_block)
                return generic_writepages(mapping, wbc);
 
-       mpd.wbc = wbc;
-       mpd.inode = mapping->host;
-       mpd.lbh.b_size = 0;
-       mpd.lbh.b_state = 0;
-       mpd.lbh.b_blocknr = 0;
-       mpd.first_page = 0;
-       mpd.next_page = 0;
-       mpd.get_block = get_block;
-       mpd.io_done = 0;
-       mpd.pages_written = 0;
-
-       to_write = wbc->nr_to_write;
-
-       ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+       mpd->lbh.b_size = 0;
+       mpd->lbh.b_state = 0;
+       mpd->lbh.b_blocknr = 0;
+       mpd->first_page = 0;
+       mpd->next_page = 0;
+       mpd->io_done = 0;
+       mpd->pages_written = 0;
+       mpd->retval = 0;
 
+       ret = write_cache_pages_da(mapping, wbc, mpd);
        /*
         * Handle last extent of pages
         */
-       if (!mpd.io_done && mpd.next_page != mpd.first_page) {
-               mpage_da_map_blocks(&mpd);
-               mpage_da_submit_io(&mpd);
-       }
+       if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+               if (mpage_da_map_blocks(mpd) == 0)
+                       mpage_da_submit_io(mpd);
 
-       wbc->nr_to_write = to_write - mpd.pages_written;
+               mpd->io_done = 1;
+               ret = MPAGE_DA_EXTENT_TAIL;
+       }
+       wbc->nr_to_write -= mpd->pages_written;
        return ret;
 }
 
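
mpage_da_writepages() now returns MPAGE_DA_EXTENT_TAIL when it had to flush a final, still-open extent, and (as the next hunk shows) ext4_da_writepages() reads that as "one extent written, start a fresh transaction and keep going" rather than as an error. The shape of that driver loop, reduced to a sketch (EXTENT_TAIL and extents_left are illustrative only):

    #include <stdio.h>

    #define EXTENT_TAIL 1   /* illustrative stand-in for MPAGE_DA_EXTENT_TAIL */

    static int extents_left = 3;

    /* Writes at most one extent per call, like one journal transaction. */
    static int write_some(void)
    {
            if (extents_left > 0) {
                    printf("flushed one extent, %d left\n", --extents_left);
                    return EXTENT_TAIL;     /* more work may remain  */
            }
            return 0;                       /* nothing left to write */
    }

    int main(void)
    {
            int ret;

            do {
                    ret = write_some();     /* one "transaction" per pass   */
            } while (ret == EXTENT_TAIL);   /* tail extent: go around again */
            return ret;
    }
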
@@ -2351,11 +2569,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
+       pgoff_t index;
+       int range_whole = 0;
        handle_t *handle = NULL;
-       loff_t range_start = 0;
+       struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
+       long pages_written = 0, pages_skipped;
+       int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
-       long to_write, pages_skipped = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
        /*
@@ -2389,20 +2610,27 @@ static int ext4_da_writepages(struct address_space *mapping,
                nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
                wbc->nr_to_write = sbi->s_mb_stream_request;
        }
+       if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+               range_whole = 1;
+
+       range_cyclic = wbc->range_cyclic;
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index;
+               if (index)
+                       cycled = 0;
+               wbc->range_start = index << PAGE_CACHE_SHIFT;
+               wbc->range_end  = LLONG_MAX;
+               wbc->range_cyclic = 0;
+       } else
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
-       if (!wbc->range_cyclic)
-               /*
-                * If range_cyclic is not set force range_cont
-                * and save the old writeback_index
-                */
-               wbc->range_cont = 1;
+       mpd.wbc = wbc;
+       mpd.inode = mapping->host;
 
-       range_start =  wbc->range_start;
        pages_skipped = wbc->pages_skipped;
 
-restart_loop:
-       to_write = wbc->nr_to_write;
-       while (!ret && to_write > 0) {
+retry:
+       while (!ret && wbc->nr_to_write > 0) {
 
                /*
                 * we  insert one extent at a time. So we need
@@ -2435,42 +2663,60 @@ restart_loop:
                                goto out_writepages;
                        }
                }
+               mpd.get_block = ext4_da_get_block_write;
+               ret = mpage_da_writepages(mapping, wbc, &mpd);
 
-               to_write -= wbc->nr_to_write;
-               ret = mpage_da_writepages(mapping, wbc,
-                                         ext4_da_get_block_write);
                ext4_journal_stop(handle);
-               if (ret == MPAGE_DA_EXTENT_TAIL) {
+
+               if (mpd.retval == -ENOSPC) {
+                       /*
+                        * Commit the transaction so that blocks
+                        * released in it are actually freed, then retry.
+                        */
+                       jbd2_journal_force_commit_nested(sbi->s_journal);
+                       wbc->pages_skipped = pages_skipped;
+                       ret = 0;
+               } else if (ret == MPAGE_DA_EXTENT_TAIL) {
                        /*
                         * got one extent now try with
                         * rest of the pages
                         */
-                       to_write += wbc->nr_to_write;
+                       pages_written += mpd.pages_written;
+                       wbc->pages_skipped = pages_skipped;
                        ret = 0;
-               } else if (wbc->nr_to_write) {
+                       io_done = 1;
+               } else if (wbc->nr_to_write)
                        /*
                         * There is no more writeout needed
                         * or we requested a non-blocking writeout
                         * and we found the device congested
                         */
-                       to_write += wbc->nr_to_write;
                        break;
-               }
-               wbc->nr_to_write = to_write;
        }
-
-       if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
-               /* We skipped pages in this loop */
-               wbc->range_start = range_start;
-               wbc->nr_to_write = to_write +
-                               wbc->pages_skipped - pages_skipped;
-               wbc->pages_skipped = pages_skipped;
-               goto restart_loop;
+       if (!io_done && !cycled) {
+               cycled = 1;
+               index = 0;
+               wbc->range_start = index << PAGE_CACHE_SHIFT;
+               wbc->range_end  = mapping->writeback_index - 1;
+               goto retry;
        }
+       if (pages_skipped != wbc->pages_skipped)
+               printk(KERN_EMERG "This should not happen, leaving %s "
+                               "with nr_to_write = %ld ret = %d\n",
+                               __func__, wbc->nr_to_write, ret);
+
+       /* Update index */
+       index += pages_written;
+       wbc->range_cyclic = range_cyclic;
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               /*
+                * set the writeback_index so that range_cyclic
+                * mode will write it back later
+                */
+               mapping->writeback_index = index;
 
 out_writepages:
-       wbc->nr_to_write = to_write - nr_to_writebump;
-       wbc->range_start = range_start;
+       wbc->nr_to_write -= nr_to_writebump;
        return ret;
 }
 
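
The hunk above restructures cyclic writeback: the walk starts at mapping->writeback_index rather than at page 0, and if that first pass began mid-file (cycled == 0) without completing any IO, a second pass covers the wrapped range 0 .. writeback_index - 1. The two-pass wrap in isolation (page indexes only; the byte-offset shifting and the io_done interaction are omitted):

    #include <stdio.h>

    static void write_range(unsigned long start, unsigned long end)
    {
            printf("writing pages %lu..%lu\n", start, end);
    }

    int main(void)
    {
            unsigned long writeback_index = 37;  /* where the last pass ended */
            unsigned long last_page = 99;
            int cycled = (writeback_index == 0); /* a start at 0 needs no wrap */

            write_range(writeback_index, last_page);        /* first pass  */
            if (!cycled)
                    write_range(0, writeback_index - 1);    /* wrap around */
            return 0;
    }
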
@@ -2488,7 +2734,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
-
 retry:
        /*
         * With delayed allocation, we don't log the i_disksize update