/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/config.h>
#include <linux/time.h>
#include <linux/reiserfs_fs.h>
#include <linux/reiserfs_acl.h>
#include <linux/reiserfs_xattr.h>
#include <linux/smp_lock.h>
#include <asm/uaccess.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/quotaops.h>

/*
** We pack the tails of files on file close, not at the time they are written.
** This implies an unnecessary copy of the tail and an unnecessary indirect item
** insertion/balancing, for files that are written in one write.
** It avoids unnecessary tail packings (balances) for files that are written in
** multiple writes and are small enough to have tails.
**
** file_release is called by the VFS layer when the file is closed.  If
** this is the last open file descriptor, and the file is
** small enough to have a tail, and the tail is currently in an
** unformatted node, the tail is converted back into a direct item.
**
** We use reiserfs_truncate_file to pack the tail, since it already has
** all the conditions coded.
*/
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{

        struct reiserfs_transaction_handle th;
        int err;
        int jbegin_failure = 0;

        if (!S_ISREG(inode->i_mode))
                BUG();

        /* fast out for when nothing needs to be done */
        if ((atomic_read(&inode->i_count) > 1 ||
             !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
             !tail_has_to_be_packed(inode)) &&
            REISERFS_I(inode)->i_prealloc_count <= 0) {
                return 0;
        }
        mutex_lock(&inode->i_mutex);
        reiserfs_write_lock(inode->i_sb);
        /* freeing preallocation only involves relogging blocks that
         * are already in the current transaction.  preallocation gets
         * freed at the end of each transaction, so it is impossible for
         * us to log any additional blocks (including quota blocks)
         */
        err = journal_begin(&th, inode->i_sb, 1);
        if (err) {
                /* uh oh, we can't allow the inode to go away while there
                 * are still preallocated blocks pending.  Try to join the
                 * aborted transaction
                 */
                jbegin_failure = err;
                err = journal_join_abort(&th, inode->i_sb, 1);

                if (err) {
                        /* hmpf, our choices here aren't good.  We can pin the
                         * inode, which will disallow unmount from ever
                         * happening, we can do nothing, which will corrupt
                         * random memory on unmount, or we can forcibly remove
                         * the file from the preallocation list, which will
                         * leak blocks on disk.  Let's pin the inode and let
                         * the admin know what is going on.
                         */
                        igrab(inode);
                        reiserfs_warning(inode->i_sb,
                                         "pinning inode %lu because the "
                                         "preallocation can't be freed",
                                         inode->i_ino);
                        goto out;
                }
        }
        reiserfs_update_inode_transaction(inode);

#ifdef REISERFS_PREALLOCATE
        reiserfs_discard_prealloc(&th, inode);
#endif
        err = journal_end(&th, inode->i_sb, 1);

        /* copy back the error code from journal_begin */
        if (!err)
                err = jbegin_failure;

        if (!err && atomic_read(&inode->i_count) <= 1 &&
            (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
            tail_has_to_be_packed(inode)) {
                /* if the regular file is released by its last holder and it
                   has been appended (we append by unformatted node only) or
                   its direct item(s) had to be converted, then it may have to
                   be indirect2direct converted */
                err = reiserfs_truncate_file(inode, 0);
        }
      out:
        mutex_unlock(&inode->i_mutex);
        reiserfs_write_unlock(inode->i_sb);
        return err;
}

static void reiserfs_vfs_truncate_file(struct inode *inode)
{
        reiserfs_truncate_file(inode, 1);
}

/* Sync a reiserfs file. */

/*
 * FIXME: sync_mapping_buffers() never has anything to sync.  Can
 * be removed...
 */

static int reiserfs_sync_file(struct file *p_s_filp,
                              struct dentry *p_s_dentry, int datasync)
{
        struct inode *p_s_inode = p_s_dentry->d_inode;
        int n_err;
        int barrier_done;

        if (!S_ISREG(p_s_inode->i_mode))
                BUG();
        n_err = sync_mapping_buffers(p_s_inode->i_mapping);
        reiserfs_write_lock(p_s_inode->i_sb);
        barrier_done = reiserfs_commit_for_inode(p_s_inode);
        reiserfs_write_unlock(p_s_inode->i_sb);
        if (barrier_done != 1 && reiserfs_barrier_flush(p_s_inode->i_sb))
                blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
        if (barrier_done < 0)
                return barrier_done;
        return (n_err < 0) ? -EIO : 0;
}

/* I really do not want to play with memory shortage right now, so
   to simplify the code, we are not going to write more than this many pages at
   a time. This still should considerably improve performance compared to the
   4k-at-a-time case. This is 32 pages of 4k size. */
#define REISERFS_WRITE_PAGES_AT_A_TIME ((128 * 1024) / PAGE_CACHE_SIZE)

/* Allocates blocks for a file to fulfil a write request.
   Maps all unmapped but prepared pages from the list.
   Updates metadata with newly allocated block numbers as needed */
static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th,
                                               struct inode *inode,     /* Inode we work with */
                                               loff_t pos,      /* Writing position */
                                               int num_pages,   /* number of pages the write
                                                                   is going to touch */
                                               int write_bytes, /* amount of bytes to write */
                                               struct page **prepared_pages,    /* array of
                                                                                   prepared pages */
                                               int blocks_to_allocate   /* Amount of blocks we
                                                                           need to allocate to
                                                                           fit the data into the
                                                                           file */
    )
{
        struct cpu_key key;     // cpu key of item that we are going to deal with
        struct item_head *ih;   // pointer to item head that we are going to deal with
        struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
        __le32 *item;           // pointer to item we are going to deal with
        INITIALIZE_PATH(path);  // path to item that we are going to deal with
        b_blocknr_t *allocated_blocks;  // Pointer to a place where allocated block numbers will be stored
        reiserfs_blocknr_hint_t hint;   // hint structure for block allocator
        size_t res;             // return value of various functions that we call
        int curr_block;         // current block used to keep track of unmapped blocks
        int i;                  // loop counter
        int itempos;            // position in item
        unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));      // writing position in first page
        unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;        /* last modified byte offset in last page */
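        /* Illustrative numbers (not from the original source), assuming 4k
           pages: writing 200 bytes at pos 5000 gives
           from = 5000 & 4095 = 904 and to = (5199 & 4095) + 1 = 1104,
           i.e. bytes [904, 1104) of the page are modified. */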
        __u64 hole_size;        // amount of blocks for a file hole, if one needs to be created
        int modifying_this_item = 0;    // Flag for the item traversal code to keep
                                        // track of the fact that we already prepared
                                        // the current block for the journal
        int will_prealloc = 0;
        RFALSE(!blocks_to_allocate,
               "green-9004: tried to allocate zero blocks?");

        /* only preallocate if this is a small write */
        if (REISERFS_I(inode)->i_prealloc_count ||
            (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
             blocks_to_allocate <
             REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
                will_prealloc =
                    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
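        /* For example, with 4k blocks: an 8192-byte write needing fewer than
           preallocsize new blocks gets preallocation, a 5000-byte
           (partial-block) write does not, unless this inode already holds
           preallocated blocks from an earlier write. */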

        allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
                                   sizeof(b_blocknr_t), GFP_NOFS);
        if (!allocated_blocks)
                return -ENOMEM;

        /* First we compose a key to point at the writing position; we want to
           do that outside of any locking region. */
        make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );

        /* If we came here, it means we absolutely need to open a transaction,
           since we need to allocate some blocks */
        reiserfs_write_lock(inode->i_sb);       // Journaling stuff, and we need that.
        res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));   // Wish I knew if this number is enough
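        /* The credit appears to budget for up to three tree balances
           (JOURNAL_PER_BALANCE_CNT * 3), one block for the inode update,
           and twice the per-transaction quota block estimate; the comment
           above suggests even the author was unsure it suffices. */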
        if (res)
                goto error_exit;
        reiserfs_update_inode_transaction(inode);

        /* Look for the in-tree position of our write, need path for block allocator */
        res = search_for_position_by_key(inode->i_sb, &key, &path);
        if (res == IO_ERROR) {
                res = -EIO;
                goto error_exit;
        }

        /* Allocate blocks */
        /* First fill in "hint" structure for block allocator */
        hint.th = th;           // transaction handle
        hint.path = &path;      // Path, so that block allocator can determine packing locality or whatever it needs to determine
        hint.inode = inode;     // Inode is needed by block allocator too
        hint.search_start = 0;  // We have no hint on where to search free blocks for block allocator
        hint.key = key.on_disk_key;     // on disk key of file
        hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);    // Number of disk blocks this file occupies already
        hint.formatted_node = 0;        // We are allocating blocks for unformatted node
        hint.preallocate = will_prealloc;

        /* Call block allocator to allocate blocks */
        res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
                                         blocks_to_allocate, blocks_to_allocate);
        if (res != CARRY_ON) {
                if (res == NO_DISK_SPACE) {
                        /* We flush the transaction in case of no space. This way some
                           blocks might become free */
                        SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
                        res = restart_transaction(th, inode, &path);
                        if (res)
                                goto error_exit;

                        /* We might have scheduled, so search again */
                        res = search_for_position_by_key(inode->i_sb, &key,
                                                         &path);
                        if (res == IO_ERROR) {
                                res = -EIO;
                                goto error_exit;
                        }

                        /* update changed info for hint structure. */
                        res = reiserfs_allocate_blocknrs(&hint, allocated_blocks,
                                                         blocks_to_allocate,
                                                         blocks_to_allocate);
                        if (res != CARRY_ON) {
                                res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
                                pathrelse(&path);
                                goto error_exit;
                        }
                } else {
                        res = res == QUOTA_EXCEEDED ? -EDQUOT : -ENOSPC;
                        pathrelse(&path);
                        goto error_exit;
                }
        }
#ifdef __BIG_ENDIAN
        // Too bad, I have not found any way to convert a given region from
        // cpu format to little endian format
        {
                int i;
                for (i = 0; i < blocks_to_allocate; i++)
                        allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
        }
#endif

        /* Allocating blocks might well have scheduled and the tree might have
           changed, so let's search the tree again */
        /* find where in the tree our write should go */
        res = search_for_position_by_key(inode->i_sb, &key, &path);
        if (res == IO_ERROR) {
                res = -EIO;
                goto error_exit_free_blocks;
        }

        bh = get_last_bh(&path);        // Get a bufferhead for last element in path.
        ih = get_ih(&path);     // Get a pointer to last item head in path.
        item = get_item(&path); // Get a pointer to last item in path

        /* Let's see what we have found */
        if (res != POSITION_FOUND) {    /* position not found, this means that we
                                           might need to append the file with holes
                                           first */
                // Since we are writing past the file's end, we need to find out if
                // there is a hole that needs to be inserted before our writing
                // position, and how many blocks it is going to cover (we need to
                // populate pointers to file blocks representing the hole with zeros)

                {
                        int item_offset = 1;
                        /*
                         * if ih is stat data, its offset is 0 and we don't want to
                         * add 1 to pos in the hole_size calculation
                         */
                        if (is_statdata_le_ih(ih))
                                item_offset = 0;
                        hole_size = (pos + item_offset -
                                     (le_key_k_offset(get_inode_item_key_version(inode),
                                                      &(ih->ih_key)) +
                                      op_bytes_number(ih, inode->i_sb->s_blocksize)))
                                    >> inode->i_sb->s_blocksize_bits;
                }
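                /* Illustrative numbers (assuming 4k blocks): if the last
                   indirect item ends at byte 8192 and we write at pos 20480,
                   hole_size = (20481 - 8193) >> 12 = 3 blocks of zero
                   pointers must be inserted before the write. */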

                if (hole_size > 0) {
                        int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);   // How much data to insert first time.
                        /* area filled with zeroes, to supply as list of zero blocknumbers.
                           We allocate it outside of the loop just in case the loop would
                           spin for several iterations. */
                        char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);      // We cannot insert more than MAX_ITEM_LEN bytes anyway.
                        if (!zeros) {
                                res = -ENOMEM;
                                goto error_exit_free_blocks;
                        }
                        memset(zeros, 0, to_paste * UNFM_P_SIZE);
                        do {
                                to_paste = min_t(__u64, hole_size,
                                                 MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
                                                 UNFM_P_SIZE);
                                if (is_indirect_le_ih(ih)) {
                                        /* Ok, there is an existing indirect item already.
                                           Need to append to it */
                                        /* Calculate position past inserted item */
                                        make_cpu_key(&key, inode,
                                                     le_key_k_offset(get_inode_item_key_version(inode),
                                                                     &(ih->ih_key)) +
                                                     op_bytes_number(ih, inode->i_sb->s_blocksize),
                                                     TYPE_INDIRECT, 3);
                                        res = reiserfs_paste_into_item(th, &path, &key,
                                                                       inode, (char *)zeros,
                                                                       UNFM_P_SIZE * to_paste);
                                        if (res) {
                                                kfree(zeros);
                                                goto error_exit_free_blocks;
                                        }
                                } else if (is_statdata_le_ih(ih)) {
                                        /* No existing item, create it */
                                        /* item head for new item */
                                        struct item_head ins_ih;

                                        /* create a key for our new item */
                                        make_cpu_key(&key, inode, 1,
                                                     TYPE_INDIRECT, 3);

                                        /* Create new item head for our new item */
                                        make_le_item_head(&ins_ih, &key,
                                                          key.version, 1,
                                                          TYPE_INDIRECT,
                                                          to_paste * UNFM_P_SIZE,
                                                          0 /* free space */ );

                                        /* Find where such item should live in the tree */
                                        res = search_item(inode->i_sb, &key, &path);
                                        if (res != ITEM_NOT_FOUND) {
                                                /* item should not exist, otherwise we have error */
                                                if (res != -ENOSPC) {
                                                        reiserfs_warning(inode->i_sb,
                                                                         "green-9008: search_by_key (%K) returned %d",
                                                                         &key, res);
                                                }
                                                res = -EIO;
                                                kfree(zeros);
                                                goto error_exit_free_blocks;
                                        }
                                        res = reiserfs_insert_item(th, &path,
                                                                   &key, &ins_ih,
                                                                   inode, (char *)zeros);
                                } else {
                                        reiserfs_panic(inode->i_sb,
                                                       "green-9011: Unexpected key type %K\n",
                                                       &key);
                                }
                                if (res) {
                                        kfree(zeros);
                                        goto error_exit_free_blocks;
                                }
                                /* Now we want to check if the transaction is too full,
                                   and if it is we restart it. This will also free the
                                   path. */
                                if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
                                        res = restart_transaction(th, inode,
                                                                  &path);
                                        if (res) {
                                                pathrelse(&path);
                                                kfree(zeros);
                                                goto error_exit;
                                        }
                                }

                                /* Well, need to recalculate path and stuff */
                                set_cpu_key_k_offset(&key,
                                                     cpu_key_k_offset(&key) +
                                                     (to_paste << inode->i_blkbits));
                                res = search_for_position_by_key(inode->i_sb,
                                                                 &key, &path);
                                if (res == IO_ERROR) {
                                        res = -EIO;
                                        kfree(zeros);
                                        goto error_exit_free_blocks;
                                }
                                bh = get_last_bh(&path);
                                ih = get_ih(&path);
                                item = get_item(&path);
                                hole_size -= to_paste;
                        } while (hole_size);
                        kfree(zeros);
                }
        }
        // Go through existing indirect items first
        // replace all zeroes with blocknumbers from list
        // Note that if no corresponding item was found by the previous search,
        // it means there is no existing in-tree representation for the file area
        // we are going to overwrite, so there is nothing to scan through for holes.
        for (curr_block = 0, itempos = path.pos_in_item;
             curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
              retry:

                if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
                        /* We ran out of data in this indirect item, let's look for
                           another one. */
                        /* First, if we are already modifying the current item, log it */
                        if (modifying_this_item) {
                                journal_mark_dirty(th, inode->i_sb, bh);
                                modifying_this_item = 0;
                        }
                        /* Then set the key to look for a new indirect item (offset of
                           the old item is added to the old item length) */
                        set_cpu_key_k_offset(&key,
                                             le_key_k_offset(get_inode_item_key_version(inode),
                                                             &(ih->ih_key)) +
                                             op_bytes_number(ih,
                                                             inode->i_sb->s_blocksize));
                        /* Search for the position of the new key in the tree. */
                        res = search_for_position_by_key(inode->i_sb, &key,
                                                         &path);
                        if (res == IO_ERROR) {
                                res = -EIO;
                                goto error_exit_free_blocks;
                        }
                        bh = get_last_bh(&path);
                        ih = get_ih(&path);
                        item = get_item(&path);
                        itempos = path.pos_in_item;
                        continue;       // loop to check all kinds of conditions and so on.
                }
                /* Ok, we have the correct position in the item now, so let's see if
                   it is representing a file hole (blocknumber is zero) and fill it
                   if needed */
                if (!item[itempos]) {
                        /* Ok, a hole. Now we need to check if we already prepared this
                           block to be journaled */
                        while (!modifying_this_item) {  // loop until we succeed
                                /* Well, this item is not journaled yet, so we must prepare
                                   it for the journal first, before we can change it */
                                struct item_head tmp_ih;        // We copy the item head of the
                                                                // found item here to detect if the
                                                                // fs changed under us while we were
                                                                // preparing for the journal.
                                int fs_gen;     // We store the fs generation here to find out
                                                // if someone changes the fs under our feet

                                copy_item_head(&tmp_ih, ih);    // Remember itemhead
                                fs_gen = get_generation(inode->i_sb);   // remember fs generation
                                reiserfs_prepare_for_journal(inode->i_sb, bh, 1);       // Prepare the buffer within which the indirect item is stored for changing.
                                if (fs_changed(fs_gen, inode->i_sb)
                                    && item_moved(&tmp_ih, &path)) {
                                        // Sigh, fs was changed under us, we need to look for the
                                        // new location of the item we are working with

                                        /* unmark the prepared area as journaled and search
                                           for its new position */
                                        reiserfs_restore_prepared_buffer(inode->i_sb,
                                                                         bh);
                                        res = search_for_position_by_key(inode->i_sb,
                                                                         &key, &path);
                                        if (res == IO_ERROR) {
                                                res = -EIO;
                                                goto error_exit_free_blocks;
                                        }
                                        bh = get_last_bh(&path);
                                        ih = get_ih(&path);
                                        item = get_item(&path);
                                        itempos = path.pos_in_item;
                                        goto retry;
                                }
                                modifying_this_item = 1;
                        }
                        item[itempos] = allocated_blocks[curr_block];   // Assign new block
                        curr_block++;
                }
                itempos++;
        }

        if (modifying_this_item) {      // We need to log the last-accessed block if it
                                        // was modified but not logged yet.
                journal_mark_dirty(th, inode->i_sb, bh);
        }

        if (curr_block < blocks_to_allocate) {
                // Oh well, we need to append to an indirect item, or to create an
                // indirect item if there wasn't any
                if (is_indirect_le_ih(ih)) {
                        // Existing indirect item - append. First calculate key for append
                        // position. We do not need to recalculate path as it should
                        // already point to correct place.
                        make_cpu_key(&key, inode,
                                     le_key_k_offset(get_inode_item_key_version(inode),
                                                     &(ih->ih_key)) +
                                     op_bytes_number(ih,
                                                     inode->i_sb->s_blocksize),
                                     TYPE_INDIRECT, 3);
                        res = reiserfs_paste_into_item(th, &path, &key, inode,
                                                       (char *)(allocated_blocks +
                                                                curr_block),
                                                       UNFM_P_SIZE *
                                                       (blocks_to_allocate -
                                                        curr_block));
                        if (res) {
                                goto error_exit_free_blocks;
                        }
                } else if (is_statdata_le_ih(ih)) {
                        // Last found item was statdata. That means we need to create
                        // an indirect item.
                        struct item_head ins_ih;        /* itemhead for new item */

                        /* create a key for our new item */
                        make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3); // Position one, because
                                                                        // that's where the first
                                                                        // indirect item begins
                        /* Create new item head for our new item */
                        make_le_item_head(&ins_ih, &key, key.version, 1,
                                          TYPE_INDIRECT,
                                          (blocks_to_allocate -
                                           curr_block) * UNFM_P_SIZE,
                                          0 /* free space */ );
                        /* Find where such item should live in the tree */
                        res = search_item(inode->i_sb, &key, &path);
                        if (res != ITEM_NOT_FOUND) {
                                /* Well, if we have found such an item already, or some
                                   error occurred, we need to warn the user and return
                                   an error */
                                if (res != -ENOSPC) {
                                        reiserfs_warning(inode->i_sb,
                                                         "green-9009: search_by_key (%K) "
                                                         "returned %d", &key,
                                                         res);
                                }
                                res = -EIO;
                                goto error_exit_free_blocks;
                        }
                        /* Insert item into the tree with the data as its body */
                        res = reiserfs_insert_item(th, &path, &key, &ins_ih,
                                                   inode,
                                                   (char *)(allocated_blocks +
                                                            curr_block));
                } else {
                        reiserfs_panic(inode->i_sb,
                                       "green-9010: unexpected item type for key %K\n",
                                       &key);
                }
        }
        // the caller is responsible for closing the transaction unless we
        // return an error; they are also responsible for logging the inode.
        //
        pathrelse(&path);
        /*
         * cleanup preallocation from previous writes
         * if this is a partial block write
         */
        if (write_bytes & (inode->i_sb->s_blocksize - 1))
                reiserfs_discard_prealloc(th, inode);
        reiserfs_write_unlock(inode->i_sb);

        // go through all the pages/buffers and map the buffers to newly allocated
        // blocks (so that the system knows where to write these pages later).
        curr_block = 0;
        for (i = 0; i < num_pages; i++) {
                struct page *page = prepared_pages[i];  // current page
                struct buffer_head *head = page_buffers(page);  // first buffer for a page
                int block_start, block_end;     // in-page offsets for buffers.

                if (!page_buffers(page))
                        reiserfs_panic(inode->i_sb,
                                       "green-9005: No buffers for prepared page???");

                /* For each buffer in page */
                for (bh = head, block_start = 0; bh != head || !block_start;
                     block_start = block_end, bh = bh->b_this_page) {
                        if (!bh)
                                reiserfs_panic(inode->i_sb,
                                               "green-9006: Allocated but absent buffer for a page?");
                        block_end = block_start + inode->i_sb->s_blocksize;
                        if (i == 0 && block_end <= from)
                                /* if this buffer is before requested data to map, skip it */
                                continue;
                        if (i == num_pages - 1 && block_start >= to)
                                /* If this buffer is after requested data to map, abort
                                   processing of current page */
                                break;

                        if (!buffer_mapped(bh)) {       // Ok, unmapped buffer, need to map it
                                map_bh(bh, inode->i_sb,
                                       le32_to_cpu(allocated_blocks
                                                   [curr_block]));
                                curr_block++;
                                set_buffer_new(bh);
                        }
                }
        }

        RFALSE(curr_block > blocks_to_allocate,
               "green-9007: Used too many blocks? weird");

        kfree(allocated_blocks);
        return 0;

// Need to deal with transaction here.
      error_exit_free_blocks:
        pathrelse(&path);
        // free blocks
        for (i = 0; i < blocks_to_allocate; i++)
                reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
                                    1);

      error_exit:
        if (th->t_trans_id) {
                int err;
                // update any changes we made to blk count
                mark_inode_dirty(inode);
                err = journal_end(th, inode->i_sb,
                                  JOURNAL_PER_BALANCE_CNT * 3 + 1 +
                                  2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
                if (err)
                        res = err;
        }
        reiserfs_write_unlock(inode->i_sb);
        kfree(allocated_blocks);

        return res;
}

/* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
static void reiserfs_unprepare_pages(struct page **prepared_pages,      /* list of locked pages */
                                     size_t num_pages /* amount of pages */ )
{
        int i;                  // loop counter

        for (i = 0; i < num_pages; i++) {
                struct page *page = prepared_pages[i];

                try_to_free_buffers(page);
                unlock_page(page);
                page_cache_release(page);
        }
}

/* This function will copy data from userspace to the specified pages within
   the supplied byte range */
static int reiserfs_copy_from_user_to_file_region(loff_t pos,   /* In-file position */
                                                  int num_pages,        /* Number of pages affected */
                                                  int write_bytes,      /* Amount of bytes to write */
                                                  struct page **prepared_pages, /* pointer to
                                                                                   array of
                                                                                   prepared pages */
                                                  const char __user * buf       /* Pointer to user-supplied
                                                                                   data */
    )
{
        long page_fault = 0;    // status of copy_from_user.
        int i;                  // loop counter.
        int offset;             // offset in page

        for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
             i++, offset = 0) {
                size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);    // How many bytes to write to this page
                struct page *page = prepared_pages[i];  // Current page we process.

                fault_in_pages_readable(buf, count);
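                /* (Faulting the user buffer in ahead of time is presumably
                   done because the destination page is held locked here: a
                   page fault taken inside __copy_from_user that needed this
                   very page could deadlock or force a short copy.) */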

                /* Copy data from userspace to the current page */
                kmap(page);
                page_fault = __copy_from_user(page_address(page) + offset, buf, count); // Copy the data.
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
                kunmap(page);
                buf += count;
                write_bytes -= count;

                if (page_fault)
                        break;  // Was there a fault? abort.
        }

        return page_fault ? -EFAULT : 0;
}

/* taken from fs/buffer.c:__block_commit_write */
int reiserfs_commit_page(struct inode *inode, struct page *page,
                         unsigned from, unsigned to)
{
        unsigned block_start, block_end;
        int partial = 0;
        unsigned blocksize;
        struct buffer_head *bh, *head;
        unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
        int new;
        int logit = reiserfs_file_data_log(inode);
        struct super_block *s = inode->i_sb;
        int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
        struct reiserfs_transaction_handle th;
        int ret = 0;

        th.t_trans_id = 0;
        blocksize = 1 << inode->i_blkbits;

        if (logit) {
                reiserfs_write_lock(s);
                ret = journal_begin(&th, s, bh_per_page + 1);
                if (ret)
                        goto drop_write_lock;
                reiserfs_update_inode_transaction(inode);
        }
        for (bh = head = page_buffers(page), block_start = 0;
             bh != head || !block_start;
             block_start = block_end, bh = bh->b_this_page) {

                new = buffer_new(bh);
                clear_buffer_new(bh);
                block_end = block_start + blocksize;
                if (block_end <= from || block_start >= to) {
                        if (!buffer_uptodate(bh))
                                partial = 1;
                } else {
                        set_buffer_uptodate(bh);
                        if (logit) {
                                reiserfs_prepare_for_journal(s, bh, 1);
                                journal_mark_dirty(&th, s, bh);
                        } else if (!buffer_dirty(bh)) {
                                mark_buffer_dirty(bh);
                                /* do data=ordered on any page past the end
                                 * of file and any buffer marked BH_New.
                                 */
                                if (reiserfs_data_ordered(inode->i_sb) &&
                                    (new || page->index >= i_size_index)) {
                                        reiserfs_add_ordered_list(inode, bh);
                                }
                        }
                }
        }
        if (logit) {
                ret = journal_end(&th, s, bh_per_page + 1);
              drop_write_lock:
                reiserfs_write_unlock(s);
        }
        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus readpage() for
         * the next read(). Here we 'discover' whether the page went
         * uptodate as a result of this (potentially partial) write.
         */
        if (!partial)
                SetPageUptodate(page);
        return ret;
}

/* Submit pages for write. This was separated from the actual file copying
   because we might want to allocate block numbers in-between.
   This function assumes that the caller will adjust the file size to the
   correct value. */
static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th,
                                                 struct inode *inode,
                                                 loff_t pos,    /* Writing position offset */
                                                 size_t num_pages,      /* Number of pages to write */
                                                 size_t write_bytes,    /* number of bytes to write */
                                                 struct page **prepared_pages   /* list of pages */
    )
{
        int status;             // return status of block_commit_write.
        int retval = 0;         // Return value we are going to return.
        int i;                  // loop counter
        int offset;             // Writing offset in page.
        int orig_write_bytes = write_bytes;
        int sd_update = 0;

        for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
             i++, offset = 0) {
                int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);  // How many bytes to write to this page
                struct page *page = prepared_pages[i];  // Current page we process.

                status = reiserfs_commit_page(inode, page, offset, offset + count);
                if (status)
                        retval = status;        // To not overcomplicate matters we are going
                                                // to submit all the pages even if there was
                                                // an error. We only remember the error status
                                                // to report it on exit.
                write_bytes -= count;
        }
        /* now that we've gotten all the ordered buffers marked dirty,
         * we can safely update i_size and close any running transaction
         */
        if (pos + orig_write_bytes > inode->i_size) {
                inode->i_size = pos + orig_write_bytes; // Set new size
                /* If the file has grown so much that tail packing is no
                 * longer possible, reset the "need to pack" flag */
                if ((have_large_tails(inode->i_sb) &&
                     inode->i_size > i_block_size(inode) * 4) ||
                    (have_small_tails(inode->i_sb) &&
                     inode->i_size > i_block_size(inode)))
                        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
                else if ((have_large_tails(inode->i_sb) &&
                          inode->i_size < i_block_size(inode) * 4) ||
                         (have_small_tails(inode->i_sb) &&
                          inode->i_size < i_block_size(inode)))
                        REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
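                /* In concrete terms (illustrative, assuming 4k blocks): with
                   the "large tails" policy a file growing past 16k loses the
                   pack-on-close flag, while one still under 16k (re)gains it;
                   with "small tails" the threshold is one block, 4k. */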

                if (th->t_trans_id) {
                        reiserfs_write_lock(inode->i_sb);
                        // this sets the proper flags for O_SYNC to trigger a commit
                        mark_inode_dirty(inode);
                        reiserfs_write_unlock(inode->i_sb);
                } else {
                        reiserfs_write_lock(inode->i_sb);
                        reiserfs_update_inode_transaction(inode);
                        mark_inode_dirty(inode);
                        reiserfs_write_unlock(inode->i_sb);
                }

                sd_update = 1;
        }
        if (th->t_trans_id) {
                reiserfs_write_lock(inode->i_sb);
                if (!sd_update)
                        mark_inode_dirty(inode);
                status = journal_end(th, th->t_super, th->t_blocks_allocated);
                if (status)
                        retval = status;
                reiserfs_write_unlock(inode->i_sb);
        }
        th->t_trans_id = 0;

        /*
         * we have to unlock the pages after updating i_size, otherwise
         * we race with writepage
         */
        for (i = 0; i < num_pages; i++) {
                struct page *page = prepared_pages[i];
                unlock_page(page);
                mark_page_accessed(page);
                page_cache_release(page);
        }
        return retval;
}

/* Look if the passed writing region is going to touch the file's tail
   (if it is present). And if it is, convert the tail to an unformatted node */
static int reiserfs_check_for_tail_and_convert(struct inode *inode,     /* inode to deal with */
                                               loff_t pos,      /* Writing position */
                                               int write_bytes  /* amount of bytes to write */
    )
{
        INITIALIZE_PATH(path);  // needed for search_for_position
        struct cpu_key key;     // Key that would represent last touched writing byte.
        struct item_head *ih;   // item header of found block;
        int res;                // Return value of various functions we call.
        int cont_expand_offset; // We will put offset for generic_cont_expand here.
                                // This can be int just because tails are created
                                // only for small files.

/* this embodies a dependency on a particular tail policy */
        if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
                /* such big files do not have tails, so we won't bother
                   looking for one, simply return */
                return 0;
        }
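        /* (For example, with a 4k block size this early-out covers every
           file of 16k or more; only smaller files can carry a tail under
           this policy.) */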

        reiserfs_write_lock(inode->i_sb);
        /* find the item containing the last byte to be written, or if
         * writing past the end of the file then the last item of the
         * file (and then we check its type). */
        make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
                     3 /*key length */ );
        res = search_for_position_by_key(inode->i_sb, &key, &path);
        if (res == IO_ERROR) {
                reiserfs_write_unlock(inode->i_sb);
                return -EIO;
        }
        ih = get_ih(&path);
        res = 0;
        if (is_direct_le_ih(ih)) {
                /* Ok, closest item is file tail (tails are stored in "direct"
                 * items), so we need to unpack it. */
                /* To not overcomplicate matters, we just call generic_cont_expand,
                   which will in turn call other stuff and finally will boil down to
                   reiserfs_get_block() that would do the necessary conversion. */
                cont_expand_offset =
                    le_key_k_offset(get_inode_item_key_version(inode),
                                    &(ih->ih_key));
                pathrelse(&path);
                res = generic_cont_expand(inode, cont_expand_offset);
        } else
                pathrelse(&path);

        reiserfs_write_unlock(inode->i_sb);
        return res;
}

/* This function locks pages starting from @pos for @inode.
   @num_pages pages are locked and stored in the
   @prepared_pages array. Buffers are also allocated for these pages.
   The first and last pages of the region are read in if they are only
   partially overwritten. If the last page did not exist before the write
   (file hole or file append), it is zeroed instead.
   Returns the number of unallocated blocks that should be allocated to cover
   the new file data. */
static int reiserfs_prepare_file_region_for_write(struct inode *inode
                                                  /* Inode of the file */ ,
                                                  loff_t pos,   /* position in the file */
                                                  size_t num_pages,     /* number of pages to
                                                                           prepare */
                                                  size_t write_bytes,   /* Amount of bytes to be
                                                                           overwritten from
                                                                           @pos */
                                                  struct page **prepared_pages  /* pointer to array
                                                                                   where to store
                                                                                   prepared pages */
    )
{
        int res = 0;            // Return values of different functions we call.
        unsigned long index = pos >> PAGE_CACHE_SHIFT;  // Offset in file in pages.
        int from = (pos & (PAGE_CACHE_SIZE - 1));       // Writing offset in first page
        int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
        /* offset of last modified byte in last page */
        struct address_space *mapping = inode->i_mapping;       // Pages are mapped here.
        int i;                  // Simple counter
        int blocks = 0;         /* Return value (blocks that should be allocated) */
        struct buffer_head *bh, *head;  // Current bufferhead and first bufferhead
                                        // of a page.
        unsigned block_start, block_end;        // Starting and ending offsets of current
                                                // buffer in the page.
        struct buffer_head *wait[2], **wait_bh = wait;  // Buffers for the page, if
                                                        // the page appeared to be not
                                                        // up to date. Note how we have
                                                        // at most 2 buffers: this is
                                                        // because we may at most
                                                        // partially overwrite two
                                                        // buffers for one page, one at
                                                        // the beginning and one at the
                                                        // end of the write area.
                                                        // Everything in the middle gets
                                                        // overwritten totally.

        struct cpu_key key;     // cpu key of item that we are going to deal with
        struct item_head *ih = NULL;    // pointer to item head that we are going to deal with
        struct buffer_head *itembuf = NULL;     // Buffer head that contains items that we are going to deal with
        INITIALIZE_PATH(path);  // path to item that we are going to deal with
        __le32 *item = NULL;    // pointer to item we are going to deal with
        int item_pos = -1;      /* Position in indirect item */

        if (num_pages < 1) {
                reiserfs_warning(inode->i_sb,
                                 "green-9001: reiserfs_prepare_file_region_for_write "
                                 "called with zero number of pages to process");
                return -EFAULT;
        }

        /* We have 2 loops for pages. In the first loop we grab and lock the
           pages, so that nobody would touch these until we release the pages.
           Then we'd start to deal with mapping buffers to blocks. */
        for (i = 0; i < num_pages; i++) {
                prepared_pages[i] = grab_cache_page(mapping, index + i);        // locks the page
                if (!prepared_pages[i]) {
                        res = -ENOMEM;
                        goto failed_page_grabbing;
                }
                if (!page_has_buffers(prepared_pages[i]))
                        create_empty_buffers(prepared_pages[i],
                                             inode->i_sb->s_blocksize, 0);
        }
1021
1022         /* Let's count the number of blocks for the case where all the blocks
1023            overwritten are new (we will subtract already allocated blocks later) */
1024         if (num_pages > 2)
1025                 /* These are fully overwritten pages, so all the blocks in
1026                    these pages are counted as needing to be allocated */
1027                 blocks =
1028                     (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1029
1030         /* count blocks needed for first page (possibly partially written) */
1031         blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));   /* roundup */
1032
1033         /* Now we account for the last page. If the last page == the first page
1034            (we overwrite only one page), we subtract all the blocks past the
1035            last writing position in the page from the already calculated number
1036            of blocks */
1037         blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
1038             ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
1039         /* Note how we do not roundup here since partial blocks still
1040            should be allocated */
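        /* A worked example with hypothetical numbers: with PAGE_CACHE_SIZE = 4096,
           a 1024-byte blocksize, pos = 1000 and write_bytes = 10000, the write
           covers bytes 1000..10999, i.e. file blocks 0..10. Then num_pages = 3,
           from = 1000 and to = 2808, so the middle page contributes
           (3 - 2) << 2 = 4 blocks, the first page ((4096 - 1000) >> 10) + 1 = 4,
           and the last page (1 << 2) - ((4096 - 2808) >> 10) = 3:
           11 blocks in total, matching blocks 0..10. */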
1041
1042         /* Now, if the whole write area lies past the file end, there is no point
1043            in mapping blocks, since there are none, so we just zero out the remaining
1044            parts of the first and last pages in the write area (if needed) */
1045         if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
1046                 if (from != 0) {        /* First page needs to be partially zeroed */
1047                         char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
1048                         memset(kaddr, 0, from);
1049                         kunmap_atomic(kaddr, KM_USER0);
1050                 }
1051                 if (to != PAGE_CACHE_SIZE) {    /* Last page needs to be partially zeroed */
1052                         char *kaddr =
1053                             kmap_atomic(prepared_pages[num_pages - 1],
1054                                         KM_USER0);
1055                         memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
1056                         kunmap_atomic(kaddr, KM_USER0);
1057                 }
1058
1059                 /* Since all blocks are new - use already calculated value */
1060                 return blocks;
1061         }
1062
1063         /* Well, since we write somewhere into the middle of a file, there is a
1064            possibility that we are writing over some already allocated blocks, so
1065            let's map these blocks and subtract the number of such blocks from the
1066            blocks we need to allocate (calculated above) */
1067         /* Mask the write position to start on a block boundary; we do it
1068            outside the loop for performance reasons */
1069         pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
1070         /* Set cpu key to the starting position in a file (on left block boundary) */
1071         make_cpu_key(&key, inode,
1072                      1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
1073                      TYPE_ANY, 3 /*key length */ );
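        /* The "1 +" reflects that byte offsets in reiserfs keys are 1-based:
           offset 1 is the key of the first byte of the file body. */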
1074
1075         reiserfs_write_lock(inode->i_sb);       // We need that for at least search_by_key()
1076         for (i = 0; i < num_pages; i++) {
1077
1078                 head = page_buffers(prepared_pages[i]);
1079                 /* For each buffer in the page */
1080                 for (bh = head, block_start = 0; bh != head || !block_start;
1081                      block_start = block_end, bh = bh->b_this_page) {
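                        /* (page buffers form a circular list, so "bh != head"
                           alone would stop us immediately; the "!block_start"
                           test lets the first pass, where bh == head, run) */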
1082                         if (!bh)
1083                                 reiserfs_panic(inode->i_sb,
1084                                                "green-9002: Allocated but absent buffer for a page?");
1085                         /* Find where this buffer ends */
1086                         block_end = block_start + inode->i_sb->s_blocksize;
1087                         if (i == 0 && block_end <= from)
1088                                 /* if this buffer is before requested data to map, skip it */
1089                                 continue;
1090
1091                         if (i == num_pages - 1 && block_start >= to) {
1092                                 /* If this buffer is after requested data to map, abort
1093                                    processing of current page */
1094                                 break;
1095                         }
1096
1097                         if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1098                                 /* This is an optimisation for the case where the buffer is
1099                                    mapped and has a block number assigned. If a significant
1100                                    number of such buffers are present, we may avoid some
1101                                    search_by_key calls.
1102                                    It would probably be possible to move parts of this code
1103                                    out of the BKL, but I am afraid that would overcomplicate
1104                                    the code without any noticeable benefit.
1105                                  */
1106                                 item_pos++;
1107                                 /* Update the key */
1108                                 set_cpu_key_k_offset(&key,
1109                                                      cpu_key_k_offset(&key) +
1110                                                      inode->i_sb->s_blocksize);
1111                                 blocks--;       // Decrease the number of blocks that need
1112                                 // to be allocated
1113                                 continue;       // Go to the next buffer
1114                         }
1115
1116                         if (!itembuf || /* if first iteration */
1117                             item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {        /* or if we progressed past the
1118                                                                                    current unformatted_item */
1119                                 /* Try to find next item */
1120                                 res =
1121                                     search_for_position_by_key(inode->i_sb,
1122                                                                &key, &path);
1123                                 /* Abort if no more items */
1124                                 if (res != POSITION_FOUND) {
1125                                         /* make sure later loops don't use this item */
1126                                         itembuf = NULL;
1127                                         item = NULL;
1128                                         break;
1129                                 }
1130
1131                                 /* Update information about current indirect item */
1132                                 itembuf = get_last_bh(&path);
1133                                 ih = get_ih(&path);
1134                                 item = get_item(&path);
1135                                 item_pos = path.pos_in_item;
1136
1137                                 RFALSE(!is_indirect_le_ih(ih),
1138                                        "green-9003: indirect item expected");
1139                         }
1140
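                        /* The body of an indirect item is an array of __le32
                           block numbers, one per unformatted node; a zero
                           entry denotes a hole, which is why only nonzero
                           entries get mapped below. */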
1141                         /* See if there is some block associated with the file
1142                            at that position, map the buffer to this block */
1143                         if (get_block_num(item, item_pos)) {
1144                                 map_bh(bh, inode->i_sb,
1145                                        get_block_num(item, item_pos));
1146                                 blocks--;       // Decrease the number of blocks that need
1147                                 // to be allocated
1148                         }
1149                         item_pos++;
1150                         /* Update the key */
1151                         set_cpu_key_k_offset(&key,
1152                                              cpu_key_k_offset(&key) +
1153                                              inode->i_sb->s_blocksize);
1154                 }
1155         }
1156         pathrelse(&path);       // Free the path
1157         reiserfs_write_unlock(inode->i_sb);
1158
1159         /* Now zero out the unmapped buffers for the first and last pages of the
1160            write area, or issue read requests if the buffer is mapped. */
1161         /* First page, see if it is not uptodate */
1162         if (!PageUptodate(prepared_pages[0])) {
1163                 head = page_buffers(prepared_pages[0]);
1164
1165                 /* For each buffer in page */
1166                 for (bh = head, block_start = 0; bh != head || !block_start;
1167                      block_start = block_end, bh = bh->b_this_page) {
1168
1169                         if (!bh)
1170                                 reiserfs_panic(inode->i_sb,
1171                                                "green-9002: Allocated but absent buffer for a page?");
1172                         /* Find where this buffer ends */
1173                         block_end = block_start + inode->i_sb->s_blocksize;
1174                         if (block_end <= from)
1175                                 /* if this buffer is before requested data to map, skip it */
1176                                 continue;
1177                         if (block_start < from) {       /* Aha, our partial buffer */
1178                                 if (buffer_mapped(bh)) {        /* If it is mapped, we need to
1179                                                                    issue a READ request for it
1180                                                                    so as not to lose data */
1181                                         ll_rw_block(READ, 1, &bh);
1182                                         *wait_bh++ = bh;
1183                                 } else {        /* Not mapped, zero it */
1184                                         char *kaddr =
1185                                             kmap_atomic(prepared_pages[0],
1186                                                         KM_USER0);
1187                                         memset(kaddr + block_start, 0,
1188                                                from - block_start);
1189                                         kunmap_atomic(kaddr, KM_USER0);
1190                                         set_buffer_uptodate(bh);
1191                                 }
1192                         }
1193                 }
1194         }
1195
1196         /* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
1197         if (!PageUptodate(prepared_pages[num_pages - 1]) ||
1198             ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
1199             (inode->i_size >> PAGE_CACHE_SHIFT)) {
1200                 head = page_buffers(prepared_pages[num_pages - 1]);
1201
1202                 /* for each buffer in page */
1203                 for (bh = head, block_start = 0; bh != head || !block_start;
1204                      block_start = block_end, bh = bh->b_this_page) {
1205
1206                         if (!bh)
1207                                 reiserfs_panic(inode->i_sb,
1208                                                "green-9002: Allocated but absent buffer for a page?");
1209                         /* Find where this buffer ends */
1210                         block_end = block_start + inode->i_sb->s_blocksize;
1211                         if (block_start >= to)
1212                                 /* if this buffer is after requested data to map, skip it */
1213                                 break;
1214                         if (block_end > to) {   /* Aha, our partial buffer */
1215                                 if (buffer_mapped(bh)) {        /* If it is mapped, we need to
1216                                                                    issue a READ request for it
1217                                                                    so as not to lose data */
1218                                         ll_rw_block(READ, 1, &bh);
1219                                         *wait_bh++ = bh;
1220                                 } else {        /* Not mapped, zero it */
1221                                         char *kaddr =
1222                                             kmap_atomic(prepared_pages
1223                                                         [num_pages - 1],
1224                                                         KM_USER0);
1225                                         memset(kaddr + to, 0, block_end - to);
1226                                         kunmap_atomic(kaddr, KM_USER0);
1227                                         set_buffer_uptodate(bh);
1228                                 }
1229                         }
1230                 }
1231         }
1232
1233         /* Wait for the read requests we issued to complete, if any */
1234         while (wait_bh > wait) {
1235                 wait_on_buffer(*--wait_bh);
1236                 if (!buffer_uptodate(*wait_bh)) {
1237                         res = -EIO;
1238                         goto failed_read;
1239                 }
1240         }
1241
1242         return blocks;
1243       failed_page_grabbing:
1244         num_pages = i;
1245       failed_read:
1246         reiserfs_unprepare_pages(prepared_pages, num_pages);
1247         return res;
1248 }
1249
1250 /* Write @count bytes at position @ppos in a file indicated by @file
1251    from the buffer @buf.  
1252
1253    generic_file_write() is only appropriate for filesystems that are not seeking to optimize performance and want
1254    something simple that works.  It is not for serious use by general purpose filesystems, excepting the one that it was
1255    written for (ext2/3).  This is for several reasons:
1256
1257    * It has no understanding of any filesystem specific optimizations.
1258
1259    * It enters the filesystem repeatedly for each page that is written.
1260
1261    * It depends on the reiserfs_get_block() function, which as implemented by reiserfs performs a costly search_by_key
1262    * operation for each page it is supplied with. By contrast, reiserfs_file_write() feeds as much as possible at a time
1263    * to reiserfs, which allows for fewer tree traversals.
1264
1265    * Each indirect pointer insertion takes a lot of cpu, because it involves memory moves inside of blocks.
1266
1267    * Asking the block allocation code for blocks one at a time is slightly less efficient.
1268
1269    All of these reasons for not using only generic file write were understood back when reiserfs was first miscoded to
1270    use it, but we were in a hurry to make the code freeze, and so it couldn't be revised then.  This new code should make
1271    things right finally.
1272
1273    Future Features: providing search_by_key with hints.
1274
1275 */
1276 static ssize_t reiserfs_file_write(struct file *file,   /* the file we are going to write into */
1277                                    const char __user * buf,     /*  pointer to user supplied data
1278                                                                    (in userspace) */
1279                                    size_t count,        /* number of bytes to write */
1280                                    loff_t * ppos        /* pointer to position in file that we start writing at. Should be updated to
1281                                                          * new current position before returning. */
1282                                    )
1283 {
1284         size_t already_written = 0;     // Number of bytes already written to the file.
1285         loff_t pos;             // Current position in the file.
1286         ssize_t res;            // return value of various functions that we call.
1287         int err = 0;
1288         struct inode *inode = file->f_dentry->d_inode;  // Inode of the file that we are writing to.
1289         /* To simplify coding at this time, we store the
1290            locked pages in an array for now */
1291         struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
1292         struct reiserfs_transaction_handle th;
1293         th.t_trans_id = 0;
1294
1295         /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items
1296         * lying around (most of the disk, in fact). Despite the filesystem
1297         * now being a v3.6 format, the old items still can't support large
1298         * file sizes. Catch this case here, as the rest of the VFS layer is
1299         * oblivious to the different limitations between old and new items.
1300         * reiserfs_setattr catches this for truncates. This chunk is lifted
1301         * from generic_write_checks. */
1302         if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 &&
1303             *ppos + count > MAX_NON_LFS) {
1304                 if (*ppos >= MAX_NON_LFS) {
1305                         send_sig(SIGXFSZ, current, 0);
1306                         return -EFBIG;
1307                 }
1308                 if (count > MAX_NON_LFS - (unsigned long)*ppos)
1309                         count = MAX_NON_LFS - (unsigned long)*ppos;
1310         }
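        /* For example, with MAX_NON_LFS = (1UL << 31) - 1, a 4096-byte write
           starting at *ppos = MAX_NON_LFS - 99 is clamped to 99 bytes here. */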
1311
1312         if (file->f_flags & O_DIRECT) { // Direct IO needs treatment
1313                 ssize_t result, after_file_end = 0;
1314                 if ((*ppos + count >= inode->i_size)
1315                     || (file->f_flags & O_APPEND)) {
1316                         /* If we are appending to a file, we need to put this savelink in here.
1317                            If we crash while doing direct io, finish_unfinished will
1318                            cut the garbage from the file end. */
1319                         reiserfs_write_lock(inode->i_sb);
1320                         err =
1321                             journal_begin(&th, inode->i_sb,
1322                                           JOURNAL_PER_BALANCE_CNT);
1323                         if (err) {
1324                                 reiserfs_write_unlock(inode->i_sb);
1325                                 return err;
1326                         }
1327                         reiserfs_update_inode_transaction(inode);
1328                         add_save_link(&th, inode, 1 /* Truncate */ );
1329                         after_file_end = 1;
1330                         err =
1331                             journal_end(&th, inode->i_sb,
1332                                         JOURNAL_PER_BALANCE_CNT);
1333                         reiserfs_write_unlock(inode->i_sb);
1334                         if (err)
1335                                 return err;
1336                 }
1337                 result = do_sync_write(file, buf, count, ppos);
1338
1339                 if (after_file_end) {   /* Now update i_size and remove the savelink */
1340                         struct reiserfs_transaction_handle th;
1341                         reiserfs_write_lock(inode->i_sb);
1342                         err = journal_begin(&th, inode->i_sb, 1);
1343                         if (err) {
1344                                 reiserfs_write_unlock(inode->i_sb);
1345                                 return err;
1346                         }
1347                         reiserfs_update_inode_transaction(inode);
1348                         mark_inode_dirty(inode);
1349                         err = journal_end(&th, inode->i_sb, 1);
1350                         if (err) {
1351                                 reiserfs_write_unlock(inode->i_sb);
1352                                 return err;
1353                         }
1354                         err = remove_save_link(inode, 1 /* truncate */ );
1355                         reiserfs_write_unlock(inode->i_sb);
1356                         if (err)
1357                                 return err;
1358                 }
1359
1360                 return result;
1361         }
1362
1363         if (unlikely((ssize_t) count < 0))
1364                 return -EINVAL;
1365
1366         if (unlikely(!access_ok(VERIFY_READ, buf, count)))
1367                 return -EFAULT;
1368
1369         mutex_lock(&inode->i_mutex);    // locks the entire file for just us
1370
1371         pos = *ppos;
1372
1373         /* Check that we can write to the specified region of the file and
1374            that the file does not exceed its size limits. Adjust pos and
1375            count, if needed */
1376         res = generic_write_checks(file, &pos, &count, 0);
1377         if (res)
1378                 goto out;
1379
1380         if (count == 0)
1381                 goto out;
1382
1383         res = remove_suid(file->f_dentry);
1384         if (res)
1385                 goto out;
1386
1387         file_update_time(file);
1388
1389         // Ok, we are done with all the checks.
1390
1391         // Now we should start real work
1392
1393         /* If we are going to write past the file's packed tail, or if we are going
1394            to overwrite part of the tail, we need that tail to be converted into an
1395            unformatted node */
1396         res = reiserfs_check_for_tail_and_convert(inode, pos, count);
1397         if (res)
1398                 goto out;
1399
1400         while (count > 0) {
1401                 /* This is the main loop, in which we run until some error occurs
1402                    or until we have written all of the data. */
1403                 size_t num_pages;       /* number of pages we are going to write this iteration */
1404                 size_t write_bytes;     /* number of bytes to write during this iteration */
1405                 size_t blocks_to_allocate;      /* how many blocks we need to allocate for this iteration */
1406
1407                 /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
1408                 num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
1409                                                                            pages */
1410                     ((count +
1411                       (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
1412                 /* convert size to number of
1413                    pages */
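                /* E.g., with hypothetical values PAGE_CACHE_SIZE = 4096,
                   pos = 1000 and count = 10000 this yields
                   !!(11000 & 4095) + (11000 >> 12) = 1 + 2 = 3 pages. */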
1414                 reiserfs_write_lock(inode->i_sb);
1415                 if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
1416                     || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
1417                         /* If we were asked to write more data than we want to, or if there
1418                            is not that much space, then we shorten the amount of data to write
1419                            for this iteration. */
1420                         num_pages =
1421                             min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
1422                                   reiserfs_can_fit_pages(inode->i_sb));
1423                         /* Also we should not forget to set size in bytes accordingly */
1424                         write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
1425                             (pos & (PAGE_CACHE_SIZE - 1));
1426                         /* If the position is not at the
1427                            start of the page, we need
1428                            to subtract the offset
1429                            within the page */
1430                 } else
1431                         write_bytes = count;
1432
1433                 /* reserve the blocks to be allocated later, so that later on
1434                    we still have the space to write the blocks to */
1435                 reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1436                                                       num_pages <<
1437                                                       (PAGE_CACHE_SHIFT -
1438                                                        inode->i_blkbits));
1439                 reiserfs_write_unlock(inode->i_sb);
1440
1441                 if (!num_pages) {       /* If we do not have enough space even for a single page... */
1442                         if (pos >
1443                             inode->i_size + inode->i_sb->s_blocksize -
1444                             (pos & (inode->i_sb->s_blocksize - 1))) {
1445                                 res = -ENOSPC;
1446                                 break;  // In case we are writing past the end of the last file block, break.
1447                         }
1448                         // Otherwise we are possibly overwriting the file, so
1449                         // let's set the write size to be equal to or less than the blocksize.
1450                         // This way we get it right for file holes.
1451                         // But overwriting files on absolutely full volumes would not
1452                         // be very efficient. Well, people are not supposed to fill
1453                         // 100% of disk space anyway.
1454                         write_bytes =
1455                             min_t(size_t, count,
1456                                   inode->i_sb->s_blocksize -
1457                                   (pos & (inode->i_sb->s_blocksize - 1)));
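                        /* E.g., with a hypothetical 4096-byte blocksize and
                           pos = 6000, write_bytes is capped at
                           4096 - (6000 & 4095) = 2192 bytes, so the write
                           never crosses a block boundary. */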
1458                         num_pages = 1;
1459                         // No blocks were claimed before, so do it now.
1460                         reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
1461                                                               1 << (PAGE_CACHE_SHIFT -
1462                                                                     inode->i_blkbits));
1466                 }
1467
1468                 /* Prepare for writing into the region: read in all the
1469                    partially overwritten pages, if needed, and lock the pages
1470                    so that nobody else can access them until we are done.
1471                    We get the number of blocks actually needed as a result. */
1472                 res = reiserfs_prepare_file_region_for_write(inode, pos,
1473                                                              num_pages,
1474                                                              write_bytes,
1475                                                              prepared_pages);
1476                 if (res < 0) {
1477                         reiserfs_release_claimed_blocks(inode->i_sb,
1478                                                         num_pages <<
1479                                                         (PAGE_CACHE_SHIFT -
1480                                                          inode->i_blkbits));
1481                         break;
1482                 }
1483
1484                 blocks_to_allocate = res;
1485
1486                 /* First we correct our estimate of how many blocks we need */
1487                 reiserfs_release_claimed_blocks(inode->i_sb,
1488                                                 (num_pages <<
1489                                                  (PAGE_CACHE_SHIFT -
1490                                                   inode->i_sb->
1491                                                   s_blocksize_bits)) -
1492                                                 blocks_to_allocate);
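        /* (inode->i_blkbits and inode->i_sb->s_blocksize_bits should be equal
           here, so this releases exactly the surplus of the page-based
           estimate over blocks_to_allocate.) */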
1493
1494                 if (blocks_to_allocate > 0) {   /*We only allocate blocks if we need to */
1495                         /* Fill in all the possible holes and append the file if needed */
1496                         res =
1497                             reiserfs_allocate_blocks_for_region(&th, inode, pos,
1498                                                                 num_pages,
1499                                                                 write_bytes,
1500                                                                 prepared_pages,
1501                                                                 blocks_to_allocate);
1502                 }
1503
1504                 /* well, we have allocated the blocks, so it is time to free
1505                    the reservation we made earlier. */
1506                 reiserfs_release_claimed_blocks(inode->i_sb,
1507                                                 blocks_to_allocate);
1508                 if (res) {
1509                         reiserfs_unprepare_pages(prepared_pages, num_pages);
1510                         break;
1511                 }
1512
1513 /* NOTE that allocating blocks and filling blocks could be done in the reverse
1514    order, and we would probably do that just to avoid leaving garbage in files
1515    after a crash */
1516
1517                 /* Copy data from user-supplied buffer to file's pages */
1518                 res =
1519                     reiserfs_copy_from_user_to_file_region(pos, num_pages,
1520                                                            write_bytes,
1521                                                            prepared_pages, buf);
1522                 if (res) {
1523                         reiserfs_unprepare_pages(prepared_pages, num_pages);
1524                         break;
1525                 }
1526
1527                 /* Send the pages to disk and unlock them. */
1528                 res =
1529                     reiserfs_submit_file_region_for_write(&th, inode, pos,
1530                                                           num_pages,
1531                                                           write_bytes,
1532                                                           prepared_pages);
1533                 if (res)
1534                         break;
1535
1536                 already_written += write_bytes;
1537                 buf += write_bytes;
1538                 *ppos = pos += write_bytes;
1539                 count -= write_bytes;
1540                 balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
1541         }
1542
1543         /* this is only true on error */
1544         if (th.t_trans_id) {
1545                 reiserfs_write_lock(inode->i_sb);
1546                 err = journal_end(&th, th.t_super, th.t_blocks_allocated);
1547                 reiserfs_write_unlock(inode->i_sb);
1548                 if (err) {
1549                         res = err;
1550                         goto out;
1551                 }
1552         }
1553
1554         if (likely(res >= 0) &&
1555             (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))))
1556                 res = generic_osync_inode(inode, file->f_mapping,
1557                                           OSYNC_METADATA | OSYNC_DATA);
1558
1559         mutex_unlock(&inode->i_mutex);
1560         reiserfs_async_progress_wait(inode->i_sb);
1561         return (already_written != 0) ? already_written : res;
1562
1563       out:
1564         mutex_unlock(&inode->i_mutex);  // unlock the file on exit.
1565         return res;
1566 }
1567
1568 const struct file_operations reiserfs_file_operations = {
1569         .read = do_sync_read,
1570         .write = reiserfs_file_write,
1571         .ioctl = reiserfs_ioctl,
1572 #ifdef CONFIG_COMPAT
1573         .compat_ioctl = reiserfs_compat_ioctl,
1574 #endif
1575         .mmap = generic_file_mmap,
1576         .open = generic_file_open,
1577         .release = reiserfs_file_release,
1578         .fsync = reiserfs_sync_file,
1579         .sendfile = generic_file_sendfile,
1580         .aio_read = generic_file_aio_read,
1581         .aio_write = generic_file_aio_write,
1582         .splice_read = generic_file_splice_read,
1583         .splice_write = generic_file_splice_write,
1584 };
1585
1586 struct inode_operations reiserfs_file_inode_operations = {
1587         .truncate = reiserfs_vfs_truncate_file,
1588         .setattr = reiserfs_setattr,
1589         .setxattr = reiserfs_setxattr,
1590         .getxattr = reiserfs_getxattr,
1591         .listxattr = reiserfs_listxattr,
1592         .removexattr = reiserfs_removexattr,
1593         .permission = reiserfs_permission,
1594 };