fs/ocfs2/file.c

   1 /* -*- mode: c; c-basic-offset: 8; -*-
   2  * vim: noexpandtab sw=8 ts=8 sts=0:
   3  *
   4  * file.c
   5  *
   6  * File open, close, extend, truncate
   7  *
   8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
   9  *
  10  * This program is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2 of the License, or (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public
  21  * License along with this program; if not, write to the
  22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23  * Boston, MA 021110-1307, USA.
  24  */
  25
  26 #include <linux/fs.h>
  27 #include <linux/types.h>
  28 #include <linux/slab.h>
  29 #include <linux/highmem.h>
  30 #include <linux/pagemap.h>
  31 #include <linux/uio.h>
  32
  33 #define MLOG_MASK_PREFIX ML_INODE
  34 #include <cluster/masklog.h>
  35
  36 #include "ocfs2.h"
  37
  38 #include "alloc.h"
  39 #include "aops.h"
  40 #include "dir.h"
  41 #include "dlmglue.h"
  42 #include "extent_map.h"
  43 #include "file.h"
  44 #include "sysfile.h"
  45 #include "inode.h"
  46 #include "journal.h"
  47 #include "mmap.h"
  48 #include "suballoc.h"
  49 #include "super.h"
  50
  51 #include "buffer_head_io.h"
  52
  53 static int ocfs2_sync_inode(struct inode *inode)
  54 {
  55         filemap_fdatawrite(inode->i_mapping);
  56         return sync_mapping_buffers(inode->i_mapping);
  57 }
  58
  59 static int ocfs2_file_open(struct inode *inode, struct file *file)
  60 {
  61         int status;
  62         int mode = file->f_flags;
  63         struct ocfs2_inode_info *oi = OCFS2_I(inode);
  64
  65         mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
  66                    file->f_dentry->d_name.len, file->f_dentry->d_name.name);
  67
  68         spin_lock(&oi->ip_lock);
  69
  70         /* Check that the inode hasn't been wiped from disk by another
  71          * node. If it hasn't then we're safe as long as we hold the
  72          * spin lock until our increment of open count. */
  73         if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
  74                 spin_unlock(&oi->ip_lock);
  75
  76                 status = -ENOENT;
  77                 goto leave;
  78         }
  79
  80         if (mode & O_DIRECT)
  81                 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
  82
  83         oi->ip_open_count++;
  84         spin_unlock(&oi->ip_lock);
  85         status = 0;
  86 leave:
  87         mlog_exit(status);
  88         return status;
  89 }
  90
  91 static int ocfs2_file_release(struct inode *inode, struct file *file)
  92 {
  93         struct ocfs2_inode_info *oi = OCFS2_I(inode);
  94
  95         mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
  96                        file->f_dentry->d_name.len,
  97                        file->f_dentry->d_name.name);
  98
  99         spin_lock(&oi->ip_lock);
 100         if (!--oi->ip_open_count)
 101                 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
 102         spin_unlock(&oi->ip_lock);
 103
 104         mlog_exit(0);
 105
 106         return 0;
 107 }
 108
 109 static int ocfs2_sync_file(struct file *file,
 110                            struct dentry *dentry,
 111                            int datasync)
 112 {
 113         int err = 0;
 114         journal_t *journal;
 115         struct inode *inode = dentry->d_inode;
 116         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 117
 118         mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
 119                    dentry->d_name.len, dentry->d_name.name);
 120
 121         err = ocfs2_sync_inode(dentry->d_inode);
 122         if (err)
 123                 goto bail;
 124
 125         journal = osb->journal->j_journal;
 126         err = journal_force_commit(journal);
 127
 128 bail:
 129         mlog_exit(err);
 130
 131         return (err < 0) ? -EIO : 0;
 132 }
 133
 134 int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
 135                          struct inode *inode,
 136                          struct buffer_head *fe_bh,
 137                          u64 new_i_size)
 138 {
 139         int status;
 140
 141         mlog_entry_void();
 142         i_size_write(inode, new_i_size);
 143         inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
 144         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 145
 146         status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 147         if (status < 0) {
 148                 mlog_errno(status);
 149                 goto bail;
 150         }
 151
 152 bail:
 153         mlog_exit(status);
 154         return status;
 155 }
 156
 157 static int ocfs2_simple_size_update(struct inode *inode,
 158                                     struct buffer_head *di_bh,
 159                                     u64 new_i_size)
 160 {
 161         int ret;
 162         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 163         struct ocfs2_journal_handle *handle = NULL;
 164
 165         handle = ocfs2_start_trans(osb, NULL,
 166                                    OCFS2_INODE_UPDATE_CREDITS);
 167         if (handle == NULL) {
 168                 ret = -ENOMEM;
 169                 mlog_errno(ret);
 170                 goto out;
 171         }
 172
 173         ret = ocfs2_set_inode_size(handle, inode, di_bh,
 174                                    new_i_size);
 175         if (ret < 0)
 176                 mlog_errno(ret);
 177
 178         ocfs2_commit_trans(handle);
 179 out:
 180         return ret;
 181 }
 182
 183 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 184                                      struct inode *inode,
 185                                      struct buffer_head *fe_bh,
 186                                      u64 new_i_size)
 187 {
 188         int status;
 189         struct ocfs2_journal_handle *handle;
 190
 191         mlog_entry_void();
 192
 193         /* TODO: This needs to actually orphan the inode in this
 194          * transaction. */
 195
 196         handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
 197         if (IS_ERR(handle)) {
 198                 status = PTR_ERR(handle);
 199                 mlog_errno(status);
 200                 goto out;
 201         }
 202
 203         status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
 204         if (status < 0)
 205                 mlog_errno(status);
 206
 207         ocfs2_commit_trans(handle);
 208 out:
 209         mlog_exit(status);
 210         return status;
 211 }
 212
 213 static int ocfs2_truncate_file(struct inode *inode,
 214                                struct buffer_head *di_bh,
 215                                u64 new_i_size)
 216 {
 217         int status = 0;
 218         struct ocfs2_dinode *fe = NULL;
 219         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 220         struct ocfs2_truncate_context *tc = NULL;
 221
 222         mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
 223                    OCFS2_I(inode)->ip_blkno, new_i_size);
 224
 225         truncate_inode_pages(inode->i_mapping, new_i_size);
 226
 227         fe = (struct ocfs2_dinode *) di_bh->b_data;
 228         if (!OCFS2_IS_VALID_DINODE(fe)) {
 229                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
 230                 status = -EIO;
 231                 goto bail;
 232         }
 233
 234         mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 235                         "Inode %"MLFu64", inode i_size = %lld != di "
 236                         "i_size = %"MLFu64", i_flags = 0x%x\n",
 237                         OCFS2_I(inode)->ip_blkno,
 238                         i_size_read(inode),
 239                         le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
 240
 241         if (new_i_size > le64_to_cpu(fe->i_size)) {
 242                 mlog(0, "asked to truncate file with size (%"MLFu64") "
 243                      "to size (%"MLFu64")!\n",
 244                      le64_to_cpu(fe->i_size), new_i_size);
 245                 status = -EINVAL;
 246                 mlog_errno(status);
 247                 goto bail;
 248         }
 249
 250         mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
 251              le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
 252
 253         /* lets handle the simple truncate cases before doing any more
 254          * cluster locking. */
 255         if (new_i_size == le64_to_cpu(fe->i_size))
 256                 goto bail;
 257
 258         if (le32_to_cpu(fe->i_clusters) ==
 259             ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
 260                 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
 261                      fe->i_clusters);
 262                 /* No allocation change is required, so lets fast path
 263                  * this truncate. */
 264                 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
 265                 if (status < 0)
 266                         mlog_errno(status);
 267                 goto bail;
 268         }
 269
 270         /* This forces other nodes to sync and drop their pages */
 271         status = ocfs2_data_lock(inode, 1);
 272         if (status < 0) {
 273                 mlog_errno(status);
 274                 goto bail;
 275         }
 276         ocfs2_data_unlock(inode, 1);
 277
 278         /* alright, we're going to need to do a full blown alloc size
 279          * change. Orphan the inode so that recovery can complete the
 280          * truncate if necessary. This does the task of marking
 281          * i_size. */
 282         status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 283         if (status < 0) {
 284                 mlog_errno(status);
 285                 goto bail;
 286         }
 287
 288         status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 289         if (status < 0) {
 290                 mlog_errno(status);
 291                 goto bail;
 292         }
 293
 294         status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 295         if (status < 0) {
 296                 mlog_errno(status);
 297                 goto bail;
 298         }
 299
 300         /* TODO: orphan dir cleanup here. */
 301 bail:
 302
 303         mlog_exit(status);
 304         return status;
 305 }
 306
 307 /*
 308  * extend allocation only here.
 309  * we'll update all the disk stuff, and oip->alloc_size
 310  *
 311  * expect stuff to be locked, a transaction started and enough data /
 312  * metadata reservations in the contexts.
 313  *
 314  * Will return -EAGAIN, and a reason if a restart is needed.
 315  * If passed in, *reason will always be set, even in error.
 316  */
 317 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 318                                struct inode *inode,
 319                                u32 clusters_to_add,
 320                                struct buffer_head *fe_bh,
 321                                struct ocfs2_journal_handle *handle,
 322                                struct ocfs2_alloc_context *data_ac,
 323                                struct ocfs2_alloc_context *meta_ac,
 324                                enum ocfs2_alloc_restarted *reason_ret)
 325 {
 326         int status = 0;
 327         int free_extents;
 328         struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 329         enum ocfs2_alloc_restarted reason = RESTART_NONE;
 330         u32 bit_off, num_bits;
 331         u64 block;
 332
 333         BUG_ON(!clusters_to_add);
 334
 335         free_extents = ocfs2_num_free_extents(osb, inode, fe);
 336         if (free_extents < 0) {
 337                 status = free_extents;
 338                 mlog_errno(status);
 339                 goto leave;
 340         }
 341
 342         /* there are two cases which could cause us to EAGAIN in the
 343          * we-need-more-metadata case:
 344          * 1) we haven't reserved *any*
 345          * 2) we are so fragmented, we've needed to add metadata too
 346          *    many times. */
 347         if (!free_extents && !meta_ac) {
 348                 mlog(0, "we haven't reserved any metadata!\n");
 349                 status = -EAGAIN;
 350                 reason = RESTART_META;
 351                 goto leave;
 352         } else if ((!free_extents)
 353                    && (ocfs2_alloc_context_bits_left(meta_ac)
 354                        < ocfs2_extend_meta_needed(fe))) {
 355                 mlog(0, "filesystem is really fragmented...\n");
 356                 status = -EAGAIN;
 357                 reason = RESTART_META;
 358                 goto leave;
 359         }
 360
 361         status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
 362                                       &bit_off, &num_bits);
 363         if (status < 0) {
 364                 if (status != -ENOSPC)
 365                         mlog_errno(status);
 366                 goto leave;
 367         }
 368
 369         BUG_ON(num_bits > clusters_to_add);
 370
 371         /* reserve our write early -- insert_extent may update the inode */
 372         status = ocfs2_journal_access(handle, inode, fe_bh,
 373                                       OCFS2_JOURNAL_ACCESS_WRITE);
 374         if (status < 0) {
 375                 mlog_errno(status);
 376                 goto leave;
 377         }
 378
 379         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 380         mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
 381              num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
 382         status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
 383                                      num_bits, meta_ac);
 384         if (status < 0) {
 385                 mlog_errno(status);
 386                 goto leave;
 387         }
 388
 389         le32_add_cpu(&fe->i_clusters, num_bits);
 390         spin_lock(&OCFS2_I(inode)->ip_lock);
 391         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 392         spin_unlock(&OCFS2_I(inode)->ip_lock);
 393
 394         status = ocfs2_journal_dirty(handle, fe_bh);
 395         if (status < 0) {
 396                 mlog_errno(status);
 397                 goto leave;
 398         }
 399
 400         clusters_to_add -= num_bits;
 401
 402         if (clusters_to_add) {
 403                 mlog(0, "need to alloc once more, clusters = %u, wanted = "
 404                      "%u\n", fe->i_clusters, clusters_to_add);
 405                 status = -EAGAIN;
 406                 reason = RESTART_TRANS;
 407         }
 408
 409 leave:
 410         mlog_exit(status);
 411         if (reason_ret)
 412                 *reason_ret = reason;
 413         return status;
 414 }
 415
 416 static int ocfs2_extend_allocation(struct inode *inode,
 417                                    u32 clusters_to_add)
 418 {
 419         int status = 0;
 420         int restart_func = 0;
 421         int drop_alloc_sem = 0;
 422         int credits, num_free_extents;
 423         u32 prev_clusters;
 424         struct buffer_head *bh = NULL;
 425         struct ocfs2_dinode *fe = NULL;
 426         struct ocfs2_journal_handle *handle = NULL;
 427         struct ocfs2_alloc_context *data_ac = NULL;
 428         struct ocfs2_alloc_context *meta_ac = NULL;
 429         enum ocfs2_alloc_restarted why;
 430         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 431
 432         mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 433
 434         status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 435                                   OCFS2_BH_CACHED, inode);
 436         if (status < 0) {
 437                 mlog_errno(status);
 438                 goto leave;
 439         }
 440
 441         fe = (struct ocfs2_dinode *) bh->b_data;
 442         if (!OCFS2_IS_VALID_DINODE(fe)) {
 443                 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
 444                 status = -EIO;
 445                 goto leave;
 446         }
 447
 448 restart_all:
 449         BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 450
 451         mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
 452              "clusters_to_add = %u\n",
 453              OCFS2_I(inode)->ip_blkno, i_size_read(inode),
 454              fe->i_clusters, clusters_to_add);
 455
 456         handle = ocfs2_alloc_handle(osb);
 457         if (handle == NULL) {
 458                 status = -ENOMEM;
 459                 mlog_errno(status);
 460                 goto leave;
 461         }
 462
 463         num_free_extents = ocfs2_num_free_extents(osb,
 464                                                   inode,
 465                                                   fe);
 466         if (num_free_extents < 0) {
 467                 status = num_free_extents;
 468                 mlog_errno(status);
 469                 goto leave;
 470         }
 471
 472         if (!num_free_extents) {
 473                 status = ocfs2_reserve_new_metadata(osb,
 474                                                     handle,
 475                                                     fe,
 476                                                     &meta_ac);
 477                 if (status < 0) {
 478                         if (status != -ENOSPC)
 479                                 mlog_errno(status);
 480                         goto leave;
 481                 }
 482         }
 483
 484         status = ocfs2_reserve_clusters(osb,
 485                                         handle,
 486                                         clusters_to_add,
 487                                         &data_ac);
 488         if (status < 0) {
 489                 if (status != -ENOSPC)
 490                         mlog_errno(status);
 491                 goto leave;
 492         }
 493
 494         /* blocks peope in read/write from reading our allocation
 495          * until we're done changing it. We depend on i_mutex to block
 496          * other extend/truncate calls while we're here. Ordering wrt
 497          * start_trans is important here -- always do it before! */
 498         down_write(&OCFS2_I(inode)->ip_alloc_sem);
 499         drop_alloc_sem = 1;
 500
 501         credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 502         handle = ocfs2_start_trans(osb, handle, credits);
 503         if (IS_ERR(handle)) {
 504                 status = PTR_ERR(handle);
 505                 handle = NULL;
 506                 mlog_errno(status);
 507                 goto leave;
 508         }
 509
 510 restarted_transaction:
 511         /* reserve a write to the file entry early on - that we if we
 512          * run out of credits in the allocation path, we can still
 513          * update i_size. */
 514         status = ocfs2_journal_access(handle, inode, bh,
 515                                       OCFS2_JOURNAL_ACCESS_WRITE);
 516         if (status < 0) {
 517                 mlog_errno(status);
 518                 goto leave;
 519         }
 520
 521         prev_clusters = OCFS2_I(inode)->ip_clusters;
 522
 523         status = ocfs2_do_extend_allocation(osb,
 524                                             inode,
 525                                             clusters_to_add,
 526                                             bh,
 527                                             handle,
 528                                             data_ac,
 529                                             meta_ac,
 530                                             &why);
 531         if ((status < 0) && (status != -EAGAIN)) {
 532                 if (status != -ENOSPC)
 533                         mlog_errno(status);
 534                 goto leave;
 535         }
 536
 537         status = ocfs2_journal_dirty(handle, bh);
 538         if (status < 0) {
 539                 mlog_errno(status);
 540                 goto leave;
 541         }
 542
 543         spin_lock(&OCFS2_I(inode)->ip_lock);
 544         clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 545         spin_unlock(&OCFS2_I(inode)->ip_lock);
 546
 547         if (why != RESTART_NONE && clusters_to_add) {
 548                 if (why == RESTART_META) {
 549                         mlog(0, "restarting function.\n");
 550                         restart_func = 1;
 551                 } else {
 552                         BUG_ON(why != RESTART_TRANS);
 553
 554                         mlog(0, "restarting transaction.\n");
 555                         /* TODO: This can be more intelligent. */
 556                         credits = ocfs2_calc_extend_credits(osb->sb,
 557                                                             fe,
 558                                                             clusters_to_add);
 559                         status = ocfs2_extend_trans(handle, credits);
 560                         if (status < 0) {
 561                                 /* handle still has to be committed at
 562                                  * this point. */
 563                                 status = -ENOMEM;
 564                                 mlog_errno(status);
 565                                 goto leave;
 566                         }
 567                         goto restarted_transaction;
 568                 }
 569         }
 570
 571         mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
 572              fe->i_clusters, fe->i_size);
 573         mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
 574              OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 575
 576 leave:
 577         if (drop_alloc_sem) {
 578                 up_write(&OCFS2_I(inode)->ip_alloc_sem);
 579                 drop_alloc_sem = 0;
 580         }
 581         if (handle) {
 582                 ocfs2_commit_trans(handle);
 583                 handle = NULL;
 584         }
 585         if (data_ac) {
 586                 ocfs2_free_alloc_context(data_ac);
 587                 data_ac = NULL;
 588         }
 589         if (meta_ac) {
 590                 ocfs2_free_alloc_context(meta_ac);
 591                 meta_ac = NULL;
 592         }
 593         if ((!status) && restart_func) {
 594                 restart_func = 0;
 595                 goto restart_all;
 596         }
 597         if (bh) {
 598                 brelse(bh);
 599                 bh = NULL;
 600         }
 601
 602         mlog_exit(status);
 603         return status;
 604 }
 605
 606 /* Some parts of this taken from generic_cont_expand, which turned out
 607  * to be too fragile to do exactly what we need without us having to
 608  * worry about recursive locking in ->commit_write(). */
 609 static int ocfs2_write_zero_page(struct inode *inode,
 610                                  u64 size)
 611 {
 612         struct address_space *mapping = inode->i_mapping;
 613         struct page *page;
 614         unsigned long index;
 615         unsigned int offset;
 616         struct ocfs2_journal_handle *handle = NULL;
 617         int ret;
 618
 619         offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
 620         /* ugh.  in prepare/commit_write, if from==to==start of block, we
 621         ** skip the prepare.  make sure we never send an offset for the start
 622         ** of a block
 623         */
 624         if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
 625                 offset++;
 626         }
 627         index = size >> PAGE_CACHE_SHIFT;
 628
 629         page = grab_cache_page(mapping, index);
 630         if (!page) {
 631                 ret = -ENOMEM;
 632                 mlog_errno(ret);
 633                 goto out;
 634         }
 635
 636         ret = ocfs2_prepare_write(NULL, page, offset, offset);
 637         if (ret < 0) {
 638                 mlog_errno(ret);
 639                 goto out_unlock;
 640         }
 641
 642         if (ocfs2_should_order_data(inode)) {
 643                 handle = ocfs2_start_walk_page_trans(inode, page, offset,
 644                                                      offset);
 645                 if (IS_ERR(handle)) {
 646                         ret = PTR_ERR(handle);
 647                         handle = NULL;
 648                         goto out_unlock;
 649                 }
 650         }
 651
 652         /* must not update i_size! */
 653         ret = block_commit_write(page, offset, offset);
 654         if (ret < 0)
 655                 mlog_errno(ret);
 656         else
 657                 ret = 0;
 658
 659         if (handle)
 660                 ocfs2_commit_trans(handle);
 661 out_unlock:
 662         unlock_page(page);
 663         page_cache_release(page);
 664 out:
 665         return ret;
 666 }
 667
 668 static int ocfs2_zero_extend(struct inode *inode,
 669                              u64 zero_to_size)
 670 {
 671         int ret = 0;
 672         u64 start_off;
 673         struct super_block *sb = inode->i_sb;
 674
 675         start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 676         while (start_off < zero_to_size) {
 677                 ret = ocfs2_write_zero_page(inode, start_off);
 678                 if (ret < 0) {
 679                         mlog_errno(ret);
 680                         goto out;
 681                 }
 682
 683                 start_off += sb->s_blocksize;
 684         }
 685
 686 out:
 687         return ret;
 688 }
 689
 690 static int ocfs2_extend_file(struct inode *inode,
 691                              struct buffer_head *di_bh,
 692                              u64 new_i_size)
 693 {
 694         int ret = 0;
 695         u32 clusters_to_add;
 696
 697         /* setattr sometimes calls us like this. */
 698         if (new_i_size == 0)
 699                 goto out;
 700
 701         if (i_size_read(inode) == new_i_size)
 702                 goto out;
 703         BUG_ON(new_i_size < i_size_read(inode));
 704
 705         clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
 706                 OCFS2_I(inode)->ip_clusters;
 707
 708         if (clusters_to_add) {
 709                 ret = ocfs2_extend_allocation(inode, clusters_to_add);
 710                 if (ret < 0) {
 711                         mlog_errno(ret);
 712                         goto out;
 713                 }
 714
 715                 ret = ocfs2_zero_extend(inode, new_i_size);
 716                 if (ret < 0) {
 717                         mlog_errno(ret);
 718                         goto out;
 719                 }
 720         }
 721
 722         /* No allocation required, we just use this helper to
 723          * do a trivial update of i_size. */
 724         ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
 725         if (ret < 0) {
 726                 mlog_errno(ret);
 727                 goto out;
 728         }
 729
 730 out:
 731         return ret;
 732 }
 733
 734 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 735 {
 736         int status = 0, size_change;
 737         struct inode *inode = dentry->d_inode;
 738         struct super_block *sb = inode->i_sb;
 739         struct ocfs2_super *osb = OCFS2_SB(sb);
 740         struct buffer_head *bh = NULL;
 741         struct ocfs2_journal_handle *handle = NULL;
 742
 743         mlog_entry("(0x%p, '%.*s')\n", dentry,
 744                    dentry->d_name.len, dentry->d_name.name);
 745
 746         if (attr->ia_valid & ATTR_MODE)
 747                 mlog(0, "mode change: %d\n", attr->ia_mode);
 748         if (attr->ia_valid & ATTR_UID)
 749                 mlog(0, "uid change: %d\n", attr->ia_uid);
 750         if (attr->ia_valid & ATTR_GID)
 751                 mlog(0, "gid change: %d\n", attr->ia_gid);
 752         if (attr->ia_valid & ATTR_SIZE)
 753                 mlog(0, "size change...\n");
 754         if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
 755                 mlog(0, "time change...\n");
 756
 757 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
 758                            | ATTR_GID | ATTR_UID | ATTR_MODE)
 759         if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
 760                 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
 761                 return 0;
 762         }
 763
 764         status = inode_change_ok(inode, attr);
 765         if (status)
 766                 return status;
 767
 768         size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
 769         if (size_change) {
 770                 status = ocfs2_rw_lock(inode, 1);
 771                 if (status < 0) {
 772                         mlog_errno(status);
 773                         goto bail;
 774                 }
 775         }
 776
 777         status = ocfs2_meta_lock(inode, NULL, &bh, 1);
 778         if (status < 0) {
 779                 if (status != -ENOENT)
 780                         mlog_errno(status);
 781                 goto bail_unlock_rw;
 782         }
 783
 784         if (size_change && attr->ia_size != i_size_read(inode)) {
 785                 if (i_size_read(inode) > attr->ia_size)
 786                         status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 787                 else
 788                         status = ocfs2_extend_file(inode, bh, attr->ia_size);
 789                 if (status < 0) {
 790                         if (status != -ENOSPC)
 791                                 mlog_errno(status);
 792                         status = -ENOSPC;
 793                         goto bail_unlock;
 794                 }
 795         }
 796
 797         handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
 798         if (IS_ERR(handle)) {
 799                 status = PTR_ERR(handle);
 800                 mlog_errno(status);
 801                 goto bail_unlock;
 802         }
 803
 804         status = inode_setattr(inode, attr);
 805         if (status < 0) {
 806                 mlog_errno(status);
 807                 goto bail_commit;
 808         }
 809
 810         status = ocfs2_mark_inode_dirty(handle, inode, bh);
 811         if (status < 0)
 812                 mlog_errno(status);
 813
 814 bail_commit:
 815         ocfs2_commit_trans(handle);
 816 bail_unlock:
 817         ocfs2_meta_unlock(inode, 1);
 818 bail_unlock_rw:
 819         if (size_change)
 820                 ocfs2_rw_unlock(inode, 1);
 821 bail:
 822         if (bh)
 823                 brelse(bh);
 824
 825         mlog_exit(status);
 826         return status;
 827 }
 828
 829 int ocfs2_getattr(struct vfsmount *mnt,
 830                   struct dentry *dentry,
 831                   struct kstat *stat)
 832 {
 833         struct inode *inode = dentry->d_inode;
 834         struct super_block *sb = dentry->d_inode->i_sb;
 835         struct ocfs2_super *osb = sb->s_fs_info;
 836         int err;
 837
 838         mlog_entry_void();
 839
 840         err = ocfs2_inode_revalidate(dentry);
 841         if (err) {
 842                 if (err != -ENOENT)
 843                         mlog_errno(err);
 844                 goto bail;
 845         }
 846
 847         generic_fillattr(inode, stat);
 848
 849         /* We set the blksize from the cluster size for performance */
 850         stat->blksize = osb->s_clustersize;
 851
 852 bail:
 853         mlog_exit(err);
 854
 855         return err;
 856 }
 857
 858 static int ocfs2_write_remove_suid(struct inode *inode)
 859 {
 860         int ret;
 861         struct buffer_head *bh = NULL;
 862         struct ocfs2_inode_info *oi = OCFS2_I(inode);
 863         struct ocfs2_journal_handle *handle;
 864         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 865         struct ocfs2_dinode *di;
 866
 867         mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
 868                    inode->i_mode);
 869
 870         handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
 871         if (handle == NULL) {
 872                 ret = -ENOMEM;
 873                 mlog_errno(ret);
 874                 goto out;
 875         }
 876
 877         ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
 878         if (ret < 0) {
 879                 mlog_errno(ret);
 880                 goto out_trans;
 881         }
 882
 883         ret = ocfs2_journal_access(handle, inode, bh,
 884                                    OCFS2_JOURNAL_ACCESS_WRITE);
 885         if (ret < 0) {
 886                 mlog_errno(ret);
 887                 goto out_bh;
 888         }
 889
 890         inode->i_mode &= ~S_ISUID;
 891         if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
 892                 inode->i_mode &= ~S_ISGID;
 893
 894         di = (struct ocfs2_dinode *) bh->b_data;
 895         di->i_mode = cpu_to_le16(inode->i_mode);
 896
 897         ret = ocfs2_journal_dirty(handle, bh);
 898         if (ret < 0)
 899                 mlog_errno(ret);
 900 out_bh:
 901         brelse(bh);
 902 out_trans:
 903         ocfs2_commit_trans(handle);
 904 out:
 905         mlog_exit(ret);
 906         return ret;
 907 }
 908
 909 static inline int ocfs2_write_should_remove_suid(struct inode *inode)
 910 {
 911         mode_t mode = inode->i_mode;
 912
 913         if (!capable(CAP_FSETID)) {
 914                 if (unlikely(mode & S_ISUID))
 915                         return 1;
 916
 917                 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
 918                         return 1;
 919         }
 920         return 0;
 921 }
 922
     /*
      * ocfs2_file_aio_write() - ->aio_write entry point (see ocfs2_fops).
      *
      * Wraps generic_file_aio_write_nolock() in the locking it needs here,
      * taken in this order: i_mutex, i_alloc_sem for read (O_DIRECT only),
      * the cluster rw lock (level 0 for O_DIRECT, 1 otherwise), and the
      * cluster meta lock while i_size is sampled, suid/sgid is cleared and
      * any allocation or zeroing needed for an extending write is done.
      *
      * The write position is taken from iocb->ki_pos, or from i_size for
      * O_APPEND; the @pos argument is not used.
      *
      * Returns 0 for a zero-length write, -EIO if the file has no inode, a
      * negative error from locking, suid clearing, allocation or zeroing,
      * or otherwise whatever generic_file_aio_write_nolock() returns.
      */
 923 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 924                                     const char __user *buf,
 925                                     size_t count,
 926                                     loff_t pos)
 927 {
 928         struct iovec local_iov = { .iov_base = (void __user *)buf,
 929                                    .iov_len = count };
 930         int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
 931         u32 clusters;
 932         struct file *filp = iocb->ki_filp;
 933         struct inode *inode = filp->f_dentry->d_inode;
 934         loff_t newsize, saved_pos;
 935 #ifdef OCFS2_ORACORE_WORKAROUNDS
             /* NOTE(review): dereferences inode before the !inode check below */
 936         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 937 #endif
 938
 939         mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 940                    (unsigned int)count,
 941                    filp->f_dentry->d_name.len,
 942                    filp->f_dentry->d_name.name);
 943
 944         /* happy write of zero bytes */
             /* NOTE(review): this return and the next one skip mlog_exit() */
 945         if (count == 0)
 946                 return 0;
 947
 948         if (!inode) {
 949                 mlog(0, "bad inode\n");
 950                 return -EIO;
 951         }
 952
 953 #ifdef OCFS2_ORACORE_WORKAROUNDS
 954         /* ugh, work around some applications which open everything O_DIRECT +
 955          * O_APPEND and really don't mean to use O_DIRECT. */
 956         if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
 957             (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
 958                 filp->f_flags &= ~O_DIRECT;
 959 #endif
 960
 961         mutex_lock(&inode->i_mutex);
 962         /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
 963         if (filp->f_flags & O_DIRECT) {
 964                 have_alloc_sem = 1;
 965                 down_read(&inode->i_alloc_sem);
 966         }
 967
 968         /* concurrent O_DIRECT writes are allowed */
 969         rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
 970         ret = ocfs2_rw_lock(inode, rw_level);
 971         if (ret < 0) {
                     /* lock was not taken: keep the out: path from dropping it */
 972                 rw_level = -1;
 973                 mlog_errno(ret);
 974                 goto out;
 975         }
 976
 977         /*
 978          * We sample i_size under a read level meta lock to see if our write
 979          * is extending the file, if it is we back off and get a write level
 980          * meta lock.
 981          */
 982         meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
 983         for(;;) {
 984                 ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
 985                 if (ret < 0) {
                             /* lock was not taken: nothing for out: to unlock */
 986                         meta_level = -1;
 987                         mlog_errno(ret);
 988                         goto out;
 989                 }
 990
 991                 /* Clear suid / sgid if necessary. We do this here
 992                  * instead of later in the write path because
 993                  * remove_suid() calls ->setattr without any hint that
 994                  * we may have already done our cluster locking. Since
 995                  * ocfs2_setattr() *must* take cluster locks to
 996                  * proceed, this will lead us to recursively lock the
 997                  * inode. There's also the dinode i_size state which
 998                  * can be lost via setattr during extending writes (we
 999                  * set inode->i_size at the end of a write). */
1000                 if (ocfs2_write_should_remove_suid(inode)) {
                             /* changing the mode needs the level 1 meta lock:
                              * drop the level 0 lock and retry the loop */
1001                         if (meta_level == 0) {
1002                                 ocfs2_meta_unlock(inode, meta_level);
1003                                 meta_level = 1;
1004                                 continue;
1005                         }
1006
1007                         ret = ocfs2_write_remove_suid(inode);
1008                         if (ret < 0) {
1009                                 mlog_errno(ret);
1010                                 goto out;
1011                         }
1012                 }
1013
1014                 /* work on a copy of ppos until we're sure that we won't have
1015                  * to recalculate it due to relocking. */
1016                 if (filp->f_flags & O_APPEND) {
1017                         saved_pos = i_size_read(inode);
1018                         mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1019                 } else {
1020                         saved_pos = iocb->ki_pos;
1021                 }
1022                 newsize = count + saved_pos;
1023
1024                 mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
1025                      saved_pos, newsize, i_size_read(inode));
1026
1027                 /* No need for a higher level metadata lock if we're
1028                  * never going past i_size. */
1029                 if (newsize <= i_size_read(inode))
1030                         break;
1031
                     /* extending write: same back-off to the level 1 meta lock */
1032                 if (meta_level == 0) {
1033                         ocfs2_meta_unlock(inode, meta_level);
1034                         meta_level = 1;
1035                         continue;
1036                 }
1037
                     /* clusters needed for newsize, less those in ip_clusters */
1038                 spin_lock(&OCFS2_I(inode)->ip_lock);
1039                 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1040                         OCFS2_I(inode)->ip_clusters;
1041                 spin_unlock(&OCFS2_I(inode)->ip_lock);
1042
1043                 mlog(0, "Writing at EOF, may need more allocation: "
1044                      "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
1045                      i_size_read(inode), newsize, clusters);
1046
1047                 /* We only want to continue the rest of this loop if
1048                  * our extend will actually require more
1049                  * allocation. */
1050                 if (!clusters)
1051                         break;
1052
1053                 ret = ocfs2_extend_allocation(inode, clusters);
1054                 if (ret < 0) {
1055                         if (ret != -ENOSPC)
1056                                 mlog_errno(ret);
1057                         goto out;
1058                 }
1059
1060                 /* Fill any holes which would've been created by this
1061                  * write. If we're O_APPEND, this will wind up
1062                  * (correctly) being a noop. */
1063                 ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1064                 if (ret < 0) {
1065                         mlog_errno(ret);
1066                         goto out;
1067                 }
1068                 break;
1069         }
1070
1071         /* ok, we're done with i_size and alloc work */
1072         iocb->ki_pos = saved_pos;
1073         ocfs2_meta_unlock(inode, meta_level);
1074         meta_level = -1;
1075
1076         /* communicate with ocfs2_dio_end_io */
1077         ocfs2_iocb_set_rw_locked(iocb);
1078
1079 #ifdef OCFS2_ORACORE_WORKAROUNDS
             /* COMPAT_OCFS + O_DIRECT: a request that is not sector aligned is
              * issued as O_SYNC buffered I/O instead; f_flags is put back to
              * its saved value after the write call. */
1080         if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
1081             filp->f_flags & O_DIRECT) {
1082                 unsigned int saved_flags = filp->f_flags;
1083                 int sector_size = 1 << osb->s_sectsize_bits;
1084
1085                 if ((saved_pos & (sector_size - 1)) ||
1086                     (count & (sector_size - 1)) ||
1087                     ((unsigned long)buf & (sector_size - 1))) {
1088                         filp->f_flags |= O_SYNC;
1089                         filp->f_flags &= ~O_DIRECT;
1090                 }
1091
1092                 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1093                                                     &iocb->ki_pos);
1094
1095                 filp->f_flags = saved_flags;
1096         } else
1097 #endif
1098                 ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
1099                                                     &iocb->ki_pos);
1100
1101         /* buffered aio wouldn't have proper lock coverage today */
1102         BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1103
1104         /*
1105          * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
1106          * function pointer which is called when o_direct io completes so that
1107          * it can unlock our rw lock.  (it's the clustered equivalent of
1108          * i_alloc_sem; protects truncate from racing with pending ios).
1109          * Unfortunately there are error cases which call end_io and others
1110          * that don't.  so we don't have to unlock the rw_lock if either an
1111          * async dio is going to do it in the future or an end_io after an
1112          * error has already done it.
1113          */
1114         if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1115                 rw_level = -1;
1116                 have_alloc_sem = 0;
1117         }
1118
1119 out:
             /* only release what the state variables still record as held */
1120         if (meta_level != -1)
1121                 ocfs2_meta_unlock(inode, meta_level);
1122         if (have_alloc_sem)
1123                 up_read(&inode->i_alloc_sem);
1124         if (rw_level != -1)
1125                 ocfs2_rw_unlock(inode, rw_level);
1126         mutex_unlock(&inode->i_mutex);
1127
1128         mlog_exit(ret);
1129         return ret;
1130 }
1131
     /*
      * ocfs2_file_aio_read() - ->aio_read entry point (see ocfs2_fops).
      *
      * Buffered reads go straight to generic_file_aio_read().  O_DIRECT
      * reads first take i_alloc_sem for read and the cluster rw lock at
      * level 0, and flag the iocb as rw-locked for ocfs2_dio_end_io.
      *
      * The read starts at iocb->ki_pos; @pos is consulted only by the
      * OCFS2_ORACORE_WORKAROUNDS alignment check.
      *
      * Returns -EINVAL if the file has no inode, a negative error from
      * ocfs2_rw_lock(), or otherwise the result of generic_file_aio_read().
      */
1132 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1133                                    char __user *buf,
1134                                    size_t count,
1135                                    loff_t pos)
1136 {
1137         int ret = 0, rw_level = -1, have_alloc_sem = 0;
1138         struct file *filp = iocb->ki_filp;
1139         struct inode *inode = filp->f_dentry->d_inode;
1140 #ifdef OCFS2_ORACORE_WORKAROUNDS
             /* NOTE(review): dereferences inode before the !inode check below */
1141         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1142 #endif
1143
1144         mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
1145                    (unsigned int)count,
1146                    filp->f_dentry->d_name.len,
1147                    filp->f_dentry->d_name.name);
1148
1149         if (!inode) {
1150                 ret = -EINVAL;
1151                 mlog_errno(ret);
1152                 goto bail;
1153         }
1154
1155 #ifdef OCFS2_ORACORE_WORKAROUNDS
             /* COMPAT_OCFS mounts: drop O_DIRECT when the offset, length,
              * buffer address or i_size is not sector aligned.  The flag is
              * cleared on the struct file and is not restored in this
              * function. */
1156         if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
1157                 if (filp->f_flags & O_DIRECT) {
1158                         int sector_size = 1 << osb->s_sectsize_bits;
1159
1160                         if ((pos & (sector_size - 1)) ||
1161                             (count & (sector_size - 1)) ||
1162                             ((unsigned long)buf & (sector_size - 1)) ||
1163                             (i_size_read(inode) & (sector_size -1))) {
1164                                 filp->f_flags &= ~O_DIRECT;
1165                         }
1166                 }
1167         }
1168 #endif
1169
1170         /*
1171          * buffered reads protect themselves in ->readpage().  O_DIRECT reads
1172          * need locks to protect pending reads from racing with truncate.
1173          */
1174         if (filp->f_flags & O_DIRECT) {
1175                 down_read(&inode->i_alloc_sem);
1176                 have_alloc_sem = 1;
1177
1178                 ret = ocfs2_rw_lock(inode, 0);
1179                 if (ret < 0) {
                             /* rw_level is still -1, so bail: only drops
                              * i_alloc_sem */
1180                         mlog_errno(ret);
1181                         goto bail;
1182                 }
1183                 rw_level = 0;
1184                 /* communicate with ocfs2_dio_end_io */
1185                 ocfs2_iocb_set_rw_locked(iocb);
1186         }
1187
1188         ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
1189         if (ret == -EINVAL)
1190                 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1191
1192         /* buffered aio wouldn't have proper lock coverage today */
1193         BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1194
1195         /* see ocfs2_file_aio_write */
1196         if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1197                 rw_level = -1;
1198                 have_alloc_sem = 0;
1199         }
1200
1201 bail:
             /* only release what the state variables still record as held */
1202         if (have_alloc_sem)
1203                 up_read(&inode->i_alloc_sem);
1204         if (rw_level != -1)
1205                 ocfs2_rw_unlock(inode, rw_level);
1206         mlog_exit(ret);
1207
1208         return ret;
1209 }
1210
1211 struct inode_operations ocfs2_file_iops = {
1212         .setattr        = ocfs2_setattr,
1213         .getattr        = ocfs2_getattr,
1214 };
1215
1216 struct inode_operations ocfs2_special_file_iops = {
1217         .setattr        = ocfs2_setattr,
1218         .getattr        = ocfs2_getattr,
1219 };
1220
1221 struct file_operations ocfs2_fops = {
1222         .read           = do_sync_read,
1223         .write          = do_sync_write,
1224         .sendfile       = generic_file_sendfile,
1225         .mmap           = ocfs2_mmap,
1226         .fsync          = ocfs2_sync_file,
1227         .release        = ocfs2_file_release,
1228         .open           = ocfs2_file_open,
1229         .aio_read       = ocfs2_file_aio_read,
1230         .aio_write      = ocfs2_file_aio_write,
1231 };
1232
1233 struct file_operations ocfs2_dops = {
1234         .read           = generic_read_dir,
1235         .readdir        = ocfs2_readdir,
1236         .fsync          = ocfs2_sync_file,
1237 };