writeback: remove pages_skipped accounting in __block_write_full_page()
[pandora-kernel.git] / fs / xfs / xfs_vnodeops.c
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_types.h"
22 #include "xfs_bit.h"
23 #include "xfs_log.h"
24 #include "xfs_inum.h"
25 #include "xfs_trans.h"
26 #include "xfs_sb.h"
27 #include "xfs_ag.h"
28 #include "xfs_dir2.h"
29 #include "xfs_dmapi.h"
30 #include "xfs_mount.h"
31 #include "xfs_da_btree.h"
32 #include "xfs_bmap_btree.h"
33 #include "xfs_alloc_btree.h"
34 #include "xfs_ialloc_btree.h"
35 #include "xfs_dir2_sf.h"
36 #include "xfs_attr_sf.h"
37 #include "xfs_dinode.h"
38 #include "xfs_inode.h"
39 #include "xfs_inode_item.h"
40 #include "xfs_itable.h"
41 #include "xfs_btree.h"
42 #include "xfs_ialloc.h"
43 #include "xfs_alloc.h"
44 #include "xfs_bmap.h"
45 #include "xfs_attr.h"
46 #include "xfs_rw.h"
47 #include "xfs_error.h"
48 #include "xfs_quota.h"
49 #include "xfs_utils.h"
50 #include "xfs_rtalloc.h"
51 #include "xfs_refcache.h"
52 #include "xfs_trans_space.h"
53 #include "xfs_log_priv.h"
54 #include "xfs_filestream.h"
55
56 STATIC int
57 xfs_open(
58         bhv_desc_t      *bdp,
59         cred_t          *credp)
60 {
61         int             mode;
62         bhv_vnode_t     *vp = BHV_TO_VNODE(bdp);
63         xfs_inode_t     *ip = XFS_BHVTOI(bdp);
64
65         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
66                 return XFS_ERROR(EIO);
67
68         /*
69          * If it's a directory with any blocks, read-ahead block 0
70          * as we're almost certain to have the next operation be a read there.
71          */
72         if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
73                 mode = xfs_ilock_map_shared(ip);
74                 if (ip->i_d.di_nextents > 0)
75                         (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
76                 xfs_iunlock(ip, mode);
77         }
78         return 0;
79 }
80
81 /*
82  * xfs_getattr
83  */
84 STATIC int
85 xfs_getattr(
86         bhv_desc_t      *bdp,
87         bhv_vattr_t     *vap,
88         int             flags,
89         cred_t          *credp)
90 {
91         xfs_inode_t     *ip;
92         xfs_mount_t     *mp;
93         bhv_vnode_t     *vp;
94
95         vp  = BHV_TO_VNODE(bdp);
96         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
97
98         ip = XFS_BHVTOI(bdp);
99         mp = ip->i_mount;
100
101         if (XFS_FORCED_SHUTDOWN(mp))
102                 return XFS_ERROR(EIO);
103
104         if (!(flags & ATTR_LAZY))
105                 xfs_ilock(ip, XFS_ILOCK_SHARED);
106
107         vap->va_size = XFS_ISIZE(ip);
108         if (vap->va_mask == XFS_AT_SIZE)
109                 goto all_done;
110
111         vap->va_nblocks =
112                 XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
113         vap->va_nodeid = ip->i_ino;
114 #if XFS_BIG_INUMS
115         vap->va_nodeid += mp->m_inoadd;
116 #endif
117         vap->va_nlink = ip->i_d.di_nlink;
118
119         /*
120          * Quick exit for non-stat callers
121          */
122         if ((vap->va_mask &
123             ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID|
124               XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0)
125                 goto all_done;
126
127         /*
128          * Copy from in-core inode.
129          */
130         vap->va_mode = ip->i_d.di_mode;
131         vap->va_uid = ip->i_d.di_uid;
132         vap->va_gid = ip->i_d.di_gid;
133         vap->va_projid = ip->i_d.di_projid;
134
135         /*
136          * Check vnode type block/char vs. everything else.
137          */
138         switch (ip->i_d.di_mode & S_IFMT) {
139         case S_IFBLK:
140         case S_IFCHR:
141                 vap->va_rdev = ip->i_df.if_u2.if_rdev;
142                 vap->va_blocksize = BLKDEV_IOSIZE;
143                 break;
144         default:
145                 vap->va_rdev = 0;
146
147                 if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
148                         vap->va_blocksize = xfs_preferred_iosize(mp);
149                 } else {
150
151                         /*
152                          * If the file blocks are being allocated from a
153                          * realtime partition, then return the inode's
154                          * realtime extent size or the realtime volume's
155                          * extent size.
156                          */
157                         vap->va_blocksize =
158                                 xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
159                 }
160                 break;
161         }
162
163         vn_atime_to_timespec(vp, &vap->va_atime);
164         vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
165         vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
166         vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
167         vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
168
169         /*
170          * Exit for stat callers.  See if any of the rest of the fields
171          * to be filled in are needed.
172          */
173         if ((vap->va_mask &
174              (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
175               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
176                 goto all_done;
177
178         /*
179          * Convert di_flags to xflags.
180          */
181         vap->va_xflags = xfs_ip2xflags(ip);
182
183         /*
184          * Exit for inode revalidate.  See if any of the rest of
185          * the fields to be filled in are needed.
186          */
187         if ((vap->va_mask &
188              (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|
189               XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0)
190                 goto all_done;
191
192         vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog;
193         vap->va_nextents =
194                 (ip->i_df.if_flags & XFS_IFEXTENTS) ?
195                         ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) :
196                         ip->i_d.di_nextents;
197         if (ip->i_afp)
198                 vap->va_anextents =
199                         (ip->i_afp->if_flags & XFS_IFEXTENTS) ?
200                                 ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) :
201                                  ip->i_d.di_anextents;
202         else
203                 vap->va_anextents = 0;
204         vap->va_gen = ip->i_d.di_gen;
205
206  all_done:
207         if (!(flags & ATTR_LAZY))
208                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
209         return 0;
210 }
211
212
213 /*
214  * xfs_setattr
215  */
216 int
217 xfs_setattr(
218         bhv_desc_t              *bdp,
219         bhv_vattr_t             *vap,
220         int                     flags,
221         cred_t                  *credp)
222 {
223         xfs_inode_t             *ip;
224         xfs_trans_t             *tp;
225         xfs_mount_t             *mp;
226         int                     mask;
227         int                     code;
228         uint                    lock_flags;
229         uint                    commit_flags=0;
230         uid_t                   uid=0, iuid=0;
231         gid_t                   gid=0, igid=0;
232         int                     timeflags = 0;
233         bhv_vnode_t             *vp;
234         xfs_prid_t              projid=0, iprojid=0;
235         int                     mandlock_before, mandlock_after;
236         struct xfs_dquot        *udqp, *gdqp, *olddquot1, *olddquot2;
237         int                     file_owner;
238         int                     need_iolock = 1;
239
240         vp = BHV_TO_VNODE(bdp);
241         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
242
243         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
244                 return XFS_ERROR(EROFS);
245
246         /*
247          * Cannot set certain attributes.
248          */
249         mask = vap->va_mask;
250         if (mask & XFS_AT_NOSET) {
251                 return XFS_ERROR(EINVAL);
252         }
253
254         ip = XFS_BHVTOI(bdp);
255         mp = ip->i_mount;
256
257         if (XFS_FORCED_SHUTDOWN(mp))
258                 return XFS_ERROR(EIO);
259
260         /*
261          * Timestamps do not need to be logged and hence do not
262          * need to be done within a transaction.
263          */
264         if (mask & XFS_AT_UPDTIMES) {
265                 ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
266                 timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
267                             ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
268                             ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
269                 xfs_ichgtime(ip, timeflags);
270                 return 0;
271         }
272
273         olddquot1 = olddquot2 = NULL;
274         udqp = gdqp = NULL;
275
276         /*
277          * If disk quotas is on, we make sure that the dquots do exist on disk,
278          * before we start any other transactions. Trying to do this later
279          * is messy. We don't care to take a readlock to look at the ids
280          * in inode here, because we can't hold it across the trans_reserve.
281          * If the IDs do change before we take the ilock, we're covered
282          * because the i_*dquot fields will get updated anyway.
283          */
284         if (XFS_IS_QUOTA_ON(mp) &&
285             (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
286                 uint    qflags = 0;
287
288                 if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
289                         uid = vap->va_uid;
290                         qflags |= XFS_QMOPT_UQUOTA;
291                 } else {
292                         uid = ip->i_d.di_uid;
293                 }
294                 if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
295                         gid = vap->va_gid;
296                         qflags |= XFS_QMOPT_GQUOTA;
297                 }  else {
298                         gid = ip->i_d.di_gid;
299                 }
300                 if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
301                         projid = vap->va_projid;
302                         qflags |= XFS_QMOPT_PQUOTA;
303                 }  else {
304                         projid = ip->i_d.di_projid;
305                 }
306                 /*
307                  * We take a reference when we initialize udqp and gdqp,
308                  * so it is important that we never blindly double trip on
309                  * the same variable. See xfs_create() for an example.
310                  */
311                 ASSERT(udqp == NULL);
312                 ASSERT(gdqp == NULL);
313                 code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
314                                          &udqp, &gdqp);
315                 if (code)
316                         return code;
317         }
318
319         /*
320          * For the other attributes, we acquire the inode lock and
321          * first do an error checking pass.
322          */
323         tp = NULL;
324         lock_flags = XFS_ILOCK_EXCL;
325         if (flags & ATTR_NOLOCK)
326                 need_iolock = 0;
327         if (!(mask & XFS_AT_SIZE)) {
328                 if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
329                     (mp->m_flags & XFS_MOUNT_WSYNC)) {
330                         tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
331                         commit_flags = 0;
332                         if ((code = xfs_trans_reserve(tp, 0,
333                                                      XFS_ICHANGE_LOG_RES(mp), 0,
334                                                      0, 0))) {
335                                 lock_flags = 0;
336                                 goto error_return;
337                         }
338                 }
339         } else {
340                 if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) &&
341                     !(flags & ATTR_DMI)) {
342                         int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
343                         code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp,
344                                 vap->va_size, 0, dmflags, NULL);
345                         if (code) {
346                                 lock_flags = 0;
347                                 goto error_return;
348                         }
349                 }
350                 if (need_iolock)
351                         lock_flags |= XFS_IOLOCK_EXCL;
352         }
353
354         xfs_ilock(ip, lock_flags);
355
356         /* boolean: are we the file owner? */
357         file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
358
359         /*
360          * Change various properties of a file.
361          * Only the owner or users with CAP_FOWNER
362          * capability may do these things.
363          */
364         if (mask &
365             (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
366              XFS_AT_GID|XFS_AT_PROJID)) {
367                 /*
368                  * CAP_FOWNER overrides the following restrictions:
369                  *
370                  * The user ID of the calling process must be equal
371                  * to the file owner ID, except in cases where the
372                  * CAP_FSETID capability is applicable.
373                  */
374                 if (!file_owner && !capable(CAP_FOWNER)) {
375                         code = XFS_ERROR(EPERM);
376                         goto error_return;
377                 }
378
379                 /*
380                  * CAP_FSETID overrides the following restrictions:
381                  *
382                  * The effective user ID of the calling process shall match
383                  * the file owner when setting the set-user-ID and
384                  * set-group-ID bits on that file.
385                  *
386                  * The effective group ID or one of the supplementary group
387                  * IDs of the calling process shall match the group owner of
388                  * the file when setting the set-group-ID bit on that file
389                  */
390                 if (mask & XFS_AT_MODE) {
391                         mode_t m = 0;
392
393                         if ((vap->va_mode & S_ISUID) && !file_owner)
394                                 m |= S_ISUID;
395                         if ((vap->va_mode & S_ISGID) &&
396                             !in_group_p((gid_t)ip->i_d.di_gid))
397                                 m |= S_ISGID;
398 #if 0
399                         /* Linux allows this, Irix doesn't. */
400                         if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
401                                 m |= S_ISVTX;
402 #endif
403                         if (m && !capable(CAP_FSETID))
404                                 vap->va_mode &= ~m;
405                 }
406         }
407
408         /*
409          * Change file ownership.  Must be the owner or privileged.
410          * If the system was configured with the "restricted_chown"
411          * option, the owner is not permitted to give away the file,
412          * and can change the group id only to a group of which he
413          * or she is a member.
414          */
415         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
416                 /*
417                  * These IDs could have changed since we last looked at them.
418                  * But, we're assured that if the ownership did change
419                  * while we didn't have the inode locked, inode's dquot(s)
420                  * would have changed also.
421                  */
422                 iuid = ip->i_d.di_uid;
423                 iprojid = ip->i_d.di_projid;
424                 igid = ip->i_d.di_gid;
425                 gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
426                 uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
427                 projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
428                          iprojid;
429
430                 /*
431                  * CAP_CHOWN overrides the following restrictions:
432                  *
433                  * If _POSIX_CHOWN_RESTRICTED is defined, this capability
434                  * shall override the restriction that a process cannot
435                  * change the user ID of a file it owns and the restriction
436                  * that the group ID supplied to the chown() function
437                  * shall be equal to either the group ID or one of the
438                  * supplementary group IDs of the calling process.
439                  */
440                 if (restricted_chown &&
441                     (iuid != uid || (igid != gid &&
442                                      !in_group_p((gid_t)gid))) &&
443                     !capable(CAP_CHOWN)) {
444                         code = XFS_ERROR(EPERM);
445                         goto error_return;
446                 }
447                 /*
448                  * Do a quota reservation only if uid/projid/gid is actually
449                  * going to change.
450                  */
451                 if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
452                     (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
453                     (XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
454                         ASSERT(tp);
455                         code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
456                                                 capable(CAP_FOWNER) ?
457                                                 XFS_QMOPT_FORCE_RES : 0);
458                         if (code)       /* out of quota */
459                                 goto error_return;
460                 }
461         }
462
463         /*
464          * Truncate file.  Must have write permission and not be a directory.
465          */
466         if (mask & XFS_AT_SIZE) {
467                 /* Short circuit the truncate case for zero length files */
468                 if ((vap->va_size == 0) &&
469                    (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
470                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
471                         lock_flags &= ~XFS_ILOCK_EXCL;
472                         if (mask & XFS_AT_CTIME)
473                                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
474                         code = 0;
475                         goto error_return;
476                 }
477
478                 if (VN_ISDIR(vp)) {
479                         code = XFS_ERROR(EISDIR);
480                         goto error_return;
481                 } else if (!VN_ISREG(vp)) {
482                         code = XFS_ERROR(EINVAL);
483                         goto error_return;
484                 }
485                 /*
486                  * Make sure that the dquots are attached to the inode.
487                  */
488                 if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
489                         goto error_return;
490         }
491
492         /*
493          * Change file access or modified times.
494          */
495         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
496                 if (!file_owner) {
497                         if ((flags & ATTR_UTIME) &&
498                             !capable(CAP_FOWNER)) {
499                                 code = XFS_ERROR(EPERM);
500                                 goto error_return;
501                         }
502                 }
503         }
504
505         /*
506          * Change extent size or realtime flag.
507          */
508         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
509                 /*
510                  * Can't change extent size if any extents are allocated.
511                  */
512                 if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
513                     ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
514                      vap->va_extsize) ) {
515                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
516                         goto error_return;
517                 }
518
519                 /*
520                  * Can't change realtime flag if any extents are allocated.
521                  */
522                 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
523                     (mask & XFS_AT_XFLAGS) &&
524                     (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) !=
525                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
526                         code = XFS_ERROR(EINVAL);       /* EFBIG? */
527                         goto error_return;
528                 }
529                 /*
530                  * Extent size must be a multiple of the appropriate block
531                  * size, if set at all.
532                  */
533                 if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
534                         xfs_extlen_t    size;
535
536                         if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ||
537                             ((mask & XFS_AT_XFLAGS) &&
538                             (vap->va_xflags & XFS_XFLAG_REALTIME))) {
539                                 size = mp->m_sb.sb_rextsize <<
540                                        mp->m_sb.sb_blocklog;
541                         } else {
542                                 size = mp->m_sb.sb_blocksize;
543                         }
544                         if (vap->va_extsize % size) {
545                                 code = XFS_ERROR(EINVAL);
546                                 goto error_return;
547                         }
548                 }
549                 /*
550                  * If realtime flag is set then must have realtime data.
551                  */
552                 if ((mask & XFS_AT_XFLAGS) &&
553                     (vap->va_xflags & XFS_XFLAG_REALTIME)) {
554                         if ((mp->m_sb.sb_rblocks == 0) ||
555                             (mp->m_sb.sb_rextsize == 0) ||
556                             (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
557                                 code = XFS_ERROR(EINVAL);
558                                 goto error_return;
559                         }
560                 }
561
562                 /*
563                  * Can't modify an immutable/append-only file unless
564                  * we have appropriate permission.
565                  */
566                 if ((mask & XFS_AT_XFLAGS) &&
567                     (ip->i_d.di_flags &
568                                 (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
569                      (vap->va_xflags &
570                                 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
571                     !capable(CAP_LINUX_IMMUTABLE)) {
572                         code = XFS_ERROR(EPERM);
573                         goto error_return;
574                 }
575         }
576
577         /*
578          * Now we can make the changes.  Before we join the inode
579          * to the transaction, if XFS_AT_SIZE is set then take care of
580          * the part of the truncation that must be done without the
581          * inode lock.  This needs to be done before joining the inode
582          * to the transaction, because the inode cannot be unlocked
583          * once it is a part of the transaction.
584          */
585         if (mask & XFS_AT_SIZE) {
586                 code = 0;
587                 if ((vap->va_size > ip->i_size) &&
588                     (flags & ATTR_NOSIZETOK) == 0) {
589                         code = xfs_igrow_start(ip, vap->va_size, credp);
590                 }
591                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
592
593                 /*
594                  * We are going to log the inode size change in this
595                  * transaction so any previous writes that are beyond the on
596                  * disk EOF and the new EOF that have not been written out need
597                  * to be written here. If we do not write the data out, we
598                  * expose ourselves to the null files problem.
599                  *
600                  * Only flush from the on disk size to the smaller of the in
601                  * memory file size or the new size as that's the range we
602                  * really care about here and prevents waiting for other data
603                  * not within the range we care about here.
604                  */
605                 if (!code &&
606                     (ip->i_size != ip->i_d.di_size) &&
607                     (vap->va_size > ip->i_d.di_size)) {
608                         code = bhv_vop_flush_pages(XFS_ITOV(ip),
609                                         ip->i_d.di_size, vap->va_size,
610                                         XFS_B_ASYNC, FI_NONE);
611                 }
612
613                 /* wait for all I/O to complete */
614                 vn_iowait(vp);
615
616                 if (!code)
617                         code = xfs_itruncate_data(ip, vap->va_size);
618                 if (code) {
619                         ASSERT(tp == NULL);
620                         lock_flags &= ~XFS_ILOCK_EXCL;
621                         ASSERT(lock_flags == XFS_IOLOCK_EXCL);
622                         goto error_return;
623                 }
624                 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
625                 if ((code = xfs_trans_reserve(tp, 0,
626                                              XFS_ITRUNCATE_LOG_RES(mp), 0,
627                                              XFS_TRANS_PERM_LOG_RES,
628                                              XFS_ITRUNCATE_LOG_COUNT))) {
629                         xfs_trans_cancel(tp, 0);
630                         if (need_iolock)
631                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
632                         return code;
633                 }
634                 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
635                 xfs_ilock(ip, XFS_ILOCK_EXCL);
636         }
637
638         if (tp) {
639                 xfs_trans_ijoin(tp, ip, lock_flags);
640                 xfs_trans_ihold(tp, ip);
641         }
642
643         /* determine whether mandatory locking mode changes */
644         mandlock_before = MANDLOCK(vp, ip->i_d.di_mode);
645
646         /*
647          * Truncate file.  Must have write permission and not be a directory.
648          */
649         if (mask & XFS_AT_SIZE) {
650                 if (vap->va_size > ip->i_size) {
651                         xfs_igrow_finish(tp, ip, vap->va_size,
652                             !(flags & ATTR_DMI));
653                 } else if ((vap->va_size <= ip->i_size) ||
654                            ((vap->va_size == 0) && ip->i_d.di_nextents)) {
655                         /*
656                          * signal a sync transaction unless
657                          * we're truncating an already unlinked
658                          * file on a wsync filesystem
659                          */
660                         code = xfs_itruncate_finish(&tp, ip,
661                                             (xfs_fsize_t)vap->va_size,
662                                             XFS_DATA_FORK,
663                                             ((ip->i_d.di_nlink != 0 ||
664                                               !(mp->m_flags & XFS_MOUNT_WSYNC))
665                                              ? 1 : 0));
666                         if (code)
667                                 goto abort_return;
668                         /*
669                          * Truncated "down", so we're removing references
670                          * to old data here - if we now delay flushing for
671                          * a long time, we expose ourselves unduly to the
672                          * notorious NULL files problem.  So, we mark this
673                          * vnode and flush it when the file is closed, and
674                          * do not wait the usual (long) time for writeout.
675                          */
676                         VTRUNCATE(vp);
677                 }
678                 /*
679                  * Have to do this even if the file's size doesn't change.
680                  */
681                 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
682         }
683
684         /*
685          * Change file access modes.
686          */
687         if (mask & XFS_AT_MODE) {
688                 ip->i_d.di_mode &= S_IFMT;
689                 ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
690
691                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
692                 timeflags |= XFS_ICHGTIME_CHG;
693         }
694
695         /*
696          * Change file ownership.  Must be the owner or privileged.
697          * If the system was configured with the "restricted_chown"
698          * option, the owner is not permitted to give away the file,
699          * and can change the group id only to a group of which he
700          * or she is a member.
701          */
702         if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
703                 /*
704                  * CAP_FSETID overrides the following restrictions:
705                  *
706                  * The set-user-ID and set-group-ID bits of a file will be
707                  * cleared upon successful return from chown()
708                  */
709                 if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
710                     !capable(CAP_FSETID)) {
711                         ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
712                 }
713
714                 /*
715                  * Change the ownerships and register quota modifications
716                  * in the transaction.
717                  */
718                 if (iuid != uid) {
719                         if (XFS_IS_UQUOTA_ON(mp)) {
720                                 ASSERT(mask & XFS_AT_UID);
721                                 ASSERT(udqp);
722                                 olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
723                                                         &ip->i_udquot, udqp);
724                         }
725                         ip->i_d.di_uid = uid;
726                 }
727                 if (igid != gid) {
728                         if (XFS_IS_GQUOTA_ON(mp)) {
729                                 ASSERT(!XFS_IS_PQUOTA_ON(mp));
730                                 ASSERT(mask & XFS_AT_GID);
731                                 ASSERT(gdqp);
732                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
733                                                         &ip->i_gdquot, gdqp);
734                         }
735                         ip->i_d.di_gid = gid;
736                 }
737                 if (iprojid != projid) {
738                         if (XFS_IS_PQUOTA_ON(mp)) {
739                                 ASSERT(!XFS_IS_GQUOTA_ON(mp));
740                                 ASSERT(mask & XFS_AT_PROJID);
741                                 ASSERT(gdqp);
742                                 olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
743                                                         &ip->i_gdquot, gdqp);
744                         }
745                         ip->i_d.di_projid = projid;
746                         /*
747                          * We may have to rev the inode as well as
748                          * the superblock version number since projids didn't
749                          * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
750                          */
751                         if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
752                                 xfs_bump_ino_vers2(tp, ip);
753                 }
754
755                 xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
756                 timeflags |= XFS_ICHGTIME_CHG;
757         }
758
759
760         /*
761          * Change file access or modified times.
762          */
763         if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
764                 if (mask & XFS_AT_ATIME) {
765                         ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
766                         ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
767                         ip->i_update_core = 1;
768                         timeflags &= ~XFS_ICHGTIME_ACC;
769                 }
770                 if (mask & XFS_AT_MTIME) {
771                         ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
772                         ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
773                         timeflags &= ~XFS_ICHGTIME_MOD;
774                         timeflags |= XFS_ICHGTIME_CHG;
775                 }
776                 if (tp && (flags & ATTR_UTIME))
777                         xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
778         }
779
780         /*
781          * Change XFS-added attributes.
782          */
783         if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
784                 if (mask & XFS_AT_EXTSIZE) {
785                         /*
786                          * Converting bytes to fs blocks.
787                          */
788                         ip->i_d.di_extsize = vap->va_extsize >>
789                                 mp->m_sb.sb_blocklog;
790                 }
791                 if (mask & XFS_AT_XFLAGS) {
792                         uint    di_flags;
793
794                         /* can't set PREALLOC this way, just preserve it */
795                         di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
796                         if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
797                                 di_flags |= XFS_DIFLAG_IMMUTABLE;
798                         if (vap->va_xflags & XFS_XFLAG_APPEND)
799                                 di_flags |= XFS_DIFLAG_APPEND;
800                         if (vap->va_xflags & XFS_XFLAG_SYNC)
801                                 di_flags |= XFS_DIFLAG_SYNC;
802                         if (vap->va_xflags & XFS_XFLAG_NOATIME)
803                                 di_flags |= XFS_DIFLAG_NOATIME;
804                         if (vap->va_xflags & XFS_XFLAG_NODUMP)
805                                 di_flags |= XFS_DIFLAG_NODUMP;
806                         if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
807                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
808                         if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
809                                 di_flags |= XFS_DIFLAG_NODEFRAG;
810                         if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
811                                 di_flags |= XFS_DIFLAG_FILESTREAM;
812                         if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
813                                 if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
814                                         di_flags |= XFS_DIFLAG_RTINHERIT;
815                                 if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
816                                         di_flags |= XFS_DIFLAG_NOSYMLINKS;
817                                 if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
818                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
819                         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
820                                 if (vap->va_xflags & XFS_XFLAG_REALTIME) {
821                                         di_flags |= XFS_DIFLAG_REALTIME;
822                                         ip->i_iocore.io_flags |= XFS_IOCORE_RT;
823                                 } else {
824                                         ip->i_iocore.io_flags &= ~XFS_IOCORE_RT;
825                                 }
826                                 if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
827                                         di_flags |= XFS_DIFLAG_EXTSIZE;
828                         }
829                         ip->i_d.di_flags = di_flags;
830                 }
831                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
832                 timeflags |= XFS_ICHGTIME_CHG;
833         }
834
835         /*
836          * Change file inode change time only if XFS_AT_CTIME set
837          * AND we have been called by a DMI function.
838          */
839
840         if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
841                 ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
842                 ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
843                 ip->i_update_core = 1;
844                 timeflags &= ~XFS_ICHGTIME_CHG;
845         }
846
847         /*
848          * Send out timestamp changes that need to be set to the
849          * current time.  Not done when called by a DMI function.
850          */
851         if (timeflags && !(flags & ATTR_DMI))
852                 xfs_ichgtime(ip, timeflags);
853
854         XFS_STATS_INC(xs_ig_attrchg);
855
856         /*
857          * If this is a synchronous mount, make sure that the
858          * transaction goes to disk before returning to the user.
859          * This is slightly sub-optimal in that truncates require
860          * two sync transactions instead of one for wsync filesystems.
861          * One for the truncate and one for the timestamps since we
862          * don't want to change the timestamps unless we're sure the
863          * truncate worked.  Truncates are less than 1% of the laddis
864          * mix so this probably isn't worth the trouble to optimize.
865          */
866         code = 0;
867         if (tp) {
868                 if (mp->m_flags & XFS_MOUNT_WSYNC)
869                         xfs_trans_set_sync(tp);
870
871                 code = xfs_trans_commit(tp, commit_flags);
872         }
873
874         /*
875          * If the (regular) file's mandatory locking mode changed, then
876          * notify the vnode.  We do this under the inode lock to prevent
877          * racing calls to vop_vnode_change.
878          */
879         mandlock_after = MANDLOCK(vp, ip->i_d.di_mode);
880         if (mandlock_before != mandlock_after) {
881                 bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING,
882                                  mandlock_after);
883         }
884
885         xfs_iunlock(ip, lock_flags);
886
887         /*
888          * Release any dquot(s) the inode had kept before chown.
889          */
890         XFS_QM_DQRELE(mp, olddquot1);
891         XFS_QM_DQRELE(mp, olddquot2);
892         XFS_QM_DQRELE(mp, udqp);
893         XFS_QM_DQRELE(mp, gdqp);
894
895         if (code) {
896                 return code;
897         }
898
899         if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) &&
900             !(flags & ATTR_DMI)) {
901                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL,
902                                         NULL, DM_RIGHT_NULL, NULL, NULL,
903                                         0, 0, AT_DELAY_FLAG(flags));
904         }
905         return 0;
906
907  abort_return:
908         commit_flags |= XFS_TRANS_ABORT;
909         /* FALLTHROUGH */
910  error_return:
911         XFS_QM_DQRELE(mp, udqp);
912         XFS_QM_DQRELE(mp, gdqp);
913         if (tp) {
914                 xfs_trans_cancel(tp, commit_flags);
915         }
916         if (lock_flags != 0) {
917                 xfs_iunlock(ip, lock_flags);
918         }
919         return code;
920 }
921
922
923 /*
924  * xfs_access
925  * Null conversion from vnode mode bits to inode mode bits, as in efs.
926  */
927 STATIC int
928 xfs_access(
929         bhv_desc_t      *bdp,
930         int             mode,
931         cred_t          *credp)
932 {
933         xfs_inode_t     *ip;
934         int             error;
935
936         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
937                                                (inst_t *)__return_address);
938
939         ip = XFS_BHVTOI(bdp);
940         xfs_ilock(ip, XFS_ILOCK_SHARED);
941         error = xfs_iaccess(ip, mode, credp);
942         xfs_iunlock(ip, XFS_ILOCK_SHARED);
943         return error;
944 }
945
946
/*
 * Number of extent mappings requested from bmapi when looking up the
 * blocks of a symlink target.
 *
 * The maximum pathlen is 1024 bytes. Since the minimum file system
 * blocksize is 512 bytes, we can get a max of 2 extents back from
 * bmapi.
 */
#define SYMLINK_MAPS 2
953
954 /*
955  * xfs_readlink
956  *
957  */
958 STATIC int
959 xfs_readlink(
960         bhv_desc_t      *bdp,
961         uio_t           *uiop,
962         int             ioflags,
963         cred_t          *credp)
964 {
965         xfs_inode_t     *ip;
966         int             count;
967         xfs_off_t       offset;
968         int             pathlen;
969         bhv_vnode_t     *vp;
970         int             error = 0;
971         xfs_mount_t     *mp;
972         int             nmaps;
973         xfs_bmbt_irec_t mval[SYMLINK_MAPS];
974         xfs_daddr_t     d;
975         int             byte_cnt;
976         int             n;
977         xfs_buf_t       *bp;
978
979         vp = BHV_TO_VNODE(bdp);
980         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
981
982         ip = XFS_BHVTOI(bdp);
983         mp = ip->i_mount;
984
985         if (XFS_FORCED_SHUTDOWN(mp))
986                 return XFS_ERROR(EIO);
987
988         xfs_ilock(ip, XFS_ILOCK_SHARED);
989
990         ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK);
991
992         offset = uiop->uio_offset;
993         count = uiop->uio_resid;
994
995         if (offset < 0) {
996                 error = XFS_ERROR(EINVAL);
997                 goto error_return;
998         }
999         if (count <= 0) {
1000                 error = 0;
1001                 goto error_return;
1002         }
1003
1004         /*
1005          * See if the symlink is stored inline.
1006          */
1007         pathlen = (int)ip->i_d.di_size;
1008
1009         if (ip->i_df.if_flags & XFS_IFINLINE) {
1010                 error = xfs_uio_read(ip->i_df.if_u1.if_data, pathlen, uiop);
1011         }
1012         else {
1013                 /*
1014                  * Symlink not inline.  Call bmap to get it in.
1015                  */
1016                 nmaps = SYMLINK_MAPS;
1017
1018                 error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen),
1019                                   0, NULL, 0, mval, &nmaps, NULL, NULL);
1020
1021                 if (error) {
1022                         goto error_return;
1023                 }
1024
1025                 for (n = 0; n < nmaps; n++) {
1026                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
1027                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
1028                         bp = xfs_buf_read(mp->m_ddev_targp, d,
1029                                       BTOBB(byte_cnt), 0);
1030                         error = XFS_BUF_GETERROR(bp);
1031                         if (error) {
1032                                 xfs_ioerror_alert("xfs_readlink",
1033                                           ip->i_mount, bp, XFS_BUF_ADDR(bp));
1034                                 xfs_buf_relse(bp);
1035                                 goto error_return;
1036                         }
1037                         if (pathlen < byte_cnt)
1038                                 byte_cnt = pathlen;
1039                         pathlen -= byte_cnt;
1040
1041                         error = xfs_uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop);
1042                         xfs_buf_relse (bp);
1043                 }
1044
1045         }
1046
1047 error_return:
1048         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1049         return error;
1050 }
1051
1052
1053 /*
1054  * xfs_fsync
1055  *
1056  * This is called to sync the inode and its data out to disk.
1057  * We need to hold the I/O lock while flushing the data, and
1058  * the inode lock while flushing the inode.  The inode lock CANNOT
1059  * be held while flushing the data, so acquire after we're done
1060  * with that.
1061  */
1062 STATIC int
1063 xfs_fsync(
1064         bhv_desc_t      *bdp,
1065         int             flag,
1066         cred_t          *credp,
1067         xfs_off_t       start,
1068         xfs_off_t       stop)
1069 {
1070         xfs_inode_t     *ip;
1071         xfs_trans_t     *tp;
1072         int             error;
1073         int             log_flushed = 0, changed = 1;
1074
1075         vn_trace_entry(BHV_TO_VNODE(bdp),
1076                         __FUNCTION__, (inst_t *)__return_address);
1077
1078         ip = XFS_BHVTOI(bdp);
1079
1080         ASSERT(start >= 0 && stop >= -1);
1081
1082         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1083                 return XFS_ERROR(EIO);
1084
1085         if (flag & FSYNC_DATA)
1086                 filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
1087
1088         /*
1089          * We always need to make sure that the required inode state
1090          * is safe on disk.  The vnode might be clean but because
1091          * of committed transactions that haven't hit the disk yet.
1092          * Likewise, there could be unflushed non-transactional
1093          * changes to the inode core that have to go to disk.
1094          *
1095          * The following code depends on one assumption:  that
1096          * any transaction that changes an inode logs the core
1097          * because it has to change some field in the inode core
1098          * (typically nextents or nblocks).  That assumption
1099          * implies that any transactions against an inode will
1100          * catch any non-transactional updates.  If inode-altering
1101          * transactions exist that violate this assumption, the
1102          * code breaks.  Right now, it figures that if the involved
1103          * update_* field is clear and the inode is unpinned, the
1104          * inode is clean.  Either it's been flushed or it's been
1105          * committed and the commit has hit the disk unpinning the inode.
1106          * (Note that xfs_inode_item_format() called at commit clears
1107          * the update_* fields.)
1108          */
1109         xfs_ilock(ip, XFS_ILOCK_SHARED);
1110
1111         /* If we are flushing data then we care about update_size
1112          * being set, otherwise we care about update_core
1113          */
1114         if ((flag & FSYNC_DATA) ?
1115                         (ip->i_update_size == 0) :
1116                         (ip->i_update_core == 0)) {
1117                 /*
1118                  * Timestamps/size haven't changed since last inode
1119                  * flush or inode transaction commit.  That means
1120                  * either nothing got written or a transaction
1121                  * committed which caught the updates.  If the
1122                  * latter happened and the transaction hasn't
1123                  * hit the disk yet, the inode will be still
1124                  * be pinned.  If it is, force the log.
1125                  */
1126
1127                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1128
1129                 if (xfs_ipincount(ip)) {
1130                         _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
1131                                       XFS_LOG_FORCE |
1132                                       ((flag & FSYNC_WAIT)
1133                                        ? XFS_LOG_SYNC : 0),
1134                                       &log_flushed);
1135                 } else {
1136                         /*
1137                          * If the inode is not pinned and nothing
1138                          * has changed we don't need to flush the
1139                          * cache.
1140                          */
1141                         changed = 0;
1142                 }
1143                 error = 0;
1144         } else  {
1145                 /*
1146                  * Kick off a transaction to log the inode
1147                  * core to get the updates.  Make it
1148                  * sync if FSYNC_WAIT is passed in (which
1149                  * is done by everybody but specfs).  The
1150                  * sync transaction will also force the log.
1151                  */
1152                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
1153                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
1154                 if ((error = xfs_trans_reserve(tp, 0,
1155                                 XFS_FSYNC_TS_LOG_RES(ip->i_mount),
1156                                 0, 0, 0)))  {
1157                         xfs_trans_cancel(tp, 0);
1158                         return error;
1159                 }
1160                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1161
1162                 /*
1163                  * Note - it's possible that we might have pushed
1164                  * ourselves out of the way during trans_reserve
1165                  * which would flush the inode.  But there's no
1166                  * guarantee that the inode buffer has actually
1167                  * gone out yet (it's delwri).  Plus the buffer
1168                  * could be pinned anyway if it's part of an
1169                  * inode in another recent transaction.  So we
1170                  * play it safe and fire off the transaction anyway.
1171                  */
1172                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1173                 xfs_trans_ihold(tp, ip);
1174                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1175                 if (flag & FSYNC_WAIT)
1176                         xfs_trans_set_sync(tp);
1177                 error = _xfs_trans_commit(tp, 0, &log_flushed);
1178
1179                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1180         }
1181
1182         if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) {
1183                 /*
1184                  * If the log write didn't issue an ordered tag we need
1185                  * to flush the disk cache for the data device now.
1186                  */
1187                 if (!log_flushed)
1188                         xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
1189
1190                 /*
1191                  * If this inode is on the RT dev we need to flush that
1192                  * cache as well.
1193                  */
1194                 if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)
1195                         xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
1196         }
1197
1198         return error;
1199 }
1200
1201 /*
1202  * This is called by xfs_inactive to free any blocks beyond eof
1203  * when the link count isn't zero and by xfs_dm_punch_hole() when
1204  * punching a hole to EOF.
1205  */
1206 int
1207 xfs_free_eofblocks(
1208         xfs_mount_t     *mp,
1209         xfs_inode_t     *ip,
1210         int             flags)
1211 {
1212         xfs_trans_t     *tp;
1213         int             error;
1214         xfs_fileoff_t   end_fsb;
1215         xfs_fileoff_t   last_fsb;
1216         xfs_filblks_t   map_len;
1217         int             nimaps;
1218         xfs_bmbt_irec_t imap;
1219         int             use_iolock = (flags & XFS_FREE_EOF_LOCK);
1220
1221         /*
1222          * Figure out if there are any blocks beyond the end
1223          * of the file.  If not, then there is nothing to do.
1224          */
1225         end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size));
1226         last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1227         map_len = last_fsb - end_fsb;
1228         if (map_len <= 0)
1229                 return 0;
1230
1231         nimaps = 1;
1232         xfs_ilock(ip, XFS_ILOCK_SHARED);
1233         error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0,
1234                           NULL, 0, &imap, &nimaps, NULL, NULL);
1235         xfs_iunlock(ip, XFS_ILOCK_SHARED);
1236
1237         if (!error && (nimaps != 0) &&
1238             (imap.br_startblock != HOLESTARTBLOCK ||
1239              ip->i_delayed_blks)) {
1240                 /*
1241                  * Attach the dquots to the inode up front.
1242                  */
1243                 if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1244                         return error;
1245
1246                 /*
1247                  * There are blocks after the end of file.
1248                  * Free them up now by truncating the file to
1249                  * its current size.
1250                  */
1251                 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1252
1253                 /*
1254                  * Do the xfs_itruncate_start() call before
1255                  * reserving any log space because
1256                  * itruncate_start will call into the buffer
1257                  * cache and we can't
1258                  * do that within a transaction.
1259                  */
1260                 if (use_iolock)
1261                         xfs_ilock(ip, XFS_IOLOCK_EXCL);
1262                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE,
1263                                     ip->i_size);
1264                 if (error) {
1265                         xfs_trans_cancel(tp, 0);
1266                         if (use_iolock)
1267                                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1268                         return error;
1269                 }
1270
1271                 error = xfs_trans_reserve(tp, 0,
1272                                           XFS_ITRUNCATE_LOG_RES(mp),
1273                                           0, XFS_TRANS_PERM_LOG_RES,
1274                                           XFS_ITRUNCATE_LOG_COUNT);
1275                 if (error) {
1276                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1277                         xfs_trans_cancel(tp, 0);
1278                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1279                         return error;
1280                 }
1281
1282                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1283                 xfs_trans_ijoin(tp, ip,
1284                                 XFS_IOLOCK_EXCL |
1285                                 XFS_ILOCK_EXCL);
1286                 xfs_trans_ihold(tp, ip);
1287
1288                 error = xfs_itruncate_finish(&tp, ip,
1289                                              ip->i_size,
1290                                              XFS_DATA_FORK,
1291                                              0);
1292                 /*
1293                  * If we get an error at this point we
1294                  * simply don't bother truncating the file.
1295                  */
1296                 if (error) {
1297                         xfs_trans_cancel(tp,
1298                                          (XFS_TRANS_RELEASE_LOG_RES |
1299                                           XFS_TRANS_ABORT));
1300                 } else {
1301                         error = xfs_trans_commit(tp,
1302                                                 XFS_TRANS_RELEASE_LOG_RES);
1303                 }
1304                 xfs_iunlock(ip, (use_iolock ? (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)
1305                                             : XFS_ILOCK_EXCL));
1306         }
1307         return error;
1308 }
1309
/*
 * Free a symlink that has blocks associated with it.
 *
 * ip  - the symlink inode being inactivated
 * tpp - in: a transaction with no reservation yet;
 *       out: on success, a fresh transaction carrying an itruncate log
 *       reservation; on any failure, NULL
 *
 * The symlink's blocks are invalidated and unmapped in the incoming
 * transaction, which is then rolled via xfs_bmap_finish() and
 * xfs_trans_dup() so that the caller gets an empty, reserved transaction.
 *
 * Returns 0 with the inode locked (iolock and ilock exclusive) but not
 * joined to *tpp.  On error the transaction is cancelled and *tpp is
 * NULL; the inode has been unlocked again, except when the very first
 * reservation fails, in which case it was never locked here.
 */
STATIC int
xfs_inactive_symlink_rmt(
        xfs_inode_t     *ip,
        xfs_trans_t     **tpp)
{
        xfs_buf_t       *bp;
        int             committed;
        int             done;
        int             error;
        xfs_fsblock_t   first_block;
        xfs_bmap_free_t free_list;
        int             i;
        xfs_mount_t     *mp;
        xfs_bmbt_irec_t mval[SYMLINK_MAPS];
        int             nmaps;
        xfs_trans_t     *ntp;
        int             size;
        xfs_trans_t     *tp;

        tp = *tpp;
        mp = ip->i_mount;
        ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip));
        /*
         * We're freeing a symlink that has some
         * blocks allocated to it.  Free the
         * blocks here.  We know that we've got
         * either 1 or 2 extents and that we can
         * free them all in one bunmapi call.
         */
        ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                xfs_trans_cancel(tp, 0);
                *tpp = NULL;
                return error;
        }
        /*
         * Lock the inode, fix the size, and join it to the transaction.
         * Hold it so in the normal path, we still have it locked for
         * the second transaction.  In the error paths we need it
         * held so the cancel won't rele it, see below.
         */
        xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        /* Remember the old length; it bounds the mapping lookup below. */
        size = (int)ip->i_d.di_size;
        ip->i_d.di_size = 0;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Find the block(s) so we can inval and unmap them.
         */
        done = 0;
        XFS_BMAP_INIT(&free_list, &first_block);
        nmaps = ARRAY_SIZE(mval);
        if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
                        XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
                        &free_list, NULL)))
                goto error0;
        /*
         * Invalidate the block(s).
         */
        for (i = 0; i < nmaps; i++) {
                bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
                        XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
                        XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
                xfs_trans_binval(tp, bp);
        }
        /*
         * Unmap the dead block(s) to the free_list.
         *
         * NOTE(review): the length passed here is the byte count "size",
         * whereas the xfs_bmapi() call above converts it with
         * XFS_B_TO_FSB() - confirm the over-long range is intentional.
         */
        if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
                        &first_block, &free_list, NULL, &done)))
                goto error1;
        ASSERT(done);
        /*
         * Commit the first transaction.  This logs the EFI and the inode.
         */
        if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
                goto error1;
        /*
         * The transaction must have been committed, since there were
         * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
         * The new tp has the extent freeing and EFDs.
         */
        ASSERT(committed);
        /*
         * The first xact was committed, so add the inode to the new one.
         * Mark it dirty so it will be logged and moved forward in the log as
         * part of every commit.
         */
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
        xfs_trans_ihold(tp, ip);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        /*
         * Get a new, empty transaction to return to our caller.
         */
        ntp = xfs_trans_dup(tp);
        /*
         * Commit the transaction containing extent freeing and EFDs.
         * If we get an error on the commit here or on the reserve below,
         * we need to unlock the inode since the new transaction doesn't
         * have the inode attached.
         */
        error = xfs_trans_commit(tp, 0);
        /* From here on tp is the duplicate; error0 cancels that one. */
        tp = ntp;
        if (error) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
        /*
         * Remove the memory for extent descriptions (just bookkeeping).
         */
        if (ip->i_df.if_bytes)
                xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
        ASSERT(ip->i_df.if_bytes == 0);
        /*
         * Put an itruncate log reservation in the new transaction
         * for our caller.
         */
        if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
                        XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) {
                ASSERT(XFS_FORCED_SHUTDOWN(mp));
                goto error0;
        }
        /*
         * Return with the inode locked but not joined to the transaction.
         */
        *tpp = tp;
        return 0;

 error1:
        /* Drop the pending extent frees before falling into error0. */
        xfs_bmap_cancel(&free_list);
 error0:
        /*
         * Have to come here with the inode locked and either
         * (held and in the transaction) or (not in the transaction).
         * If the inode isn't held then cancel would iput it, but
         * that's wrong since this is inactive and the vnode ref
         * count is 0 already.
         * Cancel won't do anything to the inode if held, but it still
         * needs to be locked until the cancel is done, if it was
         * joined to the transaction.
         */
        xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
        xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
        *tpp = NULL;
        return error;

}
1463
1464 STATIC int
1465 xfs_inactive_symlink_local(
1466         xfs_inode_t     *ip,
1467         xfs_trans_t     **tpp)
1468 {
1469         int             error;
1470
1471         ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip));
1472         /*
1473          * We're freeing a symlink which fit into
1474          * the inode.  Just free the memory used
1475          * to hold the old symlink.
1476          */
1477         error = xfs_trans_reserve(*tpp, 0,
1478                                   XFS_ITRUNCATE_LOG_RES(ip->i_mount),
1479                                   0, XFS_TRANS_PERM_LOG_RES,
1480                                   XFS_ITRUNCATE_LOG_COUNT);
1481
1482         if (error) {
1483                 xfs_trans_cancel(*tpp, 0);
1484                 *tpp = NULL;
1485                 return error;
1486         }
1487         xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1488
1489         /*
1490          * Zero length symlinks _can_ exist.
1491          */
1492         if (ip->i_df.if_bytes > 0) {
1493                 xfs_idata_realloc(ip,
1494                                   -(ip->i_df.if_bytes),
1495                                   XFS_DATA_FORK);
1496                 ASSERT(ip->i_df.if_bytes == 0);
1497         }
1498         return 0;
1499 }
1500
1501 STATIC int
1502 xfs_inactive_attrs(
1503         xfs_inode_t     *ip,
1504         xfs_trans_t     **tpp)
1505 {
1506         xfs_trans_t     *tp;
1507         int             error;
1508         xfs_mount_t     *mp;
1509
1510         ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE));
1511         tp = *tpp;
1512         mp = ip->i_mount;
1513         ASSERT(ip->i_d.di_forkoff != 0);
1514         xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1515         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1516
1517         error = xfs_attr_inactive(ip);
1518         if (error) {
1519                 *tpp = NULL;
1520                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1521                 return error; /* goto out */
1522         }
1523
1524         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1525         error = xfs_trans_reserve(tp, 0,
1526                                   XFS_IFREE_LOG_RES(mp),
1527                                   0, XFS_TRANS_PERM_LOG_RES,
1528                                   XFS_INACTIVE_LOG_COUNT);
1529         if (error) {
1530                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1531                 xfs_trans_cancel(tp, 0);
1532                 *tpp = NULL;
1533                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1534                 return error;
1535         }
1536
1537         xfs_ilock(ip, XFS_ILOCK_EXCL);
1538         xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1539         xfs_trans_ihold(tp, ip);
1540         xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1541
1542         ASSERT(ip->i_d.di_anextents == 0);
1543
1544         *tpp = tp;
1545         return 0;
1546 }
1547
1548 STATIC int
1549 xfs_release(
1550         bhv_desc_t      *bdp)
1551 {
1552         xfs_inode_t     *ip;
1553         bhv_vnode_t     *vp;
1554         xfs_mount_t     *mp;
1555         int             error;
1556
1557         vp = BHV_TO_VNODE(bdp);
1558         ip = XFS_BHVTOI(bdp);
1559         mp = ip->i_mount;
1560
1561         if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0))
1562                 return 0;
1563
1564         /* If this is a read-only mount, don't do this (would generate I/O) */
1565         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1566                 return 0;
1567
1568         if (!XFS_FORCED_SHUTDOWN(mp)) {
1569                 /*
1570                  * If we are using filestreams, and we have an unlinked
1571                  * file that we are processing the last close on, then nothing
1572                  * will be able to reopen and write to this file. Purge this
1573                  * inode from the filestreams cache so that it doesn't delay
1574                  * teardown of the inode.
1575                  */
1576                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1577                         xfs_filestream_deassociate(ip);
1578
1579                 /*
1580                  * If we previously truncated this file and removed old data
1581                  * in the process, we want to initiate "early" writeout on
1582                  * the last close.  This is an attempt to combat the notorious
1583                  * NULL files problem which is particularly noticable from a
1584                  * truncate down, buffered (re-)write (delalloc), followed by
1585                  * a crash.  What we are effectively doing here is
1586                  * significantly reducing the time window where we'd otherwise
1587                  * be exposed to that problem.
1588                  */
1589                 if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
1590                         bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE);
1591         }
1592
1593 #ifdef HAVE_REFCACHE
1594         /* If we are in the NFS reference cache then don't do this now */
1595         if (ip->i_refcache)
1596                 return 0;
1597 #endif
1598
1599         if (ip->i_d.di_nlink != 0) {
1600                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1601                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1602                        ip->i_delayed_blks > 0)) &&
1603                      (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
1604                     (!(ip->i_d.di_flags &
1605                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
1606                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1607                         if (error)
1608                                 return error;
1609                         /* Update linux inode block count after free above */
1610                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1611                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1612                 }
1613         }
1614
1615         return 0;
1616 }
1617
1618 /*
1619  * xfs_inactive
1620  *
1621  * This is called when the vnode reference count for the vnode
1622  * goes to zero.  If the file has been unlinked, then it must
1623  * now be truncated.  Also, we clear all of the read-ahead state
1624  * kept for the inode here since the file is now closed.
1625  */
1626 STATIC int
1627 xfs_inactive(
1628         bhv_desc_t      *bdp,
1629         cred_t          *credp)
1630 {
1631         xfs_inode_t     *ip;
1632         bhv_vnode_t     *vp;
1633         xfs_bmap_free_t free_list;
1634         xfs_fsblock_t   first_block;
1635         int             committed;
1636         xfs_trans_t     *tp;
1637         xfs_mount_t     *mp;
1638         int             error;
1639         int             truncate;
1640
1641         vp = BHV_TO_VNODE(bdp);
1642         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
1643
1644         ip = XFS_BHVTOI(bdp);
1645
1646         /*
1647          * If the inode is already free, then there can be nothing
1648          * to clean up here.
1649          */
1650         if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
1651                 ASSERT(ip->i_df.if_real_bytes == 0);
1652                 ASSERT(ip->i_df.if_broot_bytes == 0);
1653                 return VN_INACTIVE_CACHE;
1654         }
1655
1656         /*
1657          * Only do a truncate if it's a regular file with
1658          * some actual space in it.  It's OK to look at the
1659          * inode's fields without the lock because we're the
1660          * only one with a reference to the inode.
1661          */
1662         truncate = ((ip->i_d.di_nlink == 0) &&
1663             ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
1664              (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
1665             ((ip->i_d.di_mode & S_IFMT) == S_IFREG));
1666
1667         mp = ip->i_mount;
1668
1669         if (ip->i_d.di_nlink == 0 &&
1670             DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) {
1671                 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL);
1672         }
1673
1674         error = 0;
1675
1676         /* If this is a read-only mount, don't do this (would generate I/O) */
1677         if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
1678                 goto out;
1679
1680         if (ip->i_d.di_nlink != 0) {
1681                 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1682                      ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
1683                        ip->i_delayed_blks > 0)) &&
1684                       (ip->i_df.if_flags & XFS_IFEXTENTS) &&
1685                      (!(ip->i_d.di_flags &
1686                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
1687                       (ip->i_delayed_blks != 0)))) {
1688                         error = xfs_free_eofblocks(mp, ip, XFS_FREE_EOF_LOCK);
1689                         if (error)
1690                                 return VN_INACTIVE_CACHE;
1691                         /* Update linux inode block count after free above */
1692                         vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp,
1693                                 ip->i_d.di_nblocks + ip->i_delayed_blks);
1694                 }
1695                 goto out;
1696         }
1697
1698         ASSERT(ip->i_d.di_nlink == 0);
1699
1700         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
1701                 return VN_INACTIVE_CACHE;
1702
1703         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1704         if (truncate) {
1705                 /*
1706                  * Do the xfs_itruncate_start() call before
1707                  * reserving any log space because itruncate_start
1708                  * will call into the buffer cache and we can't
1709                  * do that within a transaction.
1710                  */
1711                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
1712
1713                 error = xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0);
1714                 if (error) {
1715                         xfs_trans_cancel(tp, 0);
1716                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1717                         return VN_INACTIVE_CACHE;
1718                 }
1719
1720                 error = xfs_trans_reserve(tp, 0,
1721                                           XFS_ITRUNCATE_LOG_RES(mp),
1722                                           0, XFS_TRANS_PERM_LOG_RES,
1723                                           XFS_ITRUNCATE_LOG_COUNT);
1724                 if (error) {
1725                         /* Don't call itruncate_cleanup */
1726                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1727                         xfs_trans_cancel(tp, 0);
1728                         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1729                         return VN_INACTIVE_CACHE;
1730                 }
1731
1732                 xfs_ilock(ip, XFS_ILOCK_EXCL);
1733                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1734                 xfs_trans_ihold(tp, ip);
1735
1736                 /*
1737                  * normally, we have to run xfs_itruncate_finish sync.
1738                  * But if filesystem is wsync and we're in the inactive
1739                  * path, then we know that nlink == 0, and that the
1740                  * xaction that made nlink == 0 is permanently committed
1741                  * since xfs_remove runs as a synchronous transaction.
1742                  */
1743                 error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK,
1744                                 (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0));
1745
1746                 if (error) {
1747                         xfs_trans_cancel(tp,
1748                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1749                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1750                         return VN_INACTIVE_CACHE;
1751                 }
1752         } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) {
1753
1754                 /*
1755                  * If we get an error while cleaning up a
1756                  * symlink we bail out.
1757                  */
1758                 error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ?
1759                         xfs_inactive_symlink_rmt(ip, &tp) :
1760                         xfs_inactive_symlink_local(ip, &tp);
1761
1762                 if (error) {
1763                         ASSERT(tp == NULL);
1764                         return VN_INACTIVE_CACHE;
1765                 }
1766
1767                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1768                 xfs_trans_ihold(tp, ip);
1769         } else {
1770                 error = xfs_trans_reserve(tp, 0,
1771                                           XFS_IFREE_LOG_RES(mp),
1772                                           0, XFS_TRANS_PERM_LOG_RES,
1773                                           XFS_INACTIVE_LOG_COUNT);
1774                 if (error) {
1775                         ASSERT(XFS_FORCED_SHUTDOWN(mp));
1776                         xfs_trans_cancel(tp, 0);
1777                         return VN_INACTIVE_CACHE;
1778                 }
1779
1780                 xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1781                 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1782                 xfs_trans_ihold(tp, ip);
1783         }
1784
1785         /*
1786          * If there are attributes associated with the file
1787          * then blow them away now.  The code calls a routine
1788          * that recursively deconstructs the attribute fork.
1789          * We need to just commit the current transaction
1790          * because we can't use it for xfs_attr_inactive().
1791          */
1792         if (ip->i_d.di_anextents > 0) {
1793                 error = xfs_inactive_attrs(ip, &tp);
1794                 /*
1795                  * If we got an error, the transaction is already
1796                  * cancelled, and the inode is unlocked. Just get out.
1797                  */
1798                  if (error)
1799                          return VN_INACTIVE_CACHE;
1800         } else if (ip->i_afp) {
1801                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1802         }
1803
1804         /*
1805          * Free the inode.
1806          */
1807         XFS_BMAP_INIT(&free_list, &first_block);
1808         error = xfs_ifree(tp, ip, &free_list);
1809         if (error) {
1810                 /*
1811                  * If we fail to free the inode, shut down.  The cancel
1812                  * might do that, we need to make sure.  Otherwise the
1813                  * inode might be lost for a long time or forever.
1814                  */
1815                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1816                         cmn_err(CE_NOTE,
1817                 "xfs_inactive:  xfs_ifree() returned an error = %d on %s",
1818                                 error, mp->m_fsname);
1819                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1820                 }
1821                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1822         } else {
1823                 /*
1824                  * Credit the quota account(s). The inode is gone.
1825                  */
1826                 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1827
1828                 /*
1829                  * Just ignore errors at this point.  There is
1830                  * nothing we can do except to try to keep going.
1831                  */
1832                 (void) xfs_bmap_finish(&tp,  &free_list, &committed);
1833                 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1834         }
1835         /*
1836          * Release the dquots held by inode, if any.
1837          */
1838         XFS_QM_DQDETACH(mp, ip);
1839
1840         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
1841
1842  out:
1843         return VN_INACTIVE_CACHE;
1844 }
1845
1846
1847 /*
1848  * xfs_lookup
1849  */
1850 STATIC int
1851 xfs_lookup(
1852         bhv_desc_t              *dir_bdp,
1853         bhv_vname_t             *dentry,
1854         bhv_vnode_t             **vpp,
1855         int                     flags,
1856         bhv_vnode_t             *rdir,
1857         cred_t                  *credp)
1858 {
1859         xfs_inode_t             *dp, *ip;
1860         xfs_ino_t               e_inum;
1861         int                     error;
1862         uint                    lock_mode;
1863         bhv_vnode_t             *dir_vp;
1864
1865         dir_vp = BHV_TO_VNODE(dir_bdp);
1866         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1867
1868         dp = XFS_BHVTOI(dir_bdp);
1869
1870         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
1871                 return XFS_ERROR(EIO);
1872
1873         lock_mode = xfs_ilock_map_shared(dp);
1874         error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip);
1875         if (!error) {
1876                 *vpp = XFS_ITOV(ip);
1877                 ITRACE(ip);
1878         }
1879         xfs_iunlock_map_shared(dp, lock_mode);
1880         return error;
1881 }
1882
1883
1884 /*
1885  * xfs_create (create a new file).
1886  */
1887 STATIC int
1888 xfs_create(
1889         bhv_desc_t              *dir_bdp,
1890         bhv_vname_t             *dentry,
1891         bhv_vattr_t             *vap,
1892         bhv_vnode_t             **vpp,
1893         cred_t                  *credp)
1894 {
1895         char                    *name = VNAME(dentry);
1896         bhv_vnode_t             *dir_vp;
1897         xfs_inode_t             *dp, *ip;
1898         bhv_vnode_t             *vp = NULL;
1899         xfs_trans_t             *tp;
1900         xfs_mount_t             *mp;
1901         xfs_dev_t               rdev;
1902         int                     error;
1903         xfs_bmap_free_t         free_list;
1904         xfs_fsblock_t           first_block;
1905         boolean_t               dp_joined_to_trans;
1906         int                     dm_event_sent = 0;
1907         uint                    cancel_flags;
1908         int                     committed;
1909         xfs_prid_t              prid;
1910         struct xfs_dquot        *udqp, *gdqp;
1911         uint                    resblks;
1912         int                     dm_di_mode;
1913         int                     namelen;
1914
1915         ASSERT(!*vpp);
1916         dir_vp = BHV_TO_VNODE(dir_bdp);
1917         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
1918
1919         dp = XFS_BHVTOI(dir_bdp);
1920         mp = dp->i_mount;
1921
1922         dm_di_mode = vap->va_mode;
1923         namelen = VNAMELEN(dentry);
1924
1925         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
1926                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1927                                 dir_vp, DM_RIGHT_NULL, NULL,
1928                                 DM_RIGHT_NULL, name, NULL,
1929                                 dm_di_mode, 0, 0);
1930
1931                 if (error)
1932                         return error;
1933                 dm_event_sent = 1;
1934         }
1935
1936         if (XFS_FORCED_SHUTDOWN(mp))
1937                 return XFS_ERROR(EIO);
1938
1939         /* Return through std_return after this point. */
1940
1941         udqp = gdqp = NULL;
1942         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1943                 prid = dp->i_d.di_projid;
1944         else if (vap->va_mask & XFS_AT_PROJID)
1945                 prid = (xfs_prid_t)vap->va_projid;
1946         else
1947                 prid = (xfs_prid_t)dfltprid;
1948
1949         /*
1950          * Make sure that we have allocated dquot(s) on disk.
1951          */
1952         error = XFS_QM_DQVOPALLOC(mp, dp,
1953                         current_fsuid(credp), current_fsgid(credp), prid,
1954                         XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
1955         if (error)
1956                 goto std_return;
1957
1958         ip = NULL;
1959         dp_joined_to_trans = B_FALSE;
1960
1961         tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1962         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1963         resblks = XFS_CREATE_SPACE_RES(mp, namelen);
1964         /*
1965          * Initially assume that the file does not exist and
1966          * reserve the resources for that case.  If that is not
1967          * the case we'll drop the one we have and get a more
1968          * appropriate transaction later.
1969          */
1970         error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0,
1971                         XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1972         if (error == ENOSPC) {
1973                 resblks = 0;
1974                 error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0,
1975                                 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT);
1976         }
1977         if (error) {
1978                 cancel_flags = 0;
1979                 dp = NULL;
1980                 goto error_return;
1981         }
1982
1983         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1984
1985         XFS_BMAP_INIT(&free_list, &first_block);
1986
1987         ASSERT(ip == NULL);
1988
1989         /*
1990          * Reserve disk quota and the inode.
1991          */
1992         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
1993         if (error)
1994                 goto error_return;
1995
1996         if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen)))
1997                 goto error_return;
1998         rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
1999         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
2000                         rdev, credp, prid, resblks > 0,
2001                         &ip, &committed);
2002         if (error) {
2003                 if (error == ENOSPC)
2004                         goto error_return;
2005                 goto abort_return;
2006         }
2007         ITRACE(ip);
2008
2009         /*
2010          * At this point, we've gotten a newly allocated inode.
2011          * It is locked (and joined to the transaction).
2012          */
2013
2014         ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE));
2015
2016         /*
2017          * Now we join the directory inode to the transaction.
2018          * We do not do it earlier because xfs_dir_ialloc
2019          * might commit the previous transaction (and release
2020          * all the locks).
2021          */
2022
2023         VN_HOLD(dir_vp);
2024         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2025         dp_joined_to_trans = B_TRUE;
2026
2027         error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino,
2028                                         &first_block, &free_list, resblks ?
2029                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2030         if (error) {
2031                 ASSERT(error != ENOSPC);
2032                 goto abort_return;
2033         }
2034         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2035         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2036
2037         /*
2038          * If this is a synchronous mount, make sure that the
2039          * create transaction goes to disk before returning to
2040          * the user.
2041          */
2042         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2043                 xfs_trans_set_sync(tp);
2044         }
2045
2046         dp->i_gen++;
2047
2048         /*
2049          * Attach the dquot(s) to the inodes and modify them incore.
2050          * These ids of the inode couldn't have changed since the new
2051          * inode has been locked ever since it was created.
2052          */
2053         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
2054
2055         /*
2056          * xfs_trans_commit normally decrements the vnode ref count
2057          * when it unlocks the inode. Since we want to return the
2058          * vnode to the caller, we bump the vnode ref count now.
2059          */
2060         IHOLD(ip);
2061         vp = XFS_ITOV(ip);
2062
2063         error = xfs_bmap_finish(&tp, &free_list, &committed);
2064         if (error) {
2065                 xfs_bmap_cancel(&free_list);
2066                 goto abort_rele;
2067         }
2068
2069         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2070         if (error) {
2071                 IRELE(ip);
2072                 tp = NULL;
2073                 goto error_return;
2074         }
2075
2076         XFS_QM_DQRELE(mp, udqp);
2077         XFS_QM_DQRELE(mp, gdqp);
2078
2079         /*
2080          * Propagate the fact that the vnode changed after the
2081          * xfs_inode locks have been released.
2082          */
2083         bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3);
2084
2085         *vpp = vp;
2086
2087         /* Fallthrough to std_return with error = 0  */
2088
2089 std_return:
2090         if ( (*vpp || (error != 0 && dm_event_sent != 0)) &&
2091                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
2092                                                         DM_EVENT_POSTCREATE)) {
2093                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2094                         dir_vp, DM_RIGHT_NULL,
2095                         *vpp ? vp:NULL,
2096                         DM_RIGHT_NULL, name, NULL,
2097                         dm_di_mode, error, 0);
2098         }
2099         return error;
2100
2101  abort_return:
2102         cancel_flags |= XFS_TRANS_ABORT;
2103         /* FALLTHROUGH */
2104
2105  error_return:
2106         if (tp != NULL)
2107                 xfs_trans_cancel(tp, cancel_flags);
2108
2109         if (!dp_joined_to_trans && (dp != NULL))
2110                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2111         XFS_QM_DQRELE(mp, udqp);
2112         XFS_QM_DQRELE(mp, gdqp);
2113
2114         goto std_return;
2115
2116  abort_rele:
2117         /*
2118          * Wait until after the current transaction is aborted to
2119          * release the inode.  This prevents recursive transactions
2120          * and deadlocks from xfs_inactive.
2121          */
2122         cancel_flags |= XFS_TRANS_ABORT;
2123         xfs_trans_cancel(tp, cancel_flags);
2124         IRELE(ip);
2125
2126         XFS_QM_DQRELE(mp, udqp);
2127         XFS_QM_DQRELE(mp, gdqp);
2128
2129         goto std_return;
2130 }
2131
2132 #ifdef DEBUG
2133 /*
2134  * Some counters to see if (and how often) we are hitting some deadlock
2135  * prevention code paths.  They exist only in DEBUG builds and are
      * bumped, without any locking of their own, by
      * xfs_lock_dir_and_entry().
2136  */
2137
2138 int xfs_rm_locks;	/* calls to xfs_lock_dir_and_entry() */
2139 int xfs_rm_lock_delays;	/* delay(1) back-offs taken while retrying */
2140 int xfs_rm_attempts;	/* failed xfs_ilock_nowait() tries on the entry */
2141 #endif
2142
2143 /*
2144  * The following routine will lock the inodes associated with the
2145  * directory and the named entry in the directory. The locks are
2146  * acquired in increasing inode number.
2147  *
2148  * If the entry is "..", then only the directory is locked. The
2149  * vnode ref count will still include that from the .. entry in
2150  * this case.
2151  *
2152  * There is a deadlock we need to worry about. If the locked directory is
2153  * in the AIL, it might be blocking up the log. The next inode we lock
2154  * could be already locked by another thread waiting for log space (e.g
2155  * a permanent log reservation with a long running transaction (see
2156  * xfs_itruncate_finish)). To solve this, we must check if the directory
2157  * is in the ail and use lock_nowait. If we can't lock, we need to
2158  * drop the inode lock on the directory and try again. xfs_iunlock will
2159  * potentially push the tail if we were holding up the log.
2160  */
2161 STATIC int
2162 xfs_lock_dir_and_entry(
2163         xfs_inode_t     *dp,
2164         xfs_inode_t     *ip)    /* inode of entry 'name' */
2165 {
2166         int             attempts;
2167         xfs_ino_t       e_inum;
2168         xfs_inode_t     *ips[2];
2169         xfs_log_item_t  *lp;
2170
2171 #ifdef DEBUG
2172         xfs_rm_locks++;
2173 #endif
2174         attempts = 0;
2175
2176 again:
2177         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2178
2179         e_inum = ip->i_ino;
2180
2181         ITRACE(ip);
2182
2183         /*
2184          * We want to lock in increasing inum. Since we've already
2185          * acquired the lock on the directory, we may need to release
2186          * if if the inum of the entry turns out to be less.
2187          */
2188         if (e_inum > dp->i_ino) {
2189                 /*
2190                  * We are already in the right order, so just
2191                  * lock on the inode of the entry.
2192                  * We need to use nowait if dp is in the AIL.
2193                  */
2194
2195                 lp = (xfs_log_item_t *)dp->i_itemp;
2196                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2197                         if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2198                                 attempts++;
2199 #ifdef DEBUG
2200                                 xfs_rm_attempts++;
2201 #endif
2202
2203                                 /*
2204                                  * Unlock dp and try again.
2205                                  * xfs_iunlock will try to push the tail
2206                                  * if the inode is in the AIL.
2207                                  */
2208
2209                                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2210
2211                                 if ((attempts % 5) == 0) {
2212                                         delay(1); /* Don't just spin the CPU */
2213 #ifdef DEBUG
2214                                         xfs_rm_lock_delays++;
2215 #endif
2216                                 }
2217                                 goto again;
2218                         }
2219                 } else {
2220                         xfs_ilock(ip, XFS_ILOCK_EXCL);
2221                 }
2222         } else if (e_inum < dp->i_ino) {
2223                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
2224
2225                 ips[0] = ip;
2226                 ips[1] = dp;
2227                 xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2228         }
2229         /* else  e_inum == dp->i_ino */
2230         /*     This can happen if we're asked to lock /x/..
2231          *     the entry is "..", which is also the parent directory.
2232          */
2233
2234         return 0;
2235 }
2236
2237 #ifdef DEBUG
/*
 * DEBUG-only statistics for the multi-inode locking code.  Only the
 * update of xfs_lock_delays is visible near these declarations; the
 * meaning given for the other counters is inferred from their names.
 * TODO confirm against xfs_lock_inodes().
 */
2238 int xfs_locked_n;	/* presumably: number of lock operations - verify */
2239 int xfs_small_retries;	/* presumably: retries in the "few attempts" bucket - verify */
2240 int xfs_middle_retries;	/* presumably: retries in the "medium" bucket - verify */
2241 int xfs_lots_retries;	/* presumably: retries in the "many attempts" bucket - verify */
2242 int xfs_lock_delays;	/* delay(1) back-offs taken by xfs_lock_inodes() */
2243 #endif
2244
2245 /*
2246  * Bump the subclass so xfs_lock_inodes() acquires each lock with
2247  * a different value
2248  */
2249 static inline int
2250 xfs_lock_inumorder(int lock_mode, int subclass)
2251 {
2252         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
2253                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
2254         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
2255                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
2256
2257         return lock_mode;
2258 }
2259
2260 /*
2261  * The following routine will lock n inodes in exclusive mode.
2262  * We assume the caller calls us with the inodes in i_ino order.
2263  *
2264  * We need to detect deadlock where an inode that we lock
2265  * is in the AIL and we start waiting for another inode that is locked
2266  * by a thread in a long running transaction (such as truncate). This can
2267  * result in deadlock since the long running trans might need to wait
2268  * for the inode we just locked in order to push the tail and free space
2269  * in the log.
2270  */
2271 void
2272 xfs_lock_inodes(
2273         xfs_inode_t     **ips,
2274         int             inodes,
2275         int             first_locked,
2276         uint            lock_mode)
2277 {
2278         int             attempts = 0, i, j, try_lock;
2279         xfs_log_item_t  *lp;
2280
2281         ASSERT(ips && (inodes >= 2)); /* we need at least two */
2282
2283         if (first_locked) {
2284                 try_lock = 1;
2285                 i = 1;
2286         } else {
2287                 try_lock = 0;
2288                 i = 0;
2289         }
2290
2291 again:
2292         for (; i < inodes; i++) {
2293                 ASSERT(ips[i]);
2294
2295                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
2296                         continue;
2297
2298                 /*
2299                  * If try_lock is not set yet, make sure all locked inodes
2300                  * are not in the AIL.
2301                  * If any are, set try_lock to be used later.
2302                  */
2303
2304                 if (!try_lock) {
2305                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
2306                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
2307                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
2308                                         try_lock++;
2309                                 }
2310                         }
2311                 }
2312
2313                 /*
2314                  * If any of the previous locks we have locked is in the AIL,
2315                  * we must TRY to get the second and subsequent locks. If
2316                  * we can't get any, we must release all we have
2317                  * and try again.
2318                  */
2319
2320                 if (try_lock) {
2321                         /* try_lock must be 0 if i is 0. */
2322                         /*
2323                          * try_lock means we have an inode locked
2324                          * that is in the AIL.
2325                          */
2326                         ASSERT(i != 0);
2327                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
2328                                 attempts++;
2329
2330                                 /*
2331                                  * Unlock all previous guys and try again.
2332                                  * xfs_iunlock will try to push the tail
2333                                  * if the inode is in the AIL.
2334                                  */
2335
2336                                 for(j = i - 1; j >= 0; j--) {
2337
2338                                         /*
2339                                          * Check to see if we've already
2340                                          * unlocked this one.
2341                                          * Not the first one going back,
2342                                          * and the inode ptr is the same.
2343                                          */
2344                                         if ((j != (i - 1)) && ips[j] ==
2345                                                                 ips[j+1])
2346                                                 continue;
2347
2348                                         xfs_iunlock(ips[j], lock_mode);
2349                                 }
2350
2351                                 if ((attempts % 5) == 0) {
2352                                         delay(1); /* Don't just spin the CPU */
2353 #ifdef DEBUG
2354                                         xfs_lock_delays++;
2355 #endif
2356                                 }
2357                                 i = 0;
2358                                 try_lock = 0;
2359                                 goto again;
2360                         }
2361                 } else {
2362                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
2363                 }
2364         }
2365
2366 #ifdef DEBUG
2367         if (attempts) {
2368                 if (attempts < 5) xfs_small_retries++;
2369                 else if (attempts < 100) xfs_middle_retries++;
2370                 else xfs_lots_retries++;
2371         } else {
2372                 xfs_locked_n++;
2373         }
2374 #endif
2375 }
2376
/*
 * REMOVE_DEBUG_TRACE(x): on DEBUG builds, store x in
 * remove_which_error_return (call sites pass __LINE__ to identify the
 * failing exit); on other builds, do nothing.
 *
 * Both forms expand to a single statement that needs the trailing
 * semicolon, so the macro is safe in an unbraced if/else.
 */
#ifdef	DEBUG
#define REMOVE_DEBUG_TRACE(x)	\
	do { remove_which_error_return = (x); } while (0)
int remove_which_error_return = 0;
#else /* ! DEBUG */
#define REMOVE_DEBUG_TRACE(x)	do { } while (0)
#endif	/* ! DEBUG */
2383
2384
2385 /*
2386  * xfs_remove
2387  *
2388  */
2389 STATIC int
2390 xfs_remove(
2391         bhv_desc_t              *dir_bdp,
2392         bhv_vname_t             *dentry,
2393         cred_t                  *credp)
2394 {
2395         bhv_vnode_t             *dir_vp;
2396         char                    *name = VNAME(dentry);
2397         xfs_inode_t             *dp, *ip;
2398         xfs_trans_t             *tp = NULL;
2399         xfs_mount_t             *mp;
2400         int                     error = 0;
2401         xfs_bmap_free_t         free_list;
2402         xfs_fsblock_t           first_block;
2403         int                     cancel_flags;
2404         int                     committed;
2405         int                     dm_di_mode = 0;
2406         int                     link_zero;
2407         uint                    resblks;
2408         int                     namelen;
2409
2410         dir_vp = BHV_TO_VNODE(dir_bdp);
2411         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2412
2413         dp = XFS_BHVTOI(dir_bdp);
2414         mp = dp->i_mount;
2415
2416         if (XFS_FORCED_SHUTDOWN(mp))
2417                 return XFS_ERROR(EIO);
2418
2419         namelen = VNAMELEN(dentry);
2420
2421         if (!xfs_get_dir_entry(dentry, &ip)) {
2422                 dm_di_mode = ip->i_d.di_mode;
2423                 IRELE(ip);
2424         }
2425
2426         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
2427                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp,
2428                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
2429                                         name, NULL, dm_di_mode, 0, 0);
2430                 if (error)
2431                         return error;
2432         }
2433
2434         /* From this point on, return through std_return */
2435         ip = NULL;
2436
2437         /*
2438          * We need to get a reference to ip before we get our log
2439          * reservation. The reason for this is that we cannot call
2440          * xfs_iget for an inode for which we do not have a reference
2441          * once we've acquired a log reservation. This is because the
2442          * inode we are trying to get might be in xfs_inactive going
2443          * for a log reservation. Since we'll have to wait for the
2444          * inactive code to complete before returning from xfs_iget,
2445          * we need to make sure that we don't have log space reserved
2446          * when we call xfs_iget.  Instead we get an unlocked reference
2447          * to the inode before getting our log reservation.
2448          */
2449         error = xfs_get_dir_entry(dentry, &ip);
2450         if (error) {
2451                 REMOVE_DEBUG_TRACE(__LINE__);
2452                 goto std_return;
2453         }
2454
2455         dm_di_mode = ip->i_d.di_mode;
2456
2457         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2458
2459         ITRACE(ip);
2460
2461         error = XFS_QM_DQATTACH(mp, dp, 0);
2462         if (!error && dp != ip)
2463                 error = XFS_QM_DQATTACH(mp, ip, 0);
2464         if (error) {
2465                 REMOVE_DEBUG_TRACE(__LINE__);
2466                 IRELE(ip);
2467                 goto std_return;
2468         }
2469
2470         tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2471         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2472         /*
2473          * We try to get the real space reservation first,
2474          * allowing for directory btree deletion(s) implying
2475          * possible bmap insert(s).  If we can't get the space
2476          * reservation then we use 0 instead, and avoid the bmap
2477          * btree insert(s) in the directory code by, if the bmap
2478          * insert tries to happen, instead trimming the LAST
2479          * block from the directory.
2480          */
2481         resblks = XFS_REMOVE_SPACE_RES(mp);
2482         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
2483                         XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2484         if (error == ENOSPC) {
2485                 resblks = 0;
2486                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
2487                                 XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
2488         }
2489         if (error) {
2490                 ASSERT(error != ENOSPC);
2491                 REMOVE_DEBUG_TRACE(__LINE__);
2492                 xfs_trans_cancel(tp, 0);
2493                 IRELE(ip);
2494                 return error;
2495         }
2496
2497         error = xfs_lock_dir_and_entry(dp, ip);
2498         if (error) {
2499                 REMOVE_DEBUG_TRACE(__LINE__);
2500                 xfs_trans_cancel(tp, cancel_flags);
2501                 IRELE(ip);
2502                 goto std_return;
2503         }
2504
2505         /*
2506          * At this point, we've gotten both the directory and the entry
2507          * inodes locked.
2508          */
2509         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2510         if (dp != ip) {
2511                 /*
2512                  * Increment vnode ref count only in this case since
2513                  * there's an extra vnode reference in the case where
2514                  * dp == ip.
2515                  */
2516                 IHOLD(dp);
2517                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2518         }
2519
2520         /*
2521          * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2522          */
2523         XFS_BMAP_INIT(&free_list, &first_block);
2524         error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino,
2525                                         &first_block, &free_list, 0);
2526         if (error) {
2527                 ASSERT(error != ENOENT);
2528                 REMOVE_DEBUG_TRACE(__LINE__);
2529                 goto error1;
2530         }
2531         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2532
2533         dp->i_gen++;
2534         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2535
2536         error = xfs_droplink(tp, ip);
2537         if (error) {
2538                 REMOVE_DEBUG_TRACE(__LINE__);
2539                 goto error1;
2540         }
2541
2542         /* Determine if this is the last link while
2543          * we are in the transaction.
2544          */
2545         link_zero = (ip)->i_d.di_nlink==0;
2546
2547         /*
2548          * Take an extra ref on the inode so that it doesn't
2549          * go to xfs_inactive() from within the commit.
2550          */
2551         IHOLD(ip);
2552
2553         /*
2554          * If this is a synchronous mount, make sure that the
2555          * remove transaction goes to disk before returning to
2556          * the user.
2557          */
2558         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2559                 xfs_trans_set_sync(tp);
2560         }
2561
2562         error = xfs_bmap_finish(&tp, &free_list, &committed);
2563         if (error) {
2564                 REMOVE_DEBUG_TRACE(__LINE__);
2565                 goto error_rele;
2566         }
2567
2568         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2569         if (error) {
2570                 IRELE(ip);
2571                 goto std_return;
2572         }
2573
2574         /*
2575          * Before we drop our extra reference to the inode, purge it
2576          * from the refcache if it is there.  By waiting until afterwards
2577          * to do the IRELE, we ensure that we won't go inactive in the
2578          * xfs_refcache_purge_ip routine (although that would be OK).
2579          */
2580         xfs_refcache_purge_ip(ip);
2581
2582         /*
2583          * If we are using filestreams, kill the stream association.
2584          * If the file is still open it may get a new one but that
2585          * will get killed on last close in xfs_close() so we don't
2586          * have to worry about that.
2587          */
2588         if (link_zero && xfs_inode_is_filestream(ip))
2589                 xfs_filestream_deassociate(ip);
2590
2591         vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
2592
2593         /*
2594          * Let interposed file systems know about removed links.
2595          */
2596         bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero);
2597
2598         IRELE(ip);
2599
2600 /*      Fall through to std_return with error = 0 */
2601  std_return:
2602         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp,
2603                                                 DM_EVENT_POSTREMOVE)) {
2604                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2605                                 dir_vp, DM_RIGHT_NULL,
2606                                 NULL, DM_RIGHT_NULL,
2607                                 name, NULL, dm_di_mode, error, 0);
2608         }
2609         return error;
2610
2611  error1:
2612         xfs_bmap_cancel(&free_list);
2613         cancel_flags |= XFS_TRANS_ABORT;
2614         xfs_trans_cancel(tp, cancel_flags);
2615         goto std_return;
2616
2617  error_rele:
2618         /*
2619          * In this case make sure to not release the inode until after
2620          * the current transaction is aborted.  Releasing it beforehand
2621          * can cause us to go to xfs_inactive and start a recursive
2622          * transaction which can easily deadlock with the current one.
2623          */
2624         xfs_bmap_cancel(&free_list);
2625         cancel_flags |= XFS_TRANS_ABORT;
2626         xfs_trans_cancel(tp, cancel_flags);
2627
2628         /*
2629          * Before we drop our extra reference to the inode, purge it
2630          * from the refcache if it is there.  By waiting until afterwards
2631          * to do the IRELE, we ensure that we won't go inactive in the
2632          * xfs_refcache_purge_ip routine (although that would be OK).
2633          */
2634         xfs_refcache_purge_ip(ip);
2635
2636         IRELE(ip);
2637
2638         goto std_return;
2639 }
2640
2641
2642 /*
2643  * xfs_link
2644  *
2645  */
2646 STATIC int
2647 xfs_link(
2648         bhv_desc_t              *target_dir_bdp,
2649         bhv_vnode_t             *src_vp,
2650         bhv_vname_t             *dentry,
2651         cred_t                  *credp)
2652 {
2653         xfs_inode_t             *tdp, *sip;
2654         xfs_trans_t             *tp;
2655         xfs_mount_t             *mp;
2656         xfs_inode_t             *ips[2];
2657         int                     error;
2658         xfs_bmap_free_t         free_list;
2659         xfs_fsblock_t           first_block;
2660         int                     cancel_flags;
2661         int                     committed;
2662         bhv_vnode_t             *target_dir_vp;
2663         int                     resblks;
2664         char                    *target_name = VNAME(dentry);
2665         int                     target_namelen;
2666
2667         target_dir_vp = BHV_TO_VNODE(target_dir_bdp);
2668         vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address);
2669         vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
2670
2671         target_namelen = VNAMELEN(dentry);
2672         ASSERT(!VN_ISDIR(src_vp));
2673
2674         sip = xfs_vtoi(src_vp);
2675         tdp = XFS_BHVTOI(target_dir_bdp);
2676         mp = tdp->i_mount;
2677         if (XFS_FORCED_SHUTDOWN(mp))
2678                 return XFS_ERROR(EIO);
2679
2680         if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) {
2681                 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2682                                         target_dir_vp, DM_RIGHT_NULL,
2683                                         src_vp, DM_RIGHT_NULL,
2684                                         target_name, NULL, 0, 0, 0);
2685                 if (error)
2686                         return error;
2687         }
2688
2689         /* Return through std_return after this point. */
2690
2691         error = XFS_QM_DQATTACH(mp, sip, 0);
2692         if (!error && sip != tdp)
2693                 error = XFS_QM_DQATTACH(mp, tdp, 0);
2694         if (error)
2695                 goto std_return;
2696
2697         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2698         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2699         resblks = XFS_LINK_SPACE_RES(mp, target_namelen);
2700         error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2701                         XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2702         if (error == ENOSPC) {
2703                 resblks = 0;
2704                 error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0,
2705                                 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2706         }
2707         if (error) {
2708                 cancel_flags = 0;
2709                 goto error_return;
2710         }
2711
2712         if (sip->i_ino < tdp->i_ino) {
2713                 ips[0] = sip;
2714                 ips[1] = tdp;
2715         } else {
2716                 ips[0] = tdp;
2717                 ips[1] = sip;
2718         }
2719
2720         xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL);
2721
2722         /*
2723          * Increment vnode ref counts since xfs_trans_commit &
2724          * xfs_trans_cancel will both unlock the inodes and
2725          * decrement the associated ref counts.
2726          */
2727         VN_HOLD(src_vp);
2728         VN_HOLD(target_dir_vp);
2729         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2730         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2731
2732         /*
2733          * If the source has too many links, we can't make any more to it.
2734          */
2735         if (sip->i_d.di_nlink >= XFS_MAXLINK) {
2736                 error = XFS_ERROR(EMLINK);
2737                 goto error_return;
2738         }
2739
2740         /*
2741          * If we are using project inheritance, we only allow hard link
2742          * creation in our tree when the project IDs are the same; else
2743          * the tree quota mechanism could be circumvented.
2744          */
2745         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2746                      (tdp->i_d.di_projid != sip->i_d.di_projid))) {
2747                 error = XFS_ERROR(EXDEV);
2748                 goto error_return;
2749         }
2750
2751         if (resblks == 0 &&
2752             (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen)))
2753                 goto error_return;
2754
2755         XFS_BMAP_INIT(&free_list, &first_block);
2756
2757         error = xfs_dir_createname(tp, tdp, target_name, target_namelen,
2758                                    sip->i_ino, &first_block, &free_list,
2759                                    resblks);
2760         if (error)
2761                 goto abort_return;
2762         xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2763         tdp->i_gen++;
2764         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
2765
2766         error = xfs_bumplink(tp, sip);
2767         if (error)
2768                 goto abort_return;
2769
2770         /*
2771          * If this is a synchronous mount, make sure that the
2772          * link transaction goes to disk before returning to
2773          * the user.
2774          */
2775         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2776                 xfs_trans_set_sync(tp);
2777         }
2778
2779         error = xfs_bmap_finish (&tp, &free_list, &committed);
2780         if (error) {
2781                 xfs_bmap_cancel(&free_list);
2782                 goto abort_return;
2783         }
2784
2785         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2786         if (error)
2787                 goto std_return;
2788
2789         /* Fall through to std_return with error = 0. */
2790 std_return:
2791         if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip,
2792                                                 DM_EVENT_POSTLINK)) {
2793                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2794                                 target_dir_vp, DM_RIGHT_NULL,
2795                                 src_vp, DM_RIGHT_NULL,
2796                                 target_name, NULL, 0, error, 0);
2797         }
2798         return error;
2799
2800  abort_return:
2801         cancel_flags |= XFS_TRANS_ABORT;
2802         /* FALLTHROUGH */
2803
2804  error_return:
2805         xfs_trans_cancel(tp, cancel_flags);
2806         goto std_return;
2807 }
2808
2809
2810 /*
2811  * xfs_mkdir
2812  *
2813  */
2814 STATIC int
2815 xfs_mkdir(
2816         bhv_desc_t              *dir_bdp,
2817         bhv_vname_t             *dentry,
2818         bhv_vattr_t             *vap,
2819         bhv_vnode_t             **vpp,
2820         cred_t                  *credp)
2821 {
2822         char                    *dir_name = VNAME(dentry);
2823         xfs_inode_t             *dp;
2824         xfs_inode_t             *cdp;   /* inode of created dir */
2825         bhv_vnode_t             *cvp;   /* vnode of created dir */
2826         xfs_trans_t             *tp;
2827         xfs_mount_t             *mp;
2828         int                     cancel_flags;
2829         int                     error;
2830         int                     committed;
2831         xfs_bmap_free_t         free_list;
2832         xfs_fsblock_t           first_block;
2833         bhv_vnode_t             *dir_vp;
2834         boolean_t               dp_joined_to_trans;
2835         boolean_t               created = B_FALSE;
2836         int                     dm_event_sent = 0;
2837         xfs_prid_t              prid;
2838         struct xfs_dquot        *udqp, *gdqp;
2839         uint                    resblks;
2840         int                     dm_di_mode;
2841         int                     dir_namelen;
2842
2843         dir_vp = BHV_TO_VNODE(dir_bdp);
2844         dp = XFS_BHVTOI(dir_bdp);
2845         mp = dp->i_mount;
2846
2847         if (XFS_FORCED_SHUTDOWN(mp))
2848                 return XFS_ERROR(EIO);
2849
2850         dir_namelen = VNAMELEN(dentry);
2851
2852         tp = NULL;
2853         dp_joined_to_trans = B_FALSE;
2854         dm_di_mode = vap->va_mode;
2855
2856         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
2857                 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2858                                         dir_vp, DM_RIGHT_NULL, NULL,
2859                                         DM_RIGHT_NULL, dir_name, NULL,
2860                                         dm_di_mode, 0, 0);
2861                 if (error)
2862                         return error;
2863                 dm_event_sent = 1;
2864         }
2865
2866         /* Return through std_return after this point. */
2867
2868         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
2869
2870         mp = dp->i_mount;
2871         udqp = gdqp = NULL;
2872         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
2873                 prid = dp->i_d.di_projid;
2874         else if (vap->va_mask & XFS_AT_PROJID)
2875                 prid = (xfs_prid_t)vap->va_projid;
2876         else
2877                 prid = (xfs_prid_t)dfltprid;
2878
2879         /*
2880          * Make sure that we have allocated dquot(s) on disk.
2881          */
2882         error = XFS_QM_DQVOPALLOC(mp, dp,
2883                         current_fsuid(credp), current_fsgid(credp), prid,
2884                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
2885         if (error)
2886                 goto std_return;
2887
2888         tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2889         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2890         resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen);
2891         error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2892                                   XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2893         if (error == ENOSPC) {
2894                 resblks = 0;
2895                 error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0,
2896                                           XFS_TRANS_PERM_LOG_RES,
2897                                           XFS_MKDIR_LOG_COUNT);
2898         }
2899         if (error) {
2900                 cancel_flags = 0;
2901                 dp = NULL;
2902                 goto error_return;
2903         }
2904
2905         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
2906
2907         /*
2908          * Check for directory link count overflow.
2909          */
2910         if (dp->i_d.di_nlink >= XFS_MAXLINK) {
2911                 error = XFS_ERROR(EMLINK);
2912                 goto error_return;
2913         }
2914
2915         /*
2916          * Reserve disk quota and the inode.
2917          */
2918         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
2919         if (error)
2920                 goto error_return;
2921
2922         if (resblks == 0 &&
2923             (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen)))
2924                 goto error_return;
2925         /*
2926          * create the directory inode.
2927          */
2928         error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
2929                         0, credp, prid, resblks > 0,
2930                 &cdp, NULL);
2931         if (error) {
2932                 if (error == ENOSPC)
2933                         goto error_return;
2934                 goto abort_return;
2935         }
2936         ITRACE(cdp);
2937
2938         /*
2939          * Now we add the directory inode to the transaction.
2940          * We waited until now since xfs_dir_ialloc might start
2941          * a new transaction.  Had we joined the transaction
2942          * earlier, the locks might have gotten released.
2943          */
2944         VN_HOLD(dir_vp);
2945         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2946         dp_joined_to_trans = B_TRUE;
2947
2948         XFS_BMAP_INIT(&free_list, &first_block);
2949
2950         error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino,
2951                                    &first_block, &free_list, resblks ?
2952                                    resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2953         if (error) {
2954                 ASSERT(error != ENOSPC);
2955                 goto error1;
2956         }
2957         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2958
2959         /*
2960          * Bump the in memory version number of the parent directory
2961          * so that other processes accessing it will recognize that
2962          * the directory has changed.
2963          */
2964         dp->i_gen++;
2965
2966         error = xfs_dir_init(tp, cdp, dp);
2967         if (error)
2968                 goto error2;
2969
2970         cdp->i_gen = 1;
2971         error = xfs_bumplink(tp, dp);
2972         if (error)
2973                 goto error2;
2974
2975         cvp = XFS_ITOV(cdp);
2976
2977         created = B_TRUE;
2978
2979         *vpp = cvp;
2980         IHOLD(cdp);
2981
2982         /*
2983          * Attach the dquots to the new inode and modify the icount incore.
2984          */
2985         XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp);
2986
2987         /*
2988          * If this is a synchronous mount, make sure that the
2989          * mkdir transaction goes to disk before returning to
2990          * the user.
2991          */
2992         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2993                 xfs_trans_set_sync(tp);
2994         }
2995
2996         error = xfs_bmap_finish(&tp, &free_list, &committed);
2997         if (error) {
2998                 IRELE(cdp);
2999                 goto error2;
3000         }
3001
3002         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3003         XFS_QM_DQRELE(mp, udqp);
3004         XFS_QM_DQRELE(mp, gdqp);
3005         if (error) {
3006                 IRELE(cdp);
3007         }
3008
3009         /* Fall through to std_return with error = 0 or errno from
3010          * xfs_trans_commit. */
3011
3012 std_return:
3013         if ( (created || (error != 0 && dm_event_sent != 0)) &&
3014                         DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3015                                                 DM_EVENT_POSTCREATE)) {
3016                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
3017                                         dir_vp, DM_RIGHT_NULL,
3018                                         created ? XFS_ITOV(cdp):NULL,
3019                                         DM_RIGHT_NULL,
3020                                         dir_name, NULL,
3021                                         dm_di_mode, error, 0);
3022         }
3023         return error;
3024
3025  error2:
3026  error1:
3027         xfs_bmap_cancel(&free_list);
3028  abort_return:
3029         cancel_flags |= XFS_TRANS_ABORT;
3030  error_return:
3031         xfs_trans_cancel(tp, cancel_flags);
3032         XFS_QM_DQRELE(mp, udqp);
3033         XFS_QM_DQRELE(mp, gdqp);
3034
3035         if (!dp_joined_to_trans && (dp != NULL)) {
3036                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3037         }
3038
3039         goto std_return;
3040 }
3041
3042
3043 /*
3044  * xfs_rmdir
3045  *
3046  */
3047 STATIC int
3048 xfs_rmdir(
3049         bhv_desc_t              *dir_bdp,
3050         bhv_vname_t             *dentry,
3051         cred_t                  *credp)
3052 {
3053         char                    *name = VNAME(dentry);
3054         xfs_inode_t             *dp;
3055         xfs_inode_t             *cdp;   /* child directory */
3056         xfs_trans_t             *tp;
3057         xfs_mount_t             *mp;
3058         int                     error;
3059         xfs_bmap_free_t         free_list;
3060         xfs_fsblock_t           first_block;
3061         int                     cancel_flags;
3062         int                     committed;
3063         bhv_vnode_t             *dir_vp;
3064         int                     dm_di_mode = S_IFDIR;
3065         int                     last_cdp_link;
3066         int                     namelen;
3067         uint                    resblks;
3068
3069         dir_vp = BHV_TO_VNODE(dir_bdp);
3070         dp = XFS_BHVTOI(dir_bdp);
3071         mp = dp->i_mount;
3072
3073         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3074
3075         if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount))
3076                 return XFS_ERROR(EIO);
3077         namelen = VNAMELEN(dentry);
3078
3079         if (!xfs_get_dir_entry(dentry, &cdp)) {
3080                 dm_di_mode = cdp->i_d.di_mode;
3081                 IRELE(cdp);
3082         }
3083
3084         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) {
3085                 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
3086                                         dir_vp, DM_RIGHT_NULL,
3087                                         NULL, DM_RIGHT_NULL,
3088                                         name, NULL, dm_di_mode, 0, 0);
3089                 if (error)
3090                         return XFS_ERROR(error);
3091         }
3092
3093         /* Return through std_return after this point. */
3094
3095         cdp = NULL;
3096
3097         /*
3098          * We need to get a reference to cdp before we get our log
3099          * reservation.  The reason for this is that we cannot call
3100          * xfs_iget for an inode for which we do not have a reference
3101          * once we've acquired a log reservation.  This is because the
3102          * inode we are trying to get might be in xfs_inactive going
3103          * for a log reservation.  Since we'll have to wait for the
3104          * inactive code to complete before returning from xfs_iget,
3105          * we need to make sure that we don't have log space reserved
3106          * when we call xfs_iget.  Instead we get an unlocked reference
3107          * to the inode before getting our log reservation.
3108          */
3109         error = xfs_get_dir_entry(dentry, &cdp);
3110         if (error) {
3111                 REMOVE_DEBUG_TRACE(__LINE__);
3112                 goto std_return;
3113         }
3114         mp = dp->i_mount;
3115         dm_di_mode = cdp->i_d.di_mode;
3116
3117         /*
3118          * Get the dquots for the inodes.
3119          */
3120         error = XFS_QM_DQATTACH(mp, dp, 0);
3121         if (!error && dp != cdp)
3122                 error = XFS_QM_DQATTACH(mp, cdp, 0);
3123         if (error) {
3124                 IRELE(cdp);
3125                 REMOVE_DEBUG_TRACE(__LINE__);
3126                 goto std_return;
3127         }
3128
3129         tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
3130         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3131         /*
3132          * We try to get the real space reservation first,
3133          * allowing for directory btree deletion(s) implying
3134          * possible bmap insert(s).  If we can't get the space
3135          * reservation then we use 0 instead, and avoid the bmap
3136          * btree insert(s) in the directory code by, if the bmap
3137          * insert tries to happen, instead trimming the LAST
3138          * block from the directory.
3139          */
3140         resblks = XFS_REMOVE_SPACE_RES(mp);
3141         error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
3142                         XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3143         if (error == ENOSPC) {
3144                 resblks = 0;
3145                 error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
3146                                 XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
3147         }
3148         if (error) {
3149                 ASSERT(error != ENOSPC);
3150                 cancel_flags = 0;
3151                 IRELE(cdp);
3152                 goto error_return;
3153         }
3154         XFS_BMAP_INIT(&free_list, &first_block);
3155
3156         /*
3157          * Now lock the child directory inode and the parent directory
3158          * inode in the proper order.  This will take care of validating
3159          * that the directory entry for the child directory inode has
3160          * not changed while we were obtaining a log reservation.
3161          */
3162         error = xfs_lock_dir_and_entry(dp, cdp);
3163         if (error) {
3164                 xfs_trans_cancel(tp, cancel_flags);
3165                 IRELE(cdp);
3166                 goto std_return;
3167         }
3168
3169         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3170         if (dp != cdp) {
3171                 /*
3172                  * Only increment the parent directory vnode count if
3173                  * we didn't bump it in looking up cdp.  The only time
3174                  * we don't bump it is when we're looking up ".".
3175                  */
3176                 VN_HOLD(dir_vp);
3177         }
3178
3179         ITRACE(cdp);
3180         xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
3181
3182         ASSERT(cdp->i_d.di_nlink >= 2);
3183         if (cdp->i_d.di_nlink != 2) {
3184                 error = XFS_ERROR(ENOTEMPTY);
3185                 goto error_return;
3186         }
3187         if (!xfs_dir_isempty(cdp)) {
3188                 error = XFS_ERROR(ENOTEMPTY);
3189                 goto error_return;
3190         }
3191
3192         error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino,
3193                                         &first_block, &free_list, resblks);
3194         if (error)
3195                 goto error1;
3196
3197         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3198
3199         /*
3200          * Bump the in memory generation count on the parent
3201          * directory so that other can know that it has changed.
3202          */
3203         dp->i_gen++;
3204
3205         /*
3206          * Drop the link from cdp's "..".
3207          */
3208         error = xfs_droplink(tp, dp);
3209         if (error) {
3210                 goto error1;
3211         }
3212
3213         /*
3214          * Drop the link from dp to cdp.
3215          */
3216         error = xfs_droplink(tp, cdp);
3217         if (error) {
3218                 goto error1;
3219         }
3220
3221         /*
3222          * Drop the "." link from cdp to self.
3223          */
3224         error = xfs_droplink(tp, cdp);
3225         if (error) {
3226                 goto error1;
3227         }
3228
3229         /* Determine these before committing transaction */
3230         last_cdp_link = (cdp)->i_d.di_nlink==0;
3231
3232         /*
3233          * Take an extra ref on the child vnode so that it
3234          * does not go to xfs_inactive() from within the commit.
3235          */
3236         IHOLD(cdp);
3237
3238         /*
3239          * If this is a synchronous mount, make sure that the
3240          * rmdir transaction goes to disk before returning to
3241          * the user.
3242          */
3243         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3244                 xfs_trans_set_sync(tp);
3245         }
3246
3247         error = xfs_bmap_finish (&tp, &free_list, &committed);
3248         if (error) {
3249                 xfs_bmap_cancel(&free_list);
3250                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
3251                                  XFS_TRANS_ABORT));
3252                 IRELE(cdp);
3253                 goto std_return;
3254         }
3255
3256         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3257         if (error) {
3258                 IRELE(cdp);
3259                 goto std_return;
3260         }
3261
3262
3263         /*
3264          * Let interposed file systems know about removed links.
3265          */
3266         bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link);
3267
3268         IRELE(cdp);
3269
3270         /* Fall through to std_return with error = 0 or the errno
3271          * from xfs_trans_commit. */
3272  std_return:
3273         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) {
3274                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3275                                         dir_vp, DM_RIGHT_NULL,
3276                                         NULL, DM_RIGHT_NULL,
3277                                         name, NULL, dm_di_mode,
3278                                         error, 0);
3279         }
3280         return error;
3281
3282  error1:
3283         xfs_bmap_cancel(&free_list);
3284         cancel_flags |= XFS_TRANS_ABORT;
3285         /* FALLTHROUGH */
3286
3287  error_return:
3288         xfs_trans_cancel(tp, cancel_flags);
3289         goto std_return;
3290 }
3291
3292
3293 /*
3294  * Read dp's entries starting at uiop->uio_offset and translate them into
3295  * bufsize bytes worth of struct dirents starting at bufbase.
3296  */
3297 STATIC int
3298 xfs_readdir(
3299         bhv_desc_t      *dir_bdp,
3300         uio_t           *uiop,
3301         cred_t          *credp,
3302         int             *eofp)
3303 {
3304         xfs_inode_t     *dp;
3305         xfs_trans_t     *tp = NULL;
3306         int             error = 0;
3307         uint            lock_mode;
3308
3309         vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__,
3310                                                (inst_t *)__return_address);
3311         dp = XFS_BHVTOI(dir_bdp);
3312
3313         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
3314                 return XFS_ERROR(EIO);
3315
3316         lock_mode = xfs_ilock_map_shared(dp);
3317         error = xfs_dir_getdents(tp, dp, uiop, eofp);
3318         xfs_iunlock_map_shared(dp, lock_mode);
3319         return error;
3320 }
3321
3322
/*
 * xfs_symlink
 *
 * Create a symbolic link named VNAME(dentry) in the directory behind
 * dir_bdp whose contents are target_path.  On success the vnode of the
 * new link is stored in *vpp (it is set to NULL on entry) and 0 is
 * returned; otherwise an errno value is returned.
 *
 * Outline of what the code below does:
 *   - validates the length of target_path and of each of its components;
 *   - sends the DM_EVENT_SYMLINK event if it is enabled for the directory;
 *   - obtains dquots and a transaction reservation, locks the directory;
 *   - allocates the inode and stores the target either inline in the
 *     data fork or in blocks mapped with xfs_bmapi();
 *   - adds the directory entry and commits the transaction;
 *   - sends DM_EVENT_POSTSYMLINK (if enabled) on both success and failure
 *     once the "Return through std_return" point has been passed.
 *
 * vap supplies the mode bits and, optionally, the project id.  credp is
 * used for the dquot lookup and the inode allocation.
 */
3323 STATIC int
3324 xfs_symlink(
3325         bhv_desc_t              *dir_bdp,
3326         bhv_vname_t             *dentry,
3327         bhv_vattr_t             *vap,
3328         char                    *target_path,
3329         bhv_vnode_t             **vpp,
3330         cred_t                  *credp)
3331 {
3332         xfs_trans_t             *tp;
3333         xfs_mount_t             *mp;
3334         xfs_inode_t             *dp;
3335         xfs_inode_t             *ip;
3336         int                     error;
3337         int                     pathlen;
3338         xfs_bmap_free_t         free_list;
3339         xfs_fsblock_t           first_block;
3340         boolean_t               dp_joined_to_trans;
3341         bhv_vnode_t             *dir_vp;
3342         uint                    cancel_flags;
3343         int                     committed;
3344         xfs_fileoff_t           first_fsb;
3345         xfs_filblks_t           fs_blocks;
3346         int                     nmaps;
3347         xfs_bmbt_irec_t         mval[SYMLINK_MAPS];
3348         xfs_daddr_t             d;
3349         char                    *cur_chunk;
3350         int                     byte_cnt;
3351         int                     n;
3352         xfs_buf_t               *bp;
3353         xfs_prid_t              prid;
3354         struct xfs_dquot        *udqp, *gdqp;
3355         uint                    resblks;
3356         char                    *link_name = VNAME(dentry);
3357         int                     link_namelen;
3358
3359         *vpp = NULL;
3360         dir_vp = BHV_TO_VNODE(dir_bdp);
3361         dp = XFS_BHVTOI(dir_bdp);
3362         dp_joined_to_trans = B_FALSE;
3363         error = 0;
3364         ip = NULL;
3365         tp = NULL;
3366
3367         vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address);
3368
3369         mp = dp->i_mount;
3370
3371         if (XFS_FORCED_SHUTDOWN(mp))
3372                 return XFS_ERROR(EIO);
3373
3374         link_namelen = VNAMELEN(dentry);
3375
3376         /*
3377          * Check component lengths of the target path name.
3378          */
3379         pathlen = strlen(target_path);
3380         if (pathlen >= MAXPATHLEN)      /* total string too long */
3381                 return XFS_ERROR(ENAMETOOLONG);
3382         if (pathlen >= MAXNAMELEN) {    /* is any component too long? */
3383                 int len, total;
3384                 char *path;
3385
3386                 for (total = 0, path = target_path; total < pathlen;) {
3387                         /*
3388                          * Skip any slashes.
3389                          */
3390                         while(*path == '/') {
3391                                 total++;
3392                                 path++;
3393                         }
3394
3395                         /*
3396                          * Count up to the next slash or end of path.
3397                          * Error out if the component is bigger than MAXNAMELEN.
3398                          */
3399                         for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3400                                 if (++len >= MAXNAMELEN) {
                                             /*
                                              * NOTE(review): unlike the check on the
                                              * total length above, this errno is not
                                              * wrapped in XFS_ERROR().
                                              */
3401                                         error = ENAMETOOLONG;
3402                                         return error;
3403                                 }
3404                         }
3405                 }
3406         }
3407
3408         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) {
3409                 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp,
3410                                         DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3411                                         link_name, target_path, 0, 0, 0);
3412                 if (error)
3413                         return error;
3414         }
3415
3416         /* Return through std_return after this point. */
3417
             /*
              * Pick the project id: inherited from the directory when it has
              * XFS_DIFLAG_PROJINHERIT set, else taken from vap if supplied,
              * else the default dfltprid.
              */
3418         udqp = gdqp = NULL;
3419         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
3420                 prid = dp->i_d.di_projid;
3421         else if (vap->va_mask & XFS_AT_PROJID)
3422                 prid = (xfs_prid_t)vap->va_projid;
3423         else
3424                 prid = (xfs_prid_t)dfltprid;
3425
3426         /*
3427          * Make sure that we have allocated dquot(s) on disk.
3428          */
3429         error = XFS_QM_DQVOPALLOC(mp, dp,
3430                         current_fsuid(credp), current_fsgid(credp), prid,
3431                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
3432         if (error)
3433                 goto std_return;
3434
3435         tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
3436         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
3437         /*
3438          * The symlink will fit into the inode data fork?
3439          * There can't be any attributes so we get the whole variable part.
3440          */
3441         if (pathlen <= XFS_LITINO(mp))
3442                 fs_blocks = 0;
3443         else
3444                 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3445         resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks);
3446         error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3447                         XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
             /*
              * Retry without a block reservation only when the target is
              * expected to fit inline (fs_blocks == 0).
              */
3448         if (error == ENOSPC && fs_blocks == 0) {
3449                 resblks = 0;
3450                 error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0,
3451                                 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3452         }
3453         if (error) {
                     /*
                      * The directory has not been locked yet; clearing dp makes
                      * error_return skip its xfs_iunlock().  std_return does
                      * not use dp, it re-derives the inode from dir_bdp.
                      */
3454                 cancel_flags = 0;
3455                 dp = NULL;
3456                 goto error_return;
3457         }
3458
3459         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
3460
3461         /*
3462          * Check whether the directory allows new symlinks or not.
3463          */
3464         if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
3465                 error = XFS_ERROR(EPERM);
3466                 goto error_return;
3467         }
3468
3469         /*
3470          * Reserve disk quota : blocks and inode.
3471          */
3472         error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0);
3473         if (error)
3474                 goto error_return;
3475
3476         /*
3477          * Check for ability to enter directory entry, if no space reserved.
3478          */
3479         if (resblks == 0 &&
3480             (error = xfs_dir_canenter(tp, dp, link_name, link_namelen)))
3481                 goto error_return;
3482         /*
3483          * Initialize the bmap freelist prior to calling either
3484          * bmapi or the directory create code.
3485          */
3486         XFS_BMAP_INIT(&free_list, &first_block);
3487
3488         /*
3489          * Allocate an inode for the symlink.
3490          */
3491         error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT),
3492                                1, 0, credp, prid, resblks > 0, &ip, NULL);
3493         if (error) {
                     /*
                      * ENOSPC takes the plain cancel path; any other failure
                      * also cancels the free list and aborts the transaction.
                      */
3494                 if (error == ENOSPC)
3495                         goto error_return;
3496                 goto error1;
3497         }
3498         ITRACE(ip);
3499
3500         VN_HOLD(dir_vp);
3501         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3502         dp_joined_to_trans = B_TRUE;
3503
3504         /*
3505          * Also attach the dquot(s) to it, if applicable.
3506          */
3507         XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp);
3508
3509         if (resblks)
3510                 resblks -= XFS_IALLOC_SPACE_RES(mp);
3511         /*
3512          * If the symlink will fit into the inode, write it inline.
3513          */
3514         if (pathlen <= XFS_IFORK_DSIZE(ip)) {
3515                 xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK);
3516                 memcpy(ip->i_df.if_u1.if_data, target_path, pathlen);
3517                 ip->i_d.di_size = pathlen;
3518
3519                 /*
3520                  * The inode was initially created in extent format.
3521                  */
3522                 ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT);
3523                 ip->i_df.if_flags |= XFS_IFINLINE;
3524
3525                 ip->i_d.di_format = XFS_DINODE_FMT_LOCAL;
3526                 xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
3527
3528         } else {
3529                 first_fsb = 0;
3530                 nmaps = SYMLINK_MAPS;
3531
3532                 error = xfs_bmapi(tp, ip, first_fsb, fs_blocks,
3533                                   XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
3534                                   &first_block, resblks, mval, &nmaps,
3535                                   &free_list, NULL);
3536                 if (error) {
3537                         goto error1;
3538                 }
3539
3540                 if (resblks)
3541                         resblks -= fs_blocks;
3542                 ip->i_d.di_size = pathlen;
3543                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3544
                     /*
                      * Copy the target into each returned mapping in turn.
                      * pathlen is consumed as the count of bytes still to be
                      * written; di_size has already been recorded above.
                      */
3545                 cur_chunk = target_path;
3546                 for (n = 0; n < nmaps; n++) {
3547                         d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
3548                         byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
3549                         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
3550                                                BTOBB(byte_cnt), 0);
3551                         ASSERT(bp && !XFS_BUF_GETERROR(bp));
3552                         if (pathlen < byte_cnt) {
3553                                 byte_cnt = pathlen;
3554                         }
3555                         pathlen -= byte_cnt;
3556
3557                         memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt);
3558                         cur_chunk += byte_cnt;
3559
3560                         xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1);
3561                 }
3562         }
3563
3564         /*
3565          * Create the directory entry for the symlink.
3566          */
3567         error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino,
3568                                    &first_block, &free_list, resblks);
3569         if (error)
3570                 goto error1;
3571         xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3572         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
3573
3574         /*
3575          * Bump the in memory version number of the parent directory
3576          * so that other processes accessing it will recognize that
3577          * the directory has changed.
3578          */
3579         dp->i_gen++;
3580
3581         /*
3582          * If this is a synchronous mount, make sure that the
3583          * symlink transaction goes to disk before returning to
3584          * the user.
3585          */
3586         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
3587                 xfs_trans_set_sync(tp);
3588         }
3589
3590         /*
3591          * xfs_trans_commit normally decrements the vnode ref count
3592          * when it unlocks the inode. Since we want to return the
3593          * vnode to the caller, we bump the vnode ref count now.
3594          */
3595         IHOLD(ip);
3596
3597         error = xfs_bmap_finish(&tp, &free_list, &committed);
3598         if (error) {
3599                 goto error2;
3600         }
             /*
              * NOTE(review): if the commit itself fails, control reaches
              * std_return without an IRELE(ip) for the hold taken above --
              * confirm whether that reference is dropped elsewhere.
              */
3601         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3602         XFS_QM_DQRELE(mp, udqp);
3603         XFS_QM_DQRELE(mp, gdqp);
3604
3605         /* Fall through to std_return with error = 0 or errno from
3606          * xfs_trans_commit     */
3607 std_return:
3608         if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp),
3609                              DM_EVENT_POSTSYMLINK)) {
3610                 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3611                                         dir_vp, DM_RIGHT_NULL,
3612                                         error ? NULL : XFS_ITOV(ip),
3613                                         DM_RIGHT_NULL, link_name, target_path,
3614                                         0, error, 0);
3615         }
3616
3617         if (!error) {
3618                 bhv_vnode_t *vp;
3619
3620                 ASSERT(ip);
3621                 vp = XFS_ITOV(ip);
3622                 *vpp = vp;
3623         }
3624         return error;
3625
             /*
              * Error unwinding, most specific label first:
              *   error2       - drop the extra inode reference taken by IHOLD();
              *   error1       - cancel the bmap free list and mark the
              *                  transaction cancel as an abort;
              *   error_return - cancel the transaction, release the dquots and
              *                  unlock the directory if it is still ours to
              *                  unlock (not joined to the transaction and dp
              *                  not cleared).
              */
3626  error2:
3627         IRELE(ip);
3628  error1:
3629         xfs_bmap_cancel(&free_list);
3630         cancel_flags |= XFS_TRANS_ABORT;
3631  error_return:
3632         xfs_trans_cancel(tp, cancel_flags);
3633         XFS_QM_DQRELE(mp, udqp);
3634         XFS_QM_DQRELE(mp, gdqp);
3635
3636         if (!dp_joined_to_trans && (dp != NULL)) {
3637                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
3638         }
3639
3640         goto std_return;
3641 }
3642
3643
3644 /*
3645  * xfs_fid2
3646  *
3647  * A fid routine that takes a pointer to a previously allocated
3648  * fid structure (like xfs_fast_fid) but uses a 64 bit inode number.
3649  */
3650 STATIC int
3651 xfs_fid2(
3652         bhv_desc_t      *bdp,
3653         fid_t           *fidp)
3654 {
3655         xfs_inode_t     *ip;
3656         xfs_fid2_t      *xfid;
3657
3658         vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__,
3659                                        (inst_t *)__return_address);
3660         ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t));
3661
3662         xfid = (xfs_fid2_t *)fidp;
3663         ip = XFS_BHVTOI(bdp);
3664         xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len);
3665         xfid->fid_pad = 0;
3666         /*
3667          * use memcpy because the inode is a long long and there's no
3668          * assurance that xfid->fid_ino is properly aligned.
3669          */
3670         memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino));
3671         xfid->fid_gen = ip->i_d.di_gen;
3672
3673         return 0;
3674 }
3675
3676
3677 /*
3678  * xfs_rwlock
3679  */
3680 int
3681 xfs_rwlock(
3682         bhv_desc_t      *bdp,
3683         bhv_vrwlock_t   locktype)
3684 {
3685         xfs_inode_t     *ip;
3686         bhv_vnode_t     *vp;
3687
3688         vp = BHV_TO_VNODE(bdp);
3689         if (VN_ISDIR(vp))
3690                 return 1;
3691         ip = XFS_BHVTOI(bdp);
3692         if (locktype == VRWLOCK_WRITE) {
3693                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3694         } else if (locktype == VRWLOCK_TRY_READ) {
3695                 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3696         } else if (locktype == VRWLOCK_TRY_WRITE) {
3697                 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3698         } else {
3699                 ASSERT((locktype == VRWLOCK_READ) ||
3700                        (locktype == VRWLOCK_WRITE_DIRECT));
3701                 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3702         }
3703
3704         return 1;
3705 }
3706
3707
3708 /*
3709  * xfs_rwunlock
3710  */
3711 void
3712 xfs_rwunlock(
3713         bhv_desc_t      *bdp,
3714         bhv_vrwlock_t   locktype)
3715 {
3716         xfs_inode_t     *ip;
3717         bhv_vnode_t     *vp;
3718
3719         vp = BHV_TO_VNODE(bdp);
3720         if (VN_ISDIR(vp))
3721                 return;
3722         ip = XFS_BHVTOI(bdp);
3723         if (locktype == VRWLOCK_WRITE) {
3724                 /*
3725                  * In the write case, we may have added a new entry to
3726                  * the reference cache.  This might store a pointer to
3727                  * an inode to be released in this inode.  If it is there,
3728                  * clear the pointer and release the inode after unlocking
3729                  * this one.
3730                  */
3731                 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3732         } else {
3733                 ASSERT((locktype == VRWLOCK_READ) ||
3734                        (locktype == VRWLOCK_WRITE_DIRECT));
3735                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3736         }
3737         return;
3738 }
3739
3740 STATIC int
3741 xfs_inode_flush(
3742         bhv_desc_t      *bdp,
3743         int             flags)
3744 {
3745         xfs_inode_t     *ip;
3746         xfs_mount_t     *mp;
3747         xfs_inode_log_item_t *iip;
3748         int             error = 0;
3749
3750         ip = XFS_BHVTOI(bdp);
3751         mp = ip->i_mount;
3752         iip = ip->i_itemp;
3753
3754         if (XFS_FORCED_SHUTDOWN(mp))
3755                 return XFS_ERROR(EIO);
3756
3757         /*
3758          * Bypass inodes which have already been cleaned by
3759          * the inode flush clustering code inside xfs_iflush
3760          */
3761         if ((ip->i_update_core == 0) &&
3762             ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3763                 return 0;
3764
3765         if (flags & FLUSH_LOG) {
3766                 if (iip && iip->ili_last_lsn) {
3767                         xlog_t          *log = mp->m_log;
3768                         xfs_lsn_t       sync_lsn;
3769                         int             s, log_flags = XFS_LOG_FORCE;
3770
3771                         s = GRANT_LOCK(log);
3772                         sync_lsn = log->l_last_sync_lsn;
3773                         GRANT_UNLOCK(log, s);
3774
3775                         if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
3776                                 if (flags & FLUSH_SYNC)
3777                                         log_flags |= XFS_LOG_SYNC;
3778                                 error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3779                                 if (error)
3780                                         return error;
3781                         }
3782
3783                         if (ip->i_update_core == 0)
3784                                 return 0;
3785                 }
3786         }
3787
3788         /*
3789          * We make this non-blocking if the inode is contended,
3790          * return EAGAIN to indicate to the caller that they
3791          * did not succeed. This prevents the flush path from
3792          * blocking on inodes inside another operation right
3793          * now, they get caught later by xfs_sync.
3794          */
3795         if (flags & FLUSH_INODE) {
3796                 int     flush_flags;
3797
3798                 if (flags & FLUSH_SYNC) {
3799                         xfs_ilock(ip, XFS_ILOCK_SHARED);
3800                         xfs_iflock(ip);
3801                 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3802                         if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3803                                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3804                                 return EAGAIN;
3805                         }
3806                 } else {
3807                         return EAGAIN;
3808                 }
3809
3810                 if (flags & FLUSH_SYNC)
3811                         flush_flags = XFS_IFLUSH_SYNC;
3812                 else
3813                         flush_flags = XFS_IFLUSH_ASYNC;
3814
3815                 error = xfs_iflush(ip, flush_flags);
3816                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3817         }
3818
3819         return error;
3820 }
3821
3822 int
3823 xfs_set_dmattrs (
3824         bhv_desc_t      *bdp,
3825         u_int           evmask,
3826         u_int16_t       state,
3827         cred_t          *credp)
3828 {
3829         xfs_inode_t     *ip;
3830         xfs_trans_t     *tp;
3831         xfs_mount_t     *mp;
3832         int             error;
3833
3834         if (!capable(CAP_SYS_ADMIN))
3835                 return XFS_ERROR(EPERM);
3836
3837         ip = XFS_BHVTOI(bdp);
3838         mp = ip->i_mount;
3839
3840         if (XFS_FORCED_SHUTDOWN(mp))
3841                 return XFS_ERROR(EIO);
3842
3843         tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
3844         error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0);
3845         if (error) {
3846                 xfs_trans_cancel(tp, 0);
3847                 return error;
3848         }
3849         xfs_ilock(ip, XFS_ILOCK_EXCL);
3850         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
3851
3852         ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask;
3853         ip->i_iocore.io_dmstate  = ip->i_d.di_dmstate  = state;
3854
3855         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
3856         IHOLD(ip);
3857         error = xfs_trans_commit(tp, 0);
3858
3859         return error;
3860 }
3861
3862 STATIC int
3863 xfs_reclaim(
3864         bhv_desc_t      *bdp)
3865 {
3866         xfs_inode_t     *ip;
3867         bhv_vnode_t     *vp;
3868
3869         vp = BHV_TO_VNODE(bdp);
3870         ip = XFS_BHVTOI(bdp);
3871
3872         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
3873
3874         ASSERT(!VN_MAPPED(vp));
3875
3876         /* bad inode, get out here ASAP */
3877         if (VN_BAD(vp)) {
3878                 xfs_ireclaim(ip);
3879                 return 0;
3880         }
3881
3882         vn_iowait(vp);
3883
3884         ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
3885
3886         /*
3887          * Make sure the atime in the XFS inode is correct before freeing the
3888          * Linux inode.
3889          */
3890         xfs_synchronize_atime(ip);
3891
3892         /*
3893          * If we have nothing to flush with this inode then complete the
3894          * teardown now, otherwise break the link between the xfs inode and the
3895          * linux inode and clean up the xfs inode later. This avoids flushing
3896          * the inode to disk during the delete operation itself.
3897          *
3898          * When breaking the link, we need to set the XFS_IRECLAIMABLE flag
3899          * first to ensure that xfs_iunpin() will never see an xfs inode
3900          * that has a linux inode being reclaimed. Synchronisation is provided
3901          * by the i_flags_lock.
3902          */
3903         if (!ip->i_update_core && (ip->i_itemp == NULL)) {
3904                 xfs_ilock(ip, XFS_ILOCK_EXCL);
3905                 xfs_iflock(ip);
3906                 return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
3907         } else {
3908                 xfs_mount_t     *mp = ip->i_mount;
3909
3910                 /* Protect sync and unpin from us */
3911                 XFS_MOUNT_ILOCK(mp);
3912                 spin_lock(&ip->i_flags_lock);
3913                 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
3914                 vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip));
3915                 spin_unlock(&ip->i_flags_lock);
3916                 list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
3917                 XFS_MOUNT_IUNLOCK(mp);
3918         }
3919         return 0;
3920 }
3921
/*
 * xfs_finish_reclaim
 *
 * Complete the teardown of an xfs inode: mark it XFS_IRECLAIM, flush it
 * if it is still dirty, and hand it to xfs_ireclaim().
 *
 *   ip         inode to reclaim
 *   locked     non-zero when the caller already holds the inode lock
 *              (XFS_ILOCK_EXCL) and the flush lock; when zero both are
 *              taken here before flushing
 *   sync_mode  flush mode passed through to xfs_iflush()
 *
 * Returns 1 without reclaiming when the inode is already marked
 * XFS_IRECLAIM, or when it is neither XFS_IRECLAIMABLE nor attached to a
 * vnode; in that case locks passed in by the caller are released.
 * Returns 0 once xfs_ireclaim() has been called.
 */
3922 int
3923 xfs_finish_reclaim(
3924         xfs_inode_t     *ip,
3925         int             locked,
3926         int             sync_mode)
3927 {
3928         xfs_ihash_t     *ih = ip->i_hash;
3929         bhv_vnode_t     *vp = XFS_ITOV_NULL(ip);
3930         int             error;
3931
             /*
              * NOTE(review): this shortcut does not release the locks that
              * the other exits drop when `locked` is set -- confirm callers
              * never pass locked != 0 together with a bad vnode.
              */
3932         if (vp && VN_BAD(vp))
3933                 goto reclaim;
3934
3935         /* The hash lock here protects a thread in xfs_iget_core from
3936          * racing with us on linking the inode back with a vnode.
3937          * Once we have the XFS_IRECLAIM flag set it will not touch
3938          * us.
3939          */
3940         write_lock(&ih->ih_lock);
3941         spin_lock(&ip->i_flags_lock);
3942         if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
3943             (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
3944                 spin_unlock(&ip->i_flags_lock);
3945                 write_unlock(&ih->ih_lock);
3946                 if (locked) {
3947                         xfs_ifunlock(ip);
3948                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
3949                 }
3950                 return 1;
3951         }
3952         __xfs_iflags_set(ip, XFS_IRECLAIM);
3953         spin_unlock(&ip->i_flags_lock);
3954         write_unlock(&ih->ih_lock);
3955
3956         /*
3957          * If the inode is still dirty, then flush it out.  If the inode
3958          * is not in the AIL, then it will be OK to flush it delwri as
3959          * long as xfs_iflush() does not keep any references to the inode.
3960          * We leave that decision up to xfs_iflush() since it has the
3961          * knowledge of whether it's OK to simply do a delwri flush of
3962          * the inode or whether we need to wait until the inode is
3963          * pulled from the AIL.
3964          * We get the flush lock regardless, though, just to make sure
3965          * we don't free it while it is being flushed.
3966          */
3967         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3968                 if (!locked) {
3969                         xfs_ilock(ip, XFS_ILOCK_EXCL);
3970                         xfs_iflock(ip);
3971                 }
3972
3973                 if (ip->i_update_core ||
3974                     ((ip->i_itemp != NULL) &&
3975                      (ip->i_itemp->ili_format.ilf_fields != 0))) {
3976                         error = xfs_iflush(ip, sync_mode);
3977                         /*
3978                          * If we hit an error, typically because of filesystem
3979                          * shutdown, we don't need to let vn_reclaim to know
3980                          * because we're gonna reclaim the inode anyway.
3981                          */
3982                         if (error) {
                                     /* Only the inode lock is dropped on this path. */
3983                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3984                                 goto reclaim;
3985                         }
3986                         xfs_iflock(ip); /* synchronize with xfs_iflush_done */
3987                 }
3988
3989                 ASSERT(ip->i_update_core == 0);
3990                 ASSERT(ip->i_itemp == NULL ||
3991                        ip->i_itemp->ili_format.ilf_fields == 0);
3992                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3993         } else if (locked) {
3994                 /*
3995                  * We are not interested in doing an iflush if we're
3996                  * in the process of shutting down the filesystem forcibly.
3997                  * So, just reclaim the inode.
3998                  */
3999                 xfs_ifunlock(ip);
4000                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4001         }
4002
4003  reclaim:
4004         xfs_ireclaim(ip);
4005         return 0;
4006 }
4007
4008 int
4009 xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
4010 {
4011         int             purged;
4012         xfs_inode_t     *ip, *n;
4013         int             done = 0;
4014
4015         while (!done) {
4016                 purged = 0;
4017                 XFS_MOUNT_ILOCK(mp);
4018                 list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
4019                         if (noblock) {
4020                                 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
4021                                         continue;
4022                                 if (xfs_ipincount(ip) ||
4023                                     !xfs_iflock_nowait(ip)) {
4024                                         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4025                                         continue;
4026                                 }
4027                         }
4028                         XFS_MOUNT_IUNLOCK(mp);
4029                         if (xfs_finish_reclaim(ip, noblock,
4030                                         XFS_IFLUSH_DELWRI_ELSE_ASYNC))
4031                                 delay(1);
4032                         purged = 1;
4033                         break;
4034                 }
4035
4036                 done = !purged;
4037         }
4038
4039         XFS_MOUNT_IUNLOCK(mp);
4040         return 0;
4041 }
4042
4043 /*
4044  * xfs_alloc_file_space()
4045  *      This routine allocates disk space for the given file.
4046  *
4047  *      If alloc_type == 0, this request is for an ALLOCSP type
4048  *      request which will change the file size.  In this case, no
4049  *      DMAPI event will be generated by the call.  A TRUNCATE event
4050  *      will be generated later by xfs_setattr.
4051  *
4052  *      If alloc_type != 0, this request is for a RESVSP type
4053  *      request, and a DMAPI DM_EVENT_WRITE will be generated if the
4054  *      lower block boundary byte address is less than the file's
4055  *      length.
4056  *
4057  * RETURNS:
4058  *       0 on success
4059  *      errno on error
4060  *
4061  */
4062 STATIC int
4063 xfs_alloc_file_space(
4064         xfs_inode_t             *ip,
4065         xfs_off_t               offset,
4066         xfs_off_t               len,
4067         int                     alloc_type,
4068         int                     attr_flags)
4069 {
4070         xfs_mount_t             *mp = ip->i_mount;
4071         xfs_off_t               count;
4072         xfs_filblks_t           allocated_fsb;
4073         xfs_filblks_t           allocatesize_fsb;
4074         xfs_extlen_t            extsz, temp;
4075         xfs_fileoff_t           startoffset_fsb;
4076         xfs_fsblock_t           firstfsb;
4077         int                     nimaps;
4078         int                     bmapi_flag;
4079         int                     quota_flag;
4080         int                     rt;
4081         xfs_trans_t             *tp;
4082         xfs_bmbt_irec_t         imaps[1], *imapp;
4083         xfs_bmap_free_t         free_list;
4084         uint                    qblocks, resblks, resrtextents;
4085         int                     committed;
4086         int                     error;
4087
4088         vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address);
4089
4090         if (XFS_FORCED_SHUTDOWN(mp))
4091                 return XFS_ERROR(EIO);
4092
4093         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4094                 return error;
4095
4096         if (len <= 0)
4097                 return XFS_ERROR(EINVAL);
4098
4099         rt = XFS_IS_REALTIME_INODE(ip);
4100         extsz = xfs_get_extsz_hint(ip);
4101
4102         count = len;
4103         imapp = &imaps[0];
4104         nimaps = 1;
4105         bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0);
4106         startoffset_fsb = XFS_B_TO_FSBT(mp, offset);
4107         allocatesize_fsb = XFS_B_TO_FSB(mp, count);
4108
4109         /*      Generate a DMAPI event if needed.       */
4110         if (alloc_type != 0 && offset < ip->i_size &&
4111                         (attr_flags&ATTR_DMI) == 0  &&
4112                         DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4113                 xfs_off_t           end_dmi_offset;
4114
4115                 end_dmi_offset = offset+len;
4116                 if (end_dmi_offset > ip->i_size)
4117                         end_dmi_offset = ip->i_size;
4118                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip),
4119                         offset, end_dmi_offset - offset,
4120                         0, NULL);
4121                 if (error)
4122                         return error;
4123         }
4124
4125         /*
4126          * Allocate file space until done or until there is an error
4127          */
4128 retry:
4129         while (allocatesize_fsb && !error) {
4130                 xfs_fileoff_t   s, e;
4131
4132                 /*
4133                  * Determine space reservations for data/realtime.
4134                  */
4135                 if (unlikely(extsz)) {
4136                         s = startoffset_fsb;
4137                         do_div(s, extsz);
4138                         s *= extsz;
4139                         e = startoffset_fsb + allocatesize_fsb;
4140                         if ((temp = do_mod(startoffset_fsb, extsz)))
4141                                 e += temp;
4142                         if ((temp = do_mod(e, extsz)))
4143                                 e += extsz - temp;
4144                 } else {
4145                         s = 0;
4146                         e = allocatesize_fsb;
4147                 }
4148
4149                 if (unlikely(rt)) {
4150                         resrtextents = qblocks = (uint)(e - s);
4151                         resrtextents /= mp->m_sb.sb_rextsize;
4152                         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4153                         quota_flag = XFS_QMOPT_RES_RTBLKS;
4154                 } else {
4155                         resrtextents = 0;
4156                         resblks = qblocks = \
4157                                 XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
4158                         quota_flag = XFS_QMOPT_RES_REGBLKS;
4159                 }
4160
4161                 /*
4162                  * Allocate and setup the transaction.
4163                  */
4164                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4165                 error = xfs_trans_reserve(tp, resblks,
4166                                           XFS_WRITE_LOG_RES(mp), resrtextents,
4167                                           XFS_TRANS_PERM_LOG_RES,
4168                                           XFS_WRITE_LOG_COUNT);
4169                 /*
4170                  * Check for running out of space
4171                  */
4172                 if (error) {
4173                         /*
4174                          * Free the transaction structure.
4175                          */
4176                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4177                         xfs_trans_cancel(tp, 0);
4178                         break;
4179                 }
4180                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4181                 error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip,
4182                                                       qblocks, 0, quota_flag);
4183                 if (error)
4184                         goto error1;
4185
4186                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4187                 xfs_trans_ihold(tp, ip);
4188
4189                 /*
4190                  * Issue the xfs_bmapi() call to allocate the blocks
4191                  */
4192                 XFS_BMAP_INIT(&free_list, &firstfsb);
4193                 error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4194                                   allocatesize_fsb, bmapi_flag,
4195                                   &firstfsb, 0, imapp, &nimaps,
4196                                   &free_list, NULL);
4197                 if (error) {
4198                         goto error0;
4199                 }
4200
4201                 /*
4202                  * Complete the transaction
4203                  */
4204                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4205                 if (error) {
4206                         goto error0;
4207                 }
4208
4209                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4210                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4211                 if (error) {
4212                         break;
4213                 }
4214
4215                 allocated_fsb = imapp->br_blockcount;
4216
4217                 if (nimaps == 0) {
4218                         error = XFS_ERROR(ENOSPC);
4219                         break;
4220                 }
4221
4222                 startoffset_fsb += allocated_fsb;
4223                 allocatesize_fsb -= allocated_fsb;
4224         }
4225 dmapi_enospc_check:
4226         if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 &&
4227             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) {
4228
4229                 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
4230                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4231                                 XFS_ITOV(ip), DM_RIGHT_NULL,
4232                                 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
4233                 if (error == 0)
4234                         goto retry;     /* Maybe DMAPI app. has made space */
4235                 /* else fall through with error from XFS_SEND_DATA */
4236         }
4237
4238         return error;
4239
4240 error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
4241         xfs_bmap_cancel(&free_list);
4242         XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag);
4243
4244 error1: /* Just cancel transaction */
4245         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4246         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4247         goto dmapi_enospc_check;
4248 }
4249
4250 /*
4251  * Zero file bytes between startoff and endoff inclusive.
4252  * The iolock is held exclusive and no blocks are buffered.
4253  */
4254 STATIC int
4255 xfs_zero_remaining_bytes(
4256         xfs_inode_t             *ip,
4257         xfs_off_t               startoff,
4258         xfs_off_t               endoff)
4259 {
4260         xfs_bmbt_irec_t         imap;
4261         xfs_fileoff_t           offset_fsb;
4262         xfs_off_t               lastoffset;
4263         xfs_off_t               offset;
4264         xfs_buf_t               *bp;
4265         xfs_mount_t             *mp = ip->i_mount;
4266         int                     nimap;
4267         int                     error = 0;
4268
4269         bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
4270                                 ip->i_d.di_flags & XFS_DIFLAG_REALTIME ?
4271                                 mp->m_rtdev_targp : mp->m_ddev_targp);
4272
4273         for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
4274                 offset_fsb = XFS_B_TO_FSBT(mp, offset);
4275                 nimap = 1;
4276                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0,
4277                         NULL, 0, &imap, &nimap, NULL, NULL);
4278                 if (error || nimap < 1)
4279                         break;
4280                 ASSERT(imap.br_blockcount >= 1);
4281                 ASSERT(imap.br_startoff == offset_fsb);
4282                 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
4283                 if (lastoffset > endoff)
4284                         lastoffset = endoff;
4285                 if (imap.br_startblock == HOLESTARTBLOCK)
4286                         continue;
4287                 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4288                 if (imap.br_state == XFS_EXT_UNWRITTEN)
4289                         continue;
4290                 XFS_BUF_UNDONE(bp);
4291                 XFS_BUF_UNWRITE(bp);
4292                 XFS_BUF_READ(bp);
4293                 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4294                 xfsbdstrat(mp, bp);
4295                 if ((error = xfs_iowait(bp))) {
4296                         xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4297                                           mp, bp, XFS_BUF_ADDR(bp));
4298                         break;
4299                 }
4300                 memset(XFS_BUF_PTR(bp) +
4301                         (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
4302                       0, lastoffset - offset + 1);
4303                 XFS_BUF_UNDONE(bp);
4304                 XFS_BUF_UNREAD(bp);
4305                 XFS_BUF_WRITE(bp);
4306                 xfsbdstrat(mp, bp);
4307                 if ((error = xfs_iowait(bp))) {
4308                         xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4309                                           mp, bp, XFS_BUF_ADDR(bp));
4310                         break;
4311                 }
4312         }
4313         xfs_buf_free(bp);
4314         return error;
4315 }
4316
4317 /*
4318  * xfs_free_file_space()
4319  *      This routine frees disk space for the given file.
4320  *
4321  *      This routine is only called by xfs_change_file_space
4322  *      for an UNRESVSP type call.
4323  *
4324  * RETURNS:
4325  *       0 on success
4326  *      errno on error
4327  *
4328  */
4329 STATIC int
4330 xfs_free_file_space(
4331         xfs_inode_t             *ip,
4332         xfs_off_t               offset,
4333         xfs_off_t               len,
4334         int                     attr_flags)
4335 {
4336         bhv_vnode_t             *vp;
4337         int                     committed;
4338         int                     done;
4339         xfs_off_t               end_dmi_offset;
4340         xfs_fileoff_t           endoffset_fsb;
4341         int                     error;
4342         xfs_fsblock_t           firstfsb;
4343         xfs_bmap_free_t         free_list;
4344         xfs_bmbt_irec_t         imap;
4345         xfs_off_t               ioffset;
4346         xfs_extlen_t            mod=0;
4347         xfs_mount_t             *mp;
4348         int                     nimap;
4349         uint                    resblks;
4350         uint                    rounding;
4351         int                     rt;
4352         xfs_fileoff_t           startoffset_fsb;
4353         xfs_trans_t             *tp;
4354         int                     need_iolock = 1;
4355
4356         vp = XFS_ITOV(ip);
4357         mp = ip->i_mount;
4358
4359         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4360
4361         if ((error = XFS_QM_DQATTACH(mp, ip, 0)))
4362                 return error;
4363
4364         error = 0;
4365         if (len <= 0)   /* if nothing being freed */
4366                 return error;
4367         rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME);
4368         startoffset_fsb = XFS_B_TO_FSB(mp, offset);
4369         end_dmi_offset = offset + len;
4370         endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
4371
4372         if (offset < ip->i_size &&
4373             (attr_flags & ATTR_DMI) == 0 &&
4374             DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) {
4375                 if (end_dmi_offset > ip->i_size)
4376                         end_dmi_offset = ip->i_size;
4377                 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp,
4378                                 offset, end_dmi_offset - offset,
4379                                 AT_DELAY_FLAG(attr_flags), NULL);
4380                 if (error)
4381                         return error;
4382         }
4383
4384         if (attr_flags & ATTR_NOLOCK)
4385                 need_iolock = 0;
4386         if (need_iolock) {
4387                 xfs_ilock(ip, XFS_IOLOCK_EXCL);
4388                 vn_iowait(vp);  /* wait for the completion of any pending DIOs */
4389         }
4390
4391         rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, NBPP);
4392         ioffset = offset & ~(rounding - 1);
4393
4394         if (VN_CACHED(vp) != 0) {
4395                 xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1,
4396                                 ctooff(offtoct(ioffset)), -1);
4397                 error = bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)),
4398                                 -1, FI_REMAPF_LOCKED);
4399                 if (error)
4400                         goto out_unlock_iolock;
4401         }
4402
4403         /*
4404          * Need to zero the stuff we're not freeing, on disk.
4405          * If its a realtime file & can't use unwritten extents then we
4406          * actually need to zero the extent edges.  Otherwise xfs_bunmapi
4407          * will take care of it for us.
4408          */
4409         if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) {
4410                 nimap = 1;
4411                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb,
4412                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4413                 if (error)
4414                         goto out_unlock_iolock;
4415                 ASSERT(nimap == 0 || nimap == 1);
4416                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4417                         xfs_daddr_t     block;
4418
4419                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4420                         block = imap.br_startblock;
4421                         mod = do_div(block, mp->m_sb.sb_rextsize);
4422                         if (mod)
4423                                 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
4424                 }
4425                 nimap = 1;
4426                 error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1,
4427                         1, 0, NULL, 0, &imap, &nimap, NULL, NULL);
4428                 if (error)
4429                         goto out_unlock_iolock;
4430                 ASSERT(nimap == 0 || nimap == 1);
4431                 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
4432                         ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
4433                         mod++;
4434                         if (mod && (mod != mp->m_sb.sb_rextsize))
4435                                 endoffset_fsb -= mod;
4436                 }
4437         }
4438         if ((done = (endoffset_fsb <= startoffset_fsb)))
4439                 /*
4440                  * One contiguous piece to clear
4441                  */
4442                 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
4443         else {
4444                 /*
4445                  * Some full blocks, possibly two pieces to clear
4446                  */
4447                 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
4448                         error = xfs_zero_remaining_bytes(ip, offset,
4449                                 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
4450                 if (!error &&
4451                     XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
4452                         error = xfs_zero_remaining_bytes(ip,
4453                                 XFS_FSB_TO_B(mp, endoffset_fsb),
4454                                 offset + len - 1);
4455         }
4456
4457         /*
4458          * free file space until done or until there is an error
4459          */
4460         resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
4461         while (!error && !done) {
4462
4463                 /*
4464                  * allocate and setup the transaction. Allow this
4465                  * transaction to dip into the reserve blocks to ensure
4466                  * the freeing of the space succeeds at ENOSPC.
4467                  */
4468                 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
4469                 tp->t_flags |= XFS_TRANS_RESERVE;
4470                 error = xfs_trans_reserve(tp,
4471                                           resblks,
4472                                           XFS_WRITE_LOG_RES(mp),
4473                                           0,
4474                                           XFS_TRANS_PERM_LOG_RES,
4475                                           XFS_WRITE_LOG_COUNT);
4476
4477                 /*
4478                  * check for running out of space
4479                  */
4480                 if (error) {
4481                         /*
4482                          * Free the transaction structure.
4483                          */
4484                         ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp));
4485                         xfs_trans_cancel(tp, 0);
4486                         break;
4487                 }
4488                 xfs_ilock(ip, XFS_ILOCK_EXCL);
4489                 error = XFS_TRANS_RESERVE_QUOTA(mp, tp,
4490                                 ip->i_udquot, ip->i_gdquot, resblks, 0,
4491                                 XFS_QMOPT_RES_REGBLKS);
4492                 if (error)
4493                         goto error1;
4494
4495                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4496                 xfs_trans_ihold(tp, ip);
4497
4498                 /*
4499                  * issue the bunmapi() call to free the blocks
4500                  */
4501                 XFS_BMAP_INIT(&free_list, &firstfsb);
4502                 error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb,
4503                                   endoffset_fsb - startoffset_fsb,
4504                                   0, 2, &firstfsb, &free_list, NULL, &done);
4505                 if (error) {
4506                         goto error0;
4507                 }
4508
4509                 /*
4510                  * complete the transaction
4511                  */
4512                 error = xfs_bmap_finish(&tp, &free_list, &committed);
4513                 if (error) {
4514                         goto error0;
4515                 }
4516
4517                 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
4518                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4519         }
4520
4521  out_unlock_iolock:
4522         if (need_iolock)
4523                 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
4524         return error;
4525
4526  error0:
4527         xfs_bmap_cancel(&free_list);
4528  error1:
4529         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
4530         xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
4531                     XFS_ILOCK_EXCL);
4532         return error;
4533 }
4534
4535 /*
4536  * xfs_change_file_space()
4537  *      This routine allocates or frees disk space for the given file.
4538  *      The user specified parameters are checked for alignment and size
4539  *      limitations.
4540  *
4541  * RETURNS:
4542  *       0 on success
4543  *      errno on error
4544  *
4545  */
4546 int
4547 xfs_change_file_space(
4548         bhv_desc_t      *bdp,
4549         int             cmd,
4550         xfs_flock64_t   *bf,
4551         xfs_off_t       offset,
4552         cred_t          *credp,
4553         int             attr_flags)
4554 {
4555         int             clrprealloc;
4556         int             error;
4557         xfs_fsize_t     fsize;
4558         xfs_inode_t     *ip;
4559         xfs_mount_t     *mp;
4560         int             setprealloc;
4561         xfs_off_t       startoffset;
4562         xfs_off_t       llen;
4563         xfs_trans_t     *tp;
4564         bhv_vattr_t     va;
4565         bhv_vnode_t     *vp;
4566
4567         vp = BHV_TO_VNODE(bdp);
4568         vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
4569
4570         ip = XFS_BHVTOI(bdp);
4571         mp = ip->i_mount;
4572
4573         /*
4574          * must be a regular file and have write permission
4575          */
4576         if (!VN_ISREG(vp))
4577                 return XFS_ERROR(EINVAL);
4578
4579         xfs_ilock(ip, XFS_ILOCK_SHARED);
4580
4581         if ((error = xfs_iaccess(ip, S_IWUSR, credp))) {
4582                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
4583                 return error;
4584         }
4585
4586         xfs_iunlock(ip, XFS_ILOCK_SHARED);
4587
4588         switch (bf->l_whence) {
4589         case 0: /*SEEK_SET*/
4590                 break;
4591         case 1: /*SEEK_CUR*/
4592                 bf->l_start += offset;
4593                 break;
4594         case 2: /*SEEK_END*/
4595                 bf->l_start += ip->i_size;
4596                 break;
4597         default:
4598                 return XFS_ERROR(EINVAL);
4599         }
4600
4601         llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
4602
4603         if (   (bf->l_start < 0)
4604             || (bf->l_start > XFS_MAXIOFFSET(mp))
4605             || (bf->l_start + llen < 0)
4606             || (bf->l_start + llen > XFS_MAXIOFFSET(mp)))
4607                 return XFS_ERROR(EINVAL);
4608
4609         bf->l_whence = 0;
4610
4611         startoffset = bf->l_start;
4612         fsize = ip->i_size;
4613
4614         /*
4615          * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
4616          * file space.
4617          * These calls do NOT zero the data space allocated to the file,
4618          * nor do they change the file size.
4619          *
4620          * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
4621          * space.
4622          * These calls cause the new file data to be zeroed and the file
4623          * size to be changed.
4624          */
4625         setprealloc = clrprealloc = 0;
4626
4627         switch (cmd) {
4628         case XFS_IOC_RESVSP:
4629         case XFS_IOC_RESVSP64:
4630                 error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
4631                                                                 1, attr_flags);
4632                 if (error)
4633                         return error;
4634                 setprealloc = 1;
4635                 break;
4636
4637         case XFS_IOC_UNRESVSP:
4638         case XFS_IOC_UNRESVSP64:
4639                 if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
4640                                                                 attr_flags)))
4641                         return error;
4642                 break;
4643
4644         case XFS_IOC_ALLOCSP:
4645         case XFS_IOC_ALLOCSP64:
4646         case XFS_IOC_FREESP:
4647         case XFS_IOC_FREESP64:
4648                 if (startoffset > fsize) {
4649                         error = xfs_alloc_file_space(ip, fsize,
4650                                         startoffset - fsize, 0, attr_flags);
4651                         if (error)
4652                                 break;
4653                 }
4654
4655                 va.va_mask = XFS_AT_SIZE;
4656                 va.va_size = startoffset;
4657
4658                 error = xfs_setattr(bdp, &va, attr_flags, credp);
4659
4660                 if (error)
4661                         return error;
4662
4663                 clrprealloc = 1;
4664                 break;
4665
4666         default:
4667                 ASSERT(0);
4668                 return XFS_ERROR(EINVAL);
4669         }
4670
4671         /*
4672          * update the inode timestamp, mode, and prealloc flag bits
4673          */
4674         tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
4675
4676         if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp),
4677                                       0, 0, 0))) {
4678                 /* ASSERT(0); */
4679                 xfs_trans_cancel(tp, 0);
4680                 return error;
4681         }
4682
4683         xfs_ilock(ip, XFS_ILOCK_EXCL);
4684
4685         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
4686         xfs_trans_ihold(tp, ip);
4687
4688         if ((attr_flags & ATTR_DMI) == 0) {
4689                 ip->i_d.di_mode &= ~S_ISUID;
4690
4691                 /*
4692                  * Note that we don't have to worry about mandatory
4693                  * file locking being disabled here because we only
4694                  * clear the S_ISGID bit if the Group execute bit is
4695                  * on, but if it was on then mandatory locking wouldn't
4696                  * have been enabled.
4697                  */
4698                 if (ip->i_d.di_mode & S_IXGRP)
4699                         ip->i_d.di_mode &= ~S_ISGID;
4700
4701                 xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
4702         }
4703         if (setprealloc)
4704                 ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
4705         else if (clrprealloc)
4706                 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
4707
4708         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
4709         xfs_trans_set_sync(tp);
4710
4711         error = xfs_trans_commit(tp, 0);
4712
4713         xfs_iunlock(ip, XFS_ILOCK_EXCL);
4714
4715         return error;
4716 }
4717
4718 bhv_vnodeops_t xfs_vnodeops = {
4719         BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS),
4720         .vop_open               = xfs_open,
4721         .vop_read               = xfs_read,
4722 #ifdef HAVE_SPLICE
4723         .vop_splice_read        = xfs_splice_read,
4724         .vop_splice_write       = xfs_splice_write,
4725 #endif
4726         .vop_write              = xfs_write,
4727         .vop_ioctl              = xfs_ioctl,
4728         .vop_getattr            = xfs_getattr,
4729         .vop_setattr            = xfs_setattr,
4730         .vop_access             = xfs_access,
4731         .vop_lookup             = xfs_lookup,
4732         .vop_create             = xfs_create,
4733         .vop_remove             = xfs_remove,
4734         .vop_link               = xfs_link,
4735         .vop_rename             = xfs_rename,
4736         .vop_mkdir              = xfs_mkdir,
4737         .vop_rmdir              = xfs_rmdir,
4738         .vop_readdir            = xfs_readdir,
4739         .vop_symlink            = xfs_symlink,
4740         .vop_readlink           = xfs_readlink,
4741         .vop_fsync              = xfs_fsync,
4742         .vop_inactive           = xfs_inactive,
4743         .vop_fid2               = xfs_fid2,
4744         .vop_rwlock             = xfs_rwlock,
4745         .vop_rwunlock           = xfs_rwunlock,
4746         .vop_bmap               = xfs_bmap,
4747         .vop_reclaim            = xfs_reclaim,
4748         .vop_attr_get           = xfs_attr_get,
4749         .vop_attr_set           = xfs_attr_set,
4750         .vop_attr_remove        = xfs_attr_remove,
4751         .vop_attr_list          = xfs_attr_list,
4752         .vop_link_removed       = (vop_link_removed_t)fs_noval,
4753         .vop_vnode_change       = (vop_vnode_change_t)fs_noval,
4754         .vop_tosspages          = fs_tosspages,
4755         .vop_flushinval_pages   = fs_flushinval_pages,
4756         .vop_flush_pages        = fs_flush_pages,
4757         .vop_release            = xfs_release,
4758         .vop_iflush             = xfs_inode_flush,
4759 };