X-Git-Url: https://git.openpandora.org/cgi-bin/gitweb.cgi?p=pandora-kernel.git;a=blobdiff_plain;f=mm%2Fshmem.c;h=cc956cfdd3423fb79544ff0c3acaabbfc1cd1d7c;hp=a6f0e981c6d8eca888a5b037eafbda61f63e3f0c;hb=6ed432ff1bc0a1ef1f3f736038c687cb13d1f62b;hpb=eb497bef403c18e1e4ef1906e1f75010b325d87a diff --git a/mm/shmem.c b/mm/shmem.c index a6f0e981c6d8..cc956cfdd342 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -63,6 +63,9 @@ static struct vfsmount *shm_mnt; #include #include #include +#include +#include +#include #include #include @@ -76,6 +79,17 @@ static struct vfsmount *shm_mnt; /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 +/* + * vmtruncate_range() communicates with shmem_fault via + * inode->i_private (with i_mutex making sure that it has only one user at + * a time): we would prefer not to enlarge the shmem inode just for that. + */ +struct shmem_falloc { + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ + pgoff_t start; /* start of range currently being fallocated */ + pgoff_t next; /* the next page offset to be fallocated */ +}; + struct shmem_xattr { struct list_head list; /* anchored by shmem_inode_info->xattr_list */ char *name; /* xattr name */ @@ -488,22 +502,19 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } index = start; - for ( ; ; ) { + while (index <= end) { cond_resched(); pvec.nr = shmem_find_get_pages_and_swap(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, pvec.pages, indices); if (!pvec.nr) { - if (index == start) + /* If all gone or hole-punch, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if (index == start && indices[0] > end) { - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - break; - } mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -513,8 +524,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) break; if (radix_tree_exceptional_entry(page)) { - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + if (shmem_free_swap(mapping, index, page)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; continue; } @@ -522,6 +537,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) if (page->mapping == mapping) { VM_BUG_ON(PageWriteback(page)); truncate_inode_page(mapping, page); + } else { + /* Page was replaced by swap: retry */ + unlock_page(page); + index--; + break; } unlock_page(page); } @@ -543,9 +563,10 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); int error; - error = inode_change_ok(inode, attr); + error = setattr_prepare(dentry, attr); if (error) return error; @@ -553,6 +574,11 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + if (newsize != oldsize) { i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -595,7 +621,7 @@ static void shmem_evict_inode(struct inode *inode) kfree(xattr->name); kfree(xattr); } - BUG_ON(inode->i_blocks); + WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); end_writeback(inode); } @@ -798,24 +824,28 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct mempolicy mpol, *spol; struct vm_area_struct pvma; - - spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, index)); + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = spol; - return swapin_readahead(swap, gfp, &pvma, 0); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + + page = swapin_readahead(swap, gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; @@ -823,10 +853,12 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - /* - * alloc_page_vma() will drop the shared policy reference - */ - return alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma(gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS @@ -1054,6 +1086,63 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; + /* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_mutex. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_truncate_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ + if (unlikely(inode->i_private)) { + struct shmem_falloc *shmem_falloc; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; + if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && + !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ + up_read(&vma->vm_mm->mmap_sem); + ret = VM_FAULT_RETRY; + } + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the vmtruncate_range() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; + } + spin_unlock(&inode->i_lock); + } + error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1065,6 +1154,48 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + * Only CONFIG_SHMEM shmem.c ever supported i_op->truncate_range(). + */ + if (inode->i_op->truncate_range != shmem_truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + { + struct shmem_falloc shmem_falloc; + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + + shmem_falloc.waitq = &shmem_falloc_waitq; + shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; + spin_lock(&inode->i_lock); + inode->i_private = &shmem_falloc; + spin_unlock(&inode->i_lock); + + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + shmem_truncate_range(inode, lstart, lend); + /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); + } + mutex_unlock(&inode->i_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(vmtruncate_range); + #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { @@ -1137,6 +1268,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); INIT_LIST_HEAD(&info->xattr_list); @@ -1184,7 +1316,17 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; + struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + /* i_mutex is held by caller */ + if (unlikely(info->seals)) { + if (info->seals & F_SEAL_WRITE) + return -EPERM; + if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) + return -EPERM; + } + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1457,6 +1599,119 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } +static int shmem_wait_for_pins(struct address_space *mapping) +{ + return 0; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +int shmem_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a shmem-file but restrict + * access to a specific subset of file operations. Seals can only be + * added, but never removed. This way, mutually untrusted parties can + * share common memory regions with a well-defined policy. A malicious + * peer can thus never perform unwanted operations on a shared object. + * + * Seals are only supported on special shmem-files and always affect + * the whole underlying inode. Once a seal is set, it may prevent some + * kinds of access to the file. Currently, the following seals are + * defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous shmem files support sealing. More importantly, seals are + * never written to disk. Therefore, there's no plan to support it on + * other file types. + */ + + if (file->f_op != &shmem_file_operations) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + mutex_lock(&inode->i_mutex); + + if (info->seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = shmem_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + info->seals |= seals; + error = 0; + +unlock: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(shmem_add_seals); + +int shmem_get_seals(struct file *file) +{ + if (file->f_op != &shmem_file_operations) + return -EINVAL; + + return SHMEM_I(file_inode(file))->seals; +} +EXPORT_SYMBOL_GPL(shmem_get_seals); + +long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = shmem_add_seals(file, arg); + break; + case F_GET_SEALS: + error = shmem_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -1598,8 +1853,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); - if (they_are_dirs) + if (they_are_dirs) { + drop_nlink(new_dentry->d_inode); drop_nlink(old_dir); + } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); @@ -1962,12 +2219,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, { struct inode *inode; struct dentry *dentry = NULL; - u64 inum = fid->raw[2]; - inum = (inum << 32) | fid->raw[1]; + u64 inum; if (fh_len < 3) return NULL; + inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { @@ -2113,6 +2372,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) unsigned long inodes; int error = -EINVAL; + config.mpol = NULL; if (shmem_parse_options(data, &config, true)) return error; @@ -2137,8 +2397,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + /* + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (config.mpol) { + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ + } out: spin_unlock(&sbinfo->stat_lock); return error; @@ -2162,6 +2427,77 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) shmem_show_mpol(seq, sbinfo->mpol); return 0; } + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + struct shmem_inode_info *info; + struct file *file; + int fd, error; + char *name; + long len; + + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + info = SHMEM_I(file_inode(file)); + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + if (flags & MFD_ALLOW_SEALING) + info->seals &= ~F_SEAL_SEAL; + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} + #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) @@ -2482,6 +2818,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + /* Only CONFIG_SHMEM shmem.c ever supported i_op->truncate_range(). */ + return -ENOSYS; +} + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)