coredump: fix unfreezable coredumping task

[pandora-kernel.git] / fs / exec.c
diff --git a/fs/exec.c b/fs/exec.c

index 3625464..3f8d8f3 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -25,6 +25,7 @@
  #include <linux/slab.h>
  #include <linux/file.h>
  #include <linux/fdtable.h>
+#include <linux/freezer.h>
  #include <linux/mm.h>
  #include <linux/stat.h>
  #include <linux/fcntl.h>
@@ -55,6 +56,9 @@
  #include <linux/pipe_fs_i.h>
  #include <linux/oom.h>
  #include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
  
  #include <asm/uaccess.h>
  #include <asm/mmu_context.h>
@@ -973,6 +977,9 @@ static int de_thread(struct task_struct *tsk)
         sig->notify_count = 0;
  
  no_thread_group:
+       /* we have changed execution domain */
+       tsk->exit_signal = SIGCHLD;
+
         if (current->mm)
                 setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
  
@@ -1092,7 +1099,8 @@ int flush_old_exec(struct linux_binprm * bprm)
         bprm->mm = NULL;                /* We're using it now */
  
         set_fs(USER_DS);
-       current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
+       current->flags &=
+               ~(PF_RANDOMIZE | PF_KTHREAD | PF_NOFREEZE | PF_FREEZER_NOSIG);
         flush_thread();
         current->personality &= ~bprm->per_clear;
  
@@ -1155,13 +1163,6 @@ void setup_new_exec(struct linux_binprm * bprm)
                         set_dumpable(current->mm, suid_dumpable);
         }
  
-       /*
-        * Flush performance counters when crossing a
-        * security domain:
-        */
-       if (!get_dumpable(current->mm))
-               perf_event_exit_task(current);
-
         /* An exec changes our domain. We are no longer part of the thread
            group */
  
@@ -1198,9 +1199,24 @@ void free_bprm(struct linux_binprm *bprm)
                 mutex_unlock(&current->signal->cred_guard_mutex);
                 abort_creds(bprm->cred);
         }
+       /* If a binfmt changed the interp, free it. */
+       if (bprm->interp != bprm->filename)
+               kfree(bprm->interp);
         kfree(bprm);
  }
  
+/*
+ * Replace bprm->interp with a newly allocated copy of @interp.
+ *
+ * An interp string that no longer aliases bprm->filename is released
+ * before the copy is made.  Returns 0 on success, or -ENOMEM when the
+ * copy cannot be allocated (bprm->interp is then left NULL).
+ */
+int bprm_change_interp(char *interp, struct linux_binprm *bprm)
+{
+       char *copy;
+
+       /* A binfmt already swapped the interp once: drop that string. */
+       if (bprm->filename != bprm->interp)
+               kfree(bprm->interp);
+
+       copy = kstrdup(interp, GFP_KERNEL);
+       bprm->interp = copy;
+
+       return copy ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(bprm_change_interp);
+
  /*
   * install the new credentials for this executable
   */
@@ -1210,6 +1226,15 @@ void install_exec_creds(struct linux_binprm *bprm)
  
         commit_creds(bprm->cred);
         bprm->cred = NULL;
+
+       /*
+        * Disable monitoring for regular users
+        * when executing setuid binaries. Must
+        * wait until new credentials are committed
+        * by commit_creds() above
+        */
+       if (get_dumpable(current->mm) != SUID_DUMP_USER)
+               perf_event_exit_task(current);
         /*
          * cred_guard_mutex must be held at least to this point to prevent
          * ptrace_attach() from altering our determination of the task's
@@ -1261,6 +1286,45 @@ int check_unsafe_exec(struct linux_binprm *bprm)
         return res;
  }
  
+/*
+ * Work out the effective uid/gid recorded in bprm->cred.
+ *
+ * Both ids are first reset to the caller's current effective ids.  They
+ * are then taken from the file's inode only when the mount lacks
+ * MNT_NOSUID and the inode mode has the set-uid bit, and/or the set-gid
+ * bit together with group-execute.  Each id that is overridden also adds
+ * PER_CLEAR_ON_SETID to bprm->per_clear.
+ */
+static void bprm_fill_uid(struct linux_binprm *bprm)
+{
+       const unsigned int sgid_exec = S_ISGID | S_IXGRP;
+       struct inode *ino;
+       unsigned int perm;
+       uid_t owner;
+       gid_t group;
+
+       /* Reset to the caller's ids, dropping any earlier set[ug]id data */
+       bprm->cred->euid = current_euid();
+       bprm->cred->egid = current_egid();
+
+       if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+               return;
+
+       ino = bprm->file->f_path.dentry->d_inode;
+
+       /* Unlocked peek: no set[ug]id bit means nothing more to do */
+       perm = ACCESS_ONCE(ino->i_mode);
+       if ((perm & (S_ISUID | S_ISGID)) == 0)
+               return;
+
+       /* Re-read mode, uid and gid together while holding i_mutex */
+       mutex_lock(&ino->i_mutex);
+       perm = ino->i_mode;
+       owner = ino->i_uid;
+       group = ino->i_gid;
+       mutex_unlock(&ino->i_mutex);
+
+       if (perm & S_ISUID) {
+               bprm->per_clear |= PER_CLEAR_ON_SETID;
+               bprm->cred->euid = owner;
+       }
+
+       /* Set-gid only counts when the group-execute bit is set as well */
+       if ((perm & sgid_exec) == sgid_exec) {
+               bprm->per_clear |= PER_CLEAR_ON_SETID;
+               bprm->cred->egid = group;
+       }
+}
+
  /* 
   * Fill the binprm structure from the inode. 
   * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
@@ -1269,36 +1333,12 @@ int check_unsafe_exec(struct linux_binprm *bprm)
   */
  int prepare_binprm(struct linux_binprm *bprm)
  {
-       umode_t mode;
-       struct inode * inode = bprm->file->f_path.dentry->d_inode;
         int retval;
  
-       mode = inode->i_mode;
         if (bprm->file->f_op == NULL)
                 return -EACCES;
  
-       /* clear any previous set[ug]id data from a previous binary */
-       bprm->cred->euid = current_euid();
-       bprm->cred->egid = current_egid();
-
-       if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
-               /* Set-uid? */
-               if (mode & S_ISUID) {
-                       bprm->per_clear |= PER_CLEAR_ON_SETID;
-                       bprm->cred->euid = inode->i_uid;
-               }
-
-               /* Set-gid? */
-               /*
-                * If setgid is set but no group execute bit then this
-                * is a candidate for mandatory locking, not a setgid
-                * executable.
-                */
-               if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
-                       bprm->per_clear |= PER_CLEAR_ON_SETID;
-                       bprm->cred->egid = inode->i_gid;
-               }
-       }
+       bprm_fill_uid(bprm);
  
         /* fill in binprm security blob */
         retval = security_bprm_set_creds(bprm);
@@ -1366,6 +1406,10 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
         struct linux_binfmt *fmt;
         pid_t old_pid;
  
+       /* This allows 4 levels of binfmt rewrites before failing hard. */
+       if (depth > 5)
+               return -ELOOP;
+
         retval = security_bprm_check(bprm);
         if (retval)
                 return retval;
@@ -1389,12 +1433,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
                         if (!try_module_get(fmt->module))
                                 continue;
                         read_unlock(&binfmt_lock);
+                       bprm->recursion_depth = depth + 1;
                         retval = fn(bprm, regs);
-                       /*
-                        * Restore the depth counter to its starting value
-                        * in this call, so we don't have to rely on every
-                        * load_binary function to restore it on return.
-                        */
                         bprm->recursion_depth = depth;
                         if (retval >= 0) {
                                 if (depth == 0)
@@ -1935,8 +1975,11 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
                 complete(vfork_done);
         }
  
-       if (core_waiters)
+       if (core_waiters > 0) {
+               freezer_do_not_count();
                 wait_for_completion(&core_state->startup);
+               freezer_count();
+       }
  fail:
         return core_waiters;
  }
@@ -2011,6 +2054,12 @@ static int __get_dumpable(unsigned long mm_flags)
         return (ret >= 2) ? 2 : ret;
  }
  
+/*
+ * This returns the actual value of the suid_dumpable flag. Callers
+ * that use it to check for privilege transitions must compare the
+ * result against SUID_DUMP_USER rather than treating it as a boolean
+ * value.
+ */
  int get_dumpable(struct mm_struct *mm)
  {
         return __get_dumpable(mm->flags);
@@ -2092,8 +2141,9 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         const struct cred *old_cred;
         struct cred *cred;
         int retval = 0;
-       int flag = 0;
         int ispipe;
+       /* require nonrelative corefile path and be extra careful */
+       bool need_suid_safe = false;
         static atomic_t core_dump_count = ATOMIC_INIT(0);
         struct coredump_params cprm = {
                 .signr = signr,
@@ -2119,14 +2169,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         if (!cred)
                 goto fail;
         /*
-        *      We cannot trust fsuid as being the "true" uid of the
-        *      process nor do we know its entire history. We only know it
-        *      was tainted so we dump it as root in mode 2.
+        * We cannot trust fsuid as being the "true" uid of the process
+        * nor do we know its entire history. We only know it was tainted
+        * so we dump it as root in mode 2, and only into a controlled
+        * environment (pipe handler or fully qualified path).
          */
         if (__get_dumpable(cprm.mm_flags) == 2) {
                 /* Setuid core dump mode */
-               flag = O_EXCL;          /* Stop rewrite attacks */
                 cred->fsuid = 0;        /* Dump root private */
+               need_suid_safe = true;
         }
  
         retval = coredump_wait(exit_code, &core_state);
@@ -2202,13 +2253,67 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                 }
         } else {
                 struct inode *inode;
+               int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+                                O_LARGEFILE | O_EXCL;
  
                 if (cprm.limit < binfmt->min_coredump)
                         goto fail_unlock;
  
-               cprm.file = filp_open(cn.corename,
-                                O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
-                                0600);
+               if (need_suid_safe && cn.corename[0] != '/') {
+                       printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+                               "to fully qualified path!\n",
+                               task_tgid_vnr(current), current->comm);
+                       printk(KERN_WARNING "Skipping core dump\n");
+                       goto fail_unlock;
+               }
+
+               /*
+                * Unlink the file if it exists unless this is a SUID
+                * binary - in that case, we're running around with root
+                * privs and don't want to unlink another user's coredump.
+                */
+               if (!need_suid_safe) {
+                       mm_segment_t old_fs;
+
+                       old_fs = get_fs();
+                       set_fs(KERNEL_DS);
+                       /*
+                        * If it doesn't exist, that's fine. If there's some
+                        * other problem, we'll catch it at the filp_open().
+                        */
+                       (void) sys_unlink((const char __user *)cn.corename);
+                       set_fs(old_fs);
+               }
+
+               /*
+                * There is a race between unlinking and creating the
+                * file, but if that causes an EEXIST here, that's
+                * fine - another process raced with us while creating
+                * the corefile, and the other process won. To userspace,
+                * what matters is that at least one of the two processes
+                * writes its coredump successfully, not which one.
+                */
+               if (need_suid_safe) {
+                       /*
+                        * Using user namespaces, normal user tasks can change
+                        * their current->fs->root to point to arbitrary
+                        * directories. Since the intention of the "only dump
+                        * with a fully qualified path" rule is to control where
+                        * coredumps may be placed using root privileges,
+                        * current->fs->root must not be used. Instead, use the
+                        * root directory of init_task.
+                        */
+                       struct path root;
+
+                       task_lock(&init_task);
+                       get_fs_root(init_task.fs, &root);
+                       task_unlock(&init_task);
+                       cprm.file = file_open_root(root.dentry, root.mnt,
+                               cn.corename, open_flags, 0600);
+                       path_put(&root);
+               } else {
+                       cprm.file = filp_open(cn.corename, open_flags, 0600);
+               }
                 if (IS_ERR(cprm.file))
                         goto fail_unlock;