Unverified Commit 24018929 authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge patch series "acct: don't allow access to internal filesystems"

Christian Brauner <brauner@kernel.org> says:

In [1] it was reported that the acct(2) system call can be used to
trigger a NULL deref in cases where it is set to write to a file that
triggers an internal lookup.

This can e.g., happen when pointing acct(2) to /sys/power/resume. At the
point the where the write to this file happens the calling task has
already exited and called exit_fs() but an internal lookup might be
triggered through lookup_bdev(). This may trigger a NULL-deref
when accessing current->fs.

This series does two things:

- Reorganize the code so that the the final write happens from the
  workqueue but with the caller's credentials. This preserves the
  (strange) permission model and has almost no regression risk.

- Block access to kernel internal filesystems as well as procfs and
  sysfs in the first place.

This api should stop to exist imho.

Link: https://lore.kernel.org/r/20250127091811.3183623-1-quzicheng@huawei.com [1]

* patches from https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org:
  acct: block access to kernel internal filesystems
  acct: perform last write from workqueue

Link: https://lore.kernel.org/r/20250211-work-acct-v1-0-1c16aecab8b3@kernel.org


Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents a64dcfb4 890ed45b
Loading
Loading
Loading
Loading
+84 −50
Original line number Diff line number Diff line
@@ -103,48 +103,50 @@ struct bsd_acct_struct {
	atomic_long_t		count;
	struct rcu_head		rcu;
	struct mutex		lock;
	int			active;
	bool			active;
	bool			check_space;
	unsigned long		needcheck;
	struct file		*file;
	struct pid_namespace	*ns;
	struct work_struct	work;
	struct completion	done;
	acct_t			ac;
};

static void do_acct_process(struct bsd_acct_struct *acct);
static void fill_ac(struct bsd_acct_struct *acct);
static void acct_write_process(struct bsd_acct_struct *acct);

/*
 * Check the amount of free space and suspend/resume accordingly.
 */
static int check_free_space(struct bsd_acct_struct *acct)
static bool check_free_space(struct bsd_acct_struct *acct)
{
	struct kstatfs sbuf;

	if (time_is_after_jiffies(acct->needcheck))
		goto out;
	if (!acct->check_space)
		return acct->active;

	/* May block */
	if (vfs_statfs(&acct->file->f_path, &sbuf))
		goto out;
		return acct->active;

	if (acct->active) {
		u64 suspend = sbuf.f_blocks * SUSPEND;
		do_div(suspend, 100);
		if (sbuf.f_bavail <= suspend) {
			acct->active = 0;
			acct->active = false;
			pr_info("Process accounting paused\n");
		}
	} else {
		u64 resume = sbuf.f_blocks * RESUME;
		do_div(resume, 100);
		if (sbuf.f_bavail >= resume) {
			acct->active = 1;
			acct->active = true;
			pr_info("Process accounting resumed\n");
		}
	}

	acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
out:
	return acct->active;
}

@@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
	struct bsd_acct_struct *acct = to_acct(pin);
	mutex_lock(&acct->lock);
	do_acct_process(acct);
	/*
	 * Fill the accounting struct with the exiting task's info
	 * before punting to the workqueue.
	 */
	fill_ac(acct);
	schedule_work(&acct->work);
	wait_for_completion(&acct->done);
	cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
	struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
	struct file *file = acct->file;

	/* We were fired by acct_pin_kill() which holds acct->lock. */
	acct_write_process(acct);
	if (file->f_op->flush)
		file->f_op->flush(file, NULL);
	__fput_sync(file);
@@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
		return -EACCES;
	}

	/* Exclude kernel kernel internal filesystems. */
	if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
		kfree(acct);
		filp_close(file, NULL);
		return -EINVAL;
	}

	/* Exclude procfs and sysfs. */
	if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
		kfree(acct);
		filp_close(file, NULL);
		return -EINVAL;
	}

	if (!(file->f_mode & FMODE_CAN_WRITE)) {
		kfree(acct);
		filp_close(file, NULL);
@@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
 *  do_exit() or when switching to a different output file.
 */

static void fill_ac(acct_t *ac)
static void fill_ac(struct bsd_acct_struct *acct)
{
	struct pacct_struct *pacct = &current->signal->pacct;
	struct file *file = acct->file;
	acct_t *ac = &acct->ac;
	u64 elapsed, run_time;
	time64_t btime;
	struct tty_struct *tty;

	lockdep_assert_held(&acct->lock);

	if (time_is_after_jiffies(acct->needcheck)) {
		acct->check_space = false;

		/* Don't fill in @ac if nothing will be written. */
		if (!acct->active)
			return;
	} else {
		acct->check_space = true;
	}

	/*
	 * Fill the accounting struct with the needed info as recorded
	 * by the different kernel functions.
@@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
	ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
	ac->ac_exitcode = pacct->ac_exitcode;
	spin_unlock_irq(&current->sighand->siglock);
}
/*
 *  do_acct_process does all actual work. Caller holds the reference to file.
 */
static void do_acct_process(struct bsd_acct_struct *acct)
{
	acct_t ac;
	unsigned long flim;
	const struct cred *orig_cred;
	struct file *file = acct->file;

	/*
	 * Accounting records are not subject to resource limits.
	 */
	flim = rlimit(RLIMIT_FSIZE);
	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
	/* Perform file operations on behalf of whoever enabled accounting */
	orig_cred = override_creds(file->f_cred);

	/*
	 * First check to see if there is enough free_space to continue
	 * the process accounting system.
	 */
	if (!check_free_space(acct))
		goto out;

	fill_ac(&ac);
	/* we really need to bite the bullet and change layout */
	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
	ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
	ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
	/* backward-compatible 16 bit fields */
	ac.ac_uid16 = ac.ac_uid;
	ac.ac_gid16 = ac.ac_gid;
	ac->ac_uid16 = ac->ac_uid;
	ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
	{
		struct pid_namespace *ns = acct->ns;

		ac.ac_pid = task_tgid_nr_ns(current, ns);
		ac->ac_pid = task_tgid_nr_ns(current, ns);
		rcu_read_lock();
		ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
					     ns);
		ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
		rcu_read_unlock();
	}
#endif
}

static void acct_write_process(struct bsd_acct_struct *acct)
{
	struct file *file = acct->file;
	const struct cred *cred;
	acct_t *ac = &acct->ac;

	/* Perform file operations on behalf of whoever enabled accounting */
	cred = override_creds(file->f_cred);

	/*
	 * Get freeze protection. If the fs is frozen, just skip the write
	 * as we could deadlock the system otherwise.
	 * First check to see if there is enough free_space to continue
	 * the process accounting system. Then get freeze protection. If
	 * the fs is frozen, just skip the write as we could deadlock
	 * the system otherwise.
	 */
	if (file_start_write_trylock(file)) {
	if (check_free_space(acct) && file_start_write_trylock(file)) {
		/* it's been opened O_APPEND, so position is irrelevant */
		loff_t pos = 0;
		__kernel_write(file, &ac, sizeof(acct_t), &pos);
		__kernel_write(file, ac, sizeof(acct_t), &pos);
		file_end_write(file);
	}
out:

	revert_creds(cred);
}

static void do_acct_process(struct bsd_acct_struct *acct)
{
	unsigned long flim;

	/* Accounting records are not subject to resource limits. */
	flim = rlimit(RLIMIT_FSIZE);
	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
	fill_ac(acct);
	acct_write_process(acct);
	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
	revert_creds(orig_cred);
}

/**