Commit b71f0be2 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull cgroup updates from Tejun Heo:

 - cgroup_file_notify() locking converted from a global lock to
   per-cgroup_file spinlock with a lockless fast-path when no
   notification is needed

 - Misc changes including exposing cgroup helpers for sched_ext and
   minor fixes

* tag 'cgroup-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup/rdma: fix swapped arguments in pr_warn() format string
  cgroup/dmem: remove region parameter from dmemcg_parse_limit
  cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock
  cgroup: add lockless fast-path checks to cgroup_file_notify()
  cgroup: reduce cgroup_file_kn_lock hold time in cgroup_file_notify()
  cgroup: Expose some cgroup helpers
parents 05cef13f 3348e1e8
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -167,6 +167,7 @@ struct cgroup_file {
	struct kernfs_node *kn;
	unsigned long notified_at;
	struct timer_list notify_timer;
	spinlock_t lock;
};

/*
+63 −2
Original line number Diff line number Diff line
@@ -42,6 +42,14 @@ struct kernel_clone_args;

#ifdef CONFIG_CGROUPS

/*
 * To avoid confusing the compiler (and generating warnings) with code
 * that attempts to access what would be a 0-element array (i.e. sized
 * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
 * constant expression can be added.
 */
#define CGROUP_HAS_SUBSYS_CONFIG	(CGROUP_SUBSYS_COUNT > 0)

enum css_task_iter_flags {
	CSS_TASK_ITER_PROCS    = (1U << 0),  /* walk only threadgroup leaders */
	CSS_TASK_ITER_THREADED = (1U << 1),  /* walk all threaded css_sets in the domain */
@@ -76,6 +84,7 @@ enum cgroup_lifetime_events {
extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct blocking_notifier_head cgroup_lifetime_notifier;

@@ -103,6 +112,8 @@ extern struct blocking_notifier_head cgroup_lifetime_notifier;
#define cgroup_subsys_on_dfl(ss)						\
	static_branch_likely(&ss ## _on_dfl_key)

bool cgroup_on_dfl(const struct cgroup *cgrp);

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
@@ -274,6 +285,32 @@ void css_task_iter_end(struct css_task_iter *it);
	for ((pos) = css_next_descendant_post(NULL, (css)); (pos);	\
	     (pos) = css_next_descendant_post((pos), (css)))

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
@@ -336,6 +373,27 @@ static inline u64 cgroup_id(const struct cgroup *cgrp)
	return cgrp->kn->id;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @subsys_id enabled.
 */
static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
						     struct cgroup_subsys *ss)
{
	if (CGROUP_HAS_SUBSYS_CONFIG && ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
@@ -372,6 +430,11 @@ static inline bool css_is_self(struct cgroup_subsys_state *css)
	return false;
}

static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

static inline void cgroup_get(struct cgroup *cgrp)
{
	css_get(&cgrp->self);
@@ -387,8 +450,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
	css_put(&cgrp->self);
}

extern struct mutex cgroup_mutex;

static inline void cgroup_lock(void)
{
	mutex_lock(&cgroup_mutex);
+0 −6
Original line number Diff line number Diff line
@@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible;
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return !(cgrp->self.flags & CSS_ONLINE);
}

static inline bool notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset)
}

bool cgroup_ssid_enabled(int ssid);
bool cgroup_on_dfl(const struct cgroup *cgrp);

struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
struct cgroup *task_cgroup_from_root(struct task_struct *task,
+28 −80
Original line number Diff line number Diff line
@@ -68,14 +68,6 @@
/* let's not notify more than 100 times per second */
#define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)

/*
 * To avoid confusing the compiler (and generating warnings) with code
 * that attempts to access what would be a 0-element array (i.e. sized
 * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
 * constant expression can be added.
 */
#define CGROUP_HAS_SUBSYS_CONFIG	(CGROUP_SUBSYS_COUNT > 0)

/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
@@ -107,12 +99,6 @@ static bool cgroup_debug __read_mostly;
 */
static DEFINE_SPINLOCK(cgroup_idr_lock);

/*
 * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 * against file removal/re-creation across css hiding.
 */
static DEFINE_SPINLOCK(cgroup_file_kn_lock);

DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);

#define cgroup_assert_mutex_or_rcu_locked()				\
@@ -509,27 +495,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp)
	return cgrp->root->subsys_mask;
}

/**
 * cgroup_css - obtain a cgroup's css for the specified subsystem
 * @cgrp: the cgroup of interest
 * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 *
 * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 * function must be called either under cgroup_mutex or rcu_read_lock() and
 * the caller is responsible for pinning the returned css if it wants to
 * keep accessing it outside the said locks.  This function may return
 * %NULL if @cgrp doesn't have @subsys_id enabled.
 */
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
					      struct cgroup_subsys *ss)
{
	if (CGROUP_HAS_SUBSYS_CONFIG && ss)
		return rcu_dereference_check(cgrp->subsys[ss->id],
					lockdep_is_held(&cgroup_mutex));
	else
		return &cgrp->self;
}

/**
 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 * @cgrp: the cgroup of interest
@@ -741,32 +706,6 @@ EXPORT_SYMBOL_GPL(of_css);
	}								\
} while (false)

/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp)				\
	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       cgroup_is_dead(child); }))			\
			;						\
		else

/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)		\
	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)		\
	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL))	\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       (dsct) = (d_css)->cgroup;			\
		       cgroup_is_dead(dsct); }))			\
			;						\
		else

/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
@@ -1748,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
		struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		spin_lock_irq(&cgroup_file_kn_lock);
		cfile->kn = NULL;
		spin_unlock_irq(&cgroup_file_kn_lock);
		spin_lock_irq(&cfile->lock);
		WRITE_ONCE(cfile->kn, NULL);
		spin_unlock_irq(&cfile->lock);

		timer_delete_sync(&cfile->notify_timer);
	}
@@ -4429,10 +4368,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
		struct cgroup_file *cfile = (void *)css + cft->file_offset;

		timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);

		spin_lock_irq(&cgroup_file_kn_lock);
		spin_lock_init(&cfile->lock);
		cfile->kn = kn;
		spin_unlock_irq(&cgroup_file_kn_lock);
	}

	return 0;
@@ -4687,21 +4624,32 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 */
void cgroup_file_notify(struct cgroup_file *cfile)
{
	unsigned long flags;
	unsigned long flags, last, next;
	struct kernfs_node *kn = NULL;

	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
	if (cfile->kn) {
		unsigned long last = cfile->notified_at;
		unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
	if (!READ_ONCE(cfile->kn))
		return;

	last = READ_ONCE(cfile->notified_at);
	next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
	if (time_in_range(jiffies, last, next)) {
		timer_reduce(&cfile->notify_timer, next);
		} else {
			kernfs_notify(cfile->kn);
			cfile->notified_at = jiffies;
		if (timer_pending(&cfile->notify_timer))
			return;
	}

	spin_lock_irqsave(&cfile->lock, flags);
	if (cfile->kn) {
		kn = cfile->kn;
		kernfs_get(kn);
		WRITE_ONCE(cfile->notified_at, jiffies);
	}
	spin_unlock_irqrestore(&cfile->lock, flags);

	if (kn) {
		kernfs_notify(kn);
		kernfs_put(kn);
	}
	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
}
EXPORT_SYMBOL_GPL(cgroup_file_notify);

@@ -4714,10 +4662,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
{
	struct kernfs_node *kn;

	spin_lock_irq(&cgroup_file_kn_lock);
	spin_lock_irq(&cfile->lock);
	kn = cfile->kn;
	kernfs_get(kn);
	spin_unlock_irq(&cgroup_file_kn_lock);
	spin_unlock_irq(&cfile->lock);

	if (kn)
		kernfs_show(kn, show);
+2 −3
Original line number Diff line number Diff line
@@ -707,8 +707,7 @@ static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
	return 0;
}

static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
static int dmemcg_parse_limit(char *options, u64 *new_limit)
{
	char *end;

@@ -762,7 +761,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		err = dmemcg_parse_limit(options, &new_limit);
		if (err < 0)
			goto out_put;

Loading