mm: memcg: move legacy memcg event code into memcontrol-v1.c (66d60c42) · Commits · git / linux-nf

include/linux/memcontrol.h

+0 −12

Original line number	Diff line number	Diff line
		@@ -69,18 +69,6 @@ struct mem_cgroup_id {
		refcount_t ref;
		};

		/*
		* Per memcg event counter is incremented at every pagein/pageout. With THP,
		* it will be incremented by the number of pages. This counter is used
		* to trigger some periodic events. This is straightforward and better
		* than using jiffies etc. to handle periodic memcg event.
		*/
		enum mem_cgroup_events_target {
		MEM_CGROUP_TARGET_THRESH,
		MEM_CGROUP_TARGET_SOFTLIMIT,
		MEM_CGROUP_NTARGETS,
		};

		struct memcg_vmstats_percpu;
		struct memcg_vmstats;
		struct lruvec_stats_percpu;

mm/memcontrol-v1.c

+653 −0

Original line number	Diff line number	Diff line
		@@ -6,6 +6,10 @@
		#include <linux/pagewalk.h>
		#include <linux/backing-dev.h>
		#include <linux/swap_cgroup.h>
		#include <linux/eventfd.h>
		#include <linux/poll.h>
		#include <linux/sort.h>
		#include <linux/file.h>

		#include "internal.h"
		#include "swap.h"
		@@ -60,6 +64,54 @@ static struct move_charge_struct {
		.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
		};

		/* for OOM */
		struct mem_cgroup_eventfd_list {
		struct list_head list;
		struct eventfd_ctx *eventfd;
		};

		/*
		* cgroup_event represents events which userspace want to receive.
		*/
		struct mem_cgroup_event {
		/*
		* memcg which the event belongs to.
		*/
		struct mem_cgroup *memcg;
		/*
		* eventfd to signal userspace about the event.
		*/
		struct eventfd_ctx *eventfd;
		/*
		* Each of these stored in a list by the cgroup.
		*/
		struct list_head list;
		/*
		* register_event() callback will be used to add new userspace
		* waiter for changes related to this event. Use eventfd_signal()
		* on eventfd to send notification to userspace.
		*/
		int (register_event)(struct mem_cgroup memcg,
		struct eventfd_ctx eventfd, const char args);
		/*
		* unregister_event() callback will be called when userspace closes
		* the eventfd or on cgroup removing. This callback must be set,
		* if you want provide notification functionality.
		*/
		void (unregister_event)(struct mem_cgroup memcg,
		struct eventfd_ctx *eventfd);
		/*
		* All fields below needed to unregister event when
		* userspace closes eventfd.
		*/
		poll_table pt;
		wait_queue_head_t *wqh;
		wait_queue_entry_t wait;
		struct work_struct remove;
		};

		extern spinlock_t memcg_oom_lock;

		static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
		struct mem_cgroup_tree_per_node *mctz,
		unsigned long new_usage_in_excess)
		@@ -1306,6 +1358,607 @@ void memcg1_move_task(void)
		}
		#endif

		static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
		{
		struct mem_cgroup_threshold_ary *t;
		unsigned long usage;
		int i;

		rcu_read_lock();
		if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
		else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

		if (!t)
		goto unlock;

		usage = mem_cgroup_usage(memcg, swap);

		/*
		* current_threshold points to threshold just below or equal to usage.
		* If it's not true, a threshold was crossed after last
		* call of __mem_cgroup_threshold().
		*/
		i = t->current_threshold;

		/*
		* Iterate backward over array of thresholds starting from
		* current_threshold and check if a threshold is crossed.
		* If none of thresholds below usage is crossed, we read
		* only one element of the array here.
		*/
		for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd);

		/* i = current_threshold + 1 */
		i++;

		/*
		* Iterate forward over array of thresholds starting from
		* current_threshold+1 and check if a threshold is crossed.
		* If none of thresholds above usage is crossed, we read
		* only one element of the array here.
		*/
		for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd);

		/* Update current_threshold */
		t->current_threshold = i - 1;
		unlock:
		rcu_read_unlock();
		}

		static void mem_cgroup_threshold(struct mem_cgroup *memcg)
		{
		while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_memsw_account())
		__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
		}
		}

		/*
		* Check events in order.
		*
		*/
		void memcg_check_events(struct mem_cgroup *memcg, int nid)
		{
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return;

		/* threshold event is triggered in finer grain than soft limit */
		if (unlikely(mem_cgroup_event_ratelimit(memcg,
		MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
		MEM_CGROUP_TARGET_SOFTLIMIT);
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
		memcg1_update_tree(memcg, nid);
		}
		}

		static int compare_thresholds(const void a, const void b)
		{
		const struct mem_cgroup_threshold *_a = a;
		const struct mem_cgroup_threshold *_b = b;

		if (_a->threshold > _b->threshold)
		return 1;

		if (_a->threshold < _b->threshold)
		return -1;

		return 0;
		}

		static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
		{
		struct mem_cgroup_eventfd_list *ev;

		spin_lock(&memcg_oom_lock);

		list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd);

		spin_unlock(&memcg_oom_lock);
		return 0;
		}

		void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
		}

		static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
		struct eventfd_ctx eventfd, const char args, enum res_type type)
		{
		struct mem_cgroup_thresholds *thresholds;
		struct mem_cgroup_threshold_ary *new;
		unsigned long threshold;
		unsigned long usage;
		int i, size, ret;

		ret = page_counter_memparse(args, "-1", &threshold);
		if (ret)
		return ret;

		mutex_lock(&memcg->thresholds_lock);

		if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
		} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
		} else
		BUG();

		/* Check if a threshold crossed before adding a new one */
		if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

		size = thresholds->primary ? thresholds->primary->size + 1 : 1;

		/* Allocate memory for new array of thresholds */
		new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
		if (!new) {
		ret = -ENOMEM;
		goto unlock;
		}
		new->size = size;

		/* Copy thresholds (if any) to new array */
		if (thresholds->primary)
		memcpy(new->entries, thresholds->primary->entries,
		flex_array_size(new, entries, size - 1));

		/* Add new threshold */
		new->entries[size - 1].eventfd = eventfd;
		new->entries[size - 1].threshold = threshold;

		/* Sort thresholds. Registering of new threshold isn't time-critical */
		sort(new->entries, size, sizeof(*new->entries),
		compare_thresholds, NULL);

		/* Find current threshold */
		new->current_threshold = -1;
		for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
		/*
		* new->current_threshold will not be used until
		* rcu_assign_pointer(), so it's safe to increment
		* it here.
		*/
		++new->current_threshold;
		} else
		break;
		}

		/* Free old spare buffer and save old primary buffer as spare */
		kfree(thresholds->spare);
		thresholds->spare = thresholds->primary;

		rcu_assign_pointer(thresholds->primary, new);

		/* To be sure that nobody uses thresholds */
		synchronize_rcu();

		unlock:
		mutex_unlock(&memcg->thresholds_lock);

		return ret;
		}

		static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
		struct eventfd_ctx eventfd, const char args)
		{
		return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
		}

		static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
		struct eventfd_ctx eventfd, const char args)
		{
		return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
		}

		static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
		struct eventfd_ctx *eventfd, enum res_type type)
		{
		struct mem_cgroup_thresholds *thresholds;
		struct mem_cgroup_threshold_ary *new;
		unsigned long usage;
		int i, j, size, entries;

		mutex_lock(&memcg->thresholds_lock);

		if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
		} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
		} else
		BUG();

		if (!thresholds->primary)
		goto unlock;

		/* Check if a threshold crossed before removing */
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

		/* Calculate new number of threshold */
		size = entries = 0;
		for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
		size++;
		else
		entries++;
		}

		new = thresholds->spare;

		/* If no items related to eventfd have been cleared, nothing to do */
		if (!entries)
		goto unlock;

		/* Set thresholds array to NULL if we don't have thresholds */
		if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
		}

		new->size = size;

		/* Copy thresholds and find current threshold */
		new->current_threshold = -1;
		for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
		continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
		/*
		* new->current_threshold will not be used
		* until rcu_assign_pointer(), so it's safe to increment
		* it here.
		*/
		++new->current_threshold;
		}
		j++;
		}

		swap_buffers:
		/* Swap primary and spare array */
		thresholds->spare = thresholds->primary;

		rcu_assign_pointer(thresholds->primary, new);

		/* To be sure that nobody uses thresholds */
		synchronize_rcu();

		/* If all events are unregistered, free the spare array */
		if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
		}
		unlock:
		mutex_unlock(&memcg->thresholds_lock);
		}

		static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
		struct eventfd_ctx *eventfd)
		{
		return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
		}

		static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
		struct eventfd_ctx *eventfd)
		{
		return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
		}

		static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
		struct eventfd_ctx eventfd, const char args)
		{
		struct mem_cgroup_eventfd_list *event;

		event = kmalloc(sizeof(*event), GFP_KERNEL);
		if (!event)
		return -ENOMEM;

		spin_lock(&memcg_oom_lock);

		event->eventfd = eventfd;
		list_add(&event->list, &memcg->oom_notify);

		/* already in OOM ? */
		if (memcg->under_oom)
		eventfd_signal(eventfd);
		spin_unlock(&memcg_oom_lock);

		return 0;
		}

		static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
		struct eventfd_ctx *eventfd)
		{
		struct mem_cgroup_eventfd_list ev, tmp;

		spin_lock(&memcg_oom_lock);

		list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
		list_del(&ev->list);
		kfree(ev);
		}
		}

		spin_unlock(&memcg_oom_lock);
		}

		/*
		* DO NOT USE IN NEW FILES.
		*
		* "cgroup.event_control" implementation.
		*
		* This is way over-engineered. It tries to support fully configurable
		* events for each user. Such level of flexibility is completely
		* unnecessary especially in the light of the planned unified hierarchy.
		*
		* Please deprecate this and replace with something simpler if at all
		* possible.
		*/

		/*
		* Unregister event and free resources.
		*
		* Gets called from workqueue.
		*/
		static void memcg_event_remove(struct work_struct *work)
		{
		struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
		struct mem_cgroup *memcg = event->memcg;

		remove_wait_queue(event->wqh, &event->wait);

		event->unregister_event(memcg, event->eventfd);

		/* Notify userspace the event is going away. */
		eventfd_signal(event->eventfd);

		eventfd_ctx_put(event->eventfd);
		kfree(event);
		css_put(&memcg->css);
		}

		/*
		* Gets called on EPOLLHUP on eventfd when user closes it.
		*
		* Called with wqh->lock held and interrupts disabled.
		*/
		static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
		int sync, void *key)
		{
		struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
		struct mem_cgroup *memcg = event->memcg;
		__poll_t flags = key_to_poll(key);

		if (flags & EPOLLHUP) {
		/*
		* If the event has been detached at cgroup removal, we
		* can simply return knowing the other side will cleanup
		* for us.
		*
		* We can't race against event freeing since the other
		* side will require wqh->lock via remove_wait_queue(),
		* which we hold.
		*/
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
		list_del_init(&event->list);
		/*
		* We are in atomic context, but cgroup_event_remove()
		* may sleep, so we have to call it in workqueue.
		*/
		schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
		}

		return 0;
		}

		static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t wqh, poll_table pt)
		{
		struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

		event->wqh = wqh;
		add_wait_queue(wqh, &event->wait);
		}

		/*
		* DO NOT USE IN NEW FILES.
		*
		* Parse input and register new cgroup event handler.
		*
		* Input must be in format '<event_fd> <control_fd> <args>'.
		* Interpretation of args is defined by control file implementation.
		*/
		ssize_t memcg_write_event_control(struct kernfs_open_file *of,
		char *buf, size_t nbytes, loff_t off)
		{
		struct cgroup_subsys_state *css = of_css(of);
		struct mem_cgroup *memcg = mem_cgroup_from_css(css);
		struct mem_cgroup_event *event;
		struct cgroup_subsys_state *cfile_css;
		unsigned int efd, cfd;
		struct fd efile;
		struct fd cfile;
		struct dentry *cdentry;
		const char *name;
		char *endp;
		int ret;

		if (IS_ENABLED(CONFIG_PREEMPT_RT))
		return -EOPNOTSUPP;

		buf = strstrip(buf);

		efd = simple_strtoul(buf, &endp, 10);
		if (*endp != ' ')
		return -EINVAL;
		buf = endp + 1;

		cfd = simple_strtoul(buf, &endp, 10);
		if ((endp != ' ') && (endp != '\0'))
		return -EINVAL;
		buf = endp + 1;

		event = kzalloc(sizeof(*event), GFP_KERNEL);
		if (!event)
		return -ENOMEM;

		event->memcg = memcg;
		INIT_LIST_HEAD(&event->list);
		init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
		init_waitqueue_func_entry(&event->wait, memcg_event_wake);
		INIT_WORK(&event->remove, memcg_event_remove);

		efile = fdget(efd);
		if (!efile.file) {
		ret = -EBADF;
		goto out_kfree;
		}

		event->eventfd = eventfd_ctx_fileget(efile.file);
		if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
		}

		cfile = fdget(cfd);
		if (!cfile.file) {
		ret = -EBADF;
		goto out_put_eventfd;
		}

		/* the process need read permission on control file */
		/* AV: shouldn't we check that it's been opened for read instead? */
		ret = file_permission(cfile.file, MAY_READ);
		if (ret < 0)
		goto out_put_cfile;

		/*
		* The control file must be a regular cgroup1 file. As a regular cgroup
		* file can't be renamed, it's safe to access its name afterwards.
		*/
		cdentry = cfile.file->f_path.dentry;
		if (cdentry->d_sb->s_type != &cgroup_fs_type \|\| !d_is_reg(cdentry)) {
		ret = -EINVAL;
		goto out_put_cfile;
		}

		/*
		* Determine the event callbacks and set them in @event. This used
		* to be done via struct cftype but cgroup core no longer knows
		* about these events. The following is crude but the whole thing
		* is for compatibility anyway.
		*
		* DO NOT ADD NEW FILES.
		*/
		name = cdentry->d_name.name;

		if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
		} else if (!strcmp(name, "memory.oom_control")) {
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
		} else if (!strcmp(name, "memory.pressure_level")) {
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
		} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
		} else {
		ret = -EINVAL;
		goto out_put_cfile;
		}

		/*
		* Verify @cfile should belong to @css. Also, remaining events are
		* automatically removed on cgroup destruction but the removal is
		* asynchronous, so take an extra ref on @css.
		*/
		cfile_css = css_tryget_online_from_dir(cdentry->d_parent,
		&memory_cgrp_subsys);
		ret = -EINVAL;
		if (IS_ERR(cfile_css))
		goto out_put_cfile;
		if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
		}

		ret = event->register_event(memcg, event->eventfd, buf);
		if (ret)
		goto out_put_css;

		vfs_poll(efile.file, &event->pt);

		spin_lock_irq(&memcg->event_list_lock);
		list_add(&event->list, &memcg->event_list);
		spin_unlock_irq(&memcg->event_list_lock);

		fdput(cfile);
		fdput(efile);

		return nbytes;

		out_put_css:
		css_put(css);
		out_put_cfile:
		fdput(cfile);
		out_put_eventfd:
		eventfd_ctx_put(event->eventfd);
		out_put_efile:
		fdput(efile);
		out_kfree:
		kfree(event);

		return ret;
		}

		void memcg1_css_offline(struct mem_cgroup *memcg)
		{
		struct mem_cgroup_event event, tmp;

		/*
		* Unregister events and notify userspace.
		* Notify userspace about cgroup removing only after rmdir of cgroup
		* directory to avoid race between userspace and kernelspace.
		*/
		spin_lock_irq(&memcg->event_list_lock);
		list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
		}
		spin_unlock_irq(&memcg->event_list_lock);
		}

		static int __init memcg1_init(void)
		{
		int node;

mm/memcontrol-v1.h

+51 −0

Original line number	Diff line number	Diff line
		@@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
		int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
		struct cftype *cft, u64 val);

		/*
		* Per memcg event counter is incremented at every pagein/pageout. With THP,
		* it will be incremented by the number of pages. This counter is used
		* to trigger some periodic events. This is straightforward and better
		* than using jiffies etc. to handle periodic memcg event.
		*/
		enum mem_cgroup_events_target {
		MEM_CGROUP_TARGET_THRESH,
		MEM_CGROUP_TARGET_SOFTLIMIT,
		MEM_CGROUP_NTARGETS,
		};

		/* Whether legacy memory+swap accounting is active */
		static bool do_memsw_account(void)
		{
		return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
		}

		/*
		* Iteration constructs for visiting all cgroups (under a tree). If
		* loops are exited prematurely (break), mem_cgroup_iter_break() must
		* be used for reference counting.
		*/
		#define for_each_mem_cgroup_tree(iter, root) \
		for (iter = mem_cgroup_iter(root, NULL, NULL); \
		iter != NULL; \
		iter = mem_cgroup_iter(root, iter, NULL))

		#define for_each_mem_cgroup(iter) \
		for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
		iter != NULL; \
		iter = mem_cgroup_iter(NULL, iter, NULL))

		void memcg1_css_offline(struct mem_cgroup *memcg);

		/* for encoding cft->private value on file */
		enum res_type {
		_MEM,
		_MEMSWAP,
		_KMEM,
		_TCP,
		};

		bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
		enum mem_cgroup_events_target target);
		unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
		void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
		ssize_t memcg_write_event_control(struct kernfs_open_file *of,
		char *buf, size_t nbytes, loff_t off);


		#endif /* __MM_MEMCONTROL_V1_H */

mm/memcontrol.c

+5 −682

File changed.

Preview size limit exceeded, changes collapsed.