mm: memcg: move cgroup v1 oom handling code into memcontrol-v1.c (292fc2e0) · Commits · git / linux-nf

mm/memcontrol-v1.c

+227 −2

Original line number	Diff line number	Diff line
		@@ -110,7 +110,13 @@ struct mem_cgroup_event {
		struct work_struct remove;
		};

		extern spinlock_t memcg_oom_lock;
		#ifdef CONFIG_LOCKDEP
		static struct lockdep_map memcg_oom_lock_dep_map = {
		.name = "memcg_oom_lock",
		};
		#endif

		DEFINE_SPINLOCK(memcg_oom_lock);

		static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
		struct mem_cgroup_tree_per_node *mctz,
		@@ -1469,7 +1475,7 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
		return 0;
		}

		void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
		static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		@@ -1959,6 +1965,225 @@ void memcg1_css_offline(struct mem_cgroup *memcg)
		spin_unlock_irq(&memcg->event_list_lock);
		}

		/*
		* Check OOM-Killer is already running under our hierarchy.
		* If someone is running, return false.
		*/
		static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
		{
		struct mem_cgroup iter, failed = NULL;

		spin_lock(&memcg_oom_lock);

		for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
		/*
		* this subtree of our hierarchy is already locked
		* so we cannot give a lock.
		*/
		failed = iter;
		mem_cgroup_iter_break(memcg, iter);
		break;
		} else
		iter->oom_lock = true;
		}

		if (failed) {
		/*
		* OK, we failed to lock the whole subtree so we have
		* to clean up what we set up to the failing subtree
		*/
		for_each_mem_cgroup_tree(iter, memcg) {
		if (iter == failed) {
		mem_cgroup_iter_break(memcg, iter);
		break;
		}
		iter->oom_lock = false;
		}
		} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

		spin_unlock(&memcg_oom_lock);

		return !failed;
		}

		static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		spin_lock(&memcg_oom_lock);
		mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
		for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
		spin_unlock(&memcg_oom_lock);
		}

		static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		spin_lock(&memcg_oom_lock);
		for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
		spin_unlock(&memcg_oom_lock);
		}

		static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		/*
		* Be careful about under_oom underflows because a child memcg
		* could have been added after mem_cgroup_mark_under_oom.
		*/
		spin_lock(&memcg_oom_lock);
		for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
		iter->under_oom--;
		spin_unlock(&memcg_oom_lock);
		}

		static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

		struct oom_wait_info {
		struct mem_cgroup *memcg;
		wait_queue_entry_t wait;
		};

		static int memcg_oom_wake_function(wait_queue_entry_t *wait,
		unsigned mode, int sync, void *arg)
		{
		struct mem_cgroup wake_memcg = (struct mem_cgroup )arg;
		struct mem_cgroup *oom_wait_memcg;
		struct oom_wait_info *oom_wait_info;

		oom_wait_info = container_of(wait, struct oom_wait_info, wait);
		oom_wait_memcg = oom_wait_info->memcg;

		if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
		!mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
		return autoremove_wake_function(wait, mode, sync, arg);
		}

		void memcg_oom_recover(struct mem_cgroup *memcg)
		{
		/*
		* For the following lockless ->under_oom test, the only required
		* guarantee is that it must see the state asserted by an OOM when
		* this function is called as a result of userland actions
		* triggered by the notification of the OOM. This is trivially
		* achieved by invoking mem_cgroup_mark_under_oom() before
		* triggering notification.
		*/
		if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
		}

		/**
		* mem_cgroup_oom_synchronize - complete memcg OOM handling
		* @handle: actually kill/wait or just clean up the OOM state
		*
		* This has to be called at the end of a page fault if the memcg OOM
		* handler was enabled.
		*
		* Memcg supports userspace OOM handling where failed allocations must
		* sleep on a waitqueue until the userspace task resolves the
		* situation. Sleeping directly in the charge context with all kinds
		* of locks held is not a good idea, instead we remember an OOM state
		* in the task and mem_cgroup_oom_synchronize() has to be called at
		* the end of the page fault to complete the OOM handling.
		*
		* Returns %true if an ongoing memcg OOM situation was detected and
		* completed, %false otherwise.
		*/
		bool mem_cgroup_oom_synchronize(bool handle)
		{
		struct mem_cgroup *memcg = current->memcg_in_oom;
		struct oom_wait_info owait;
		bool locked;

		/* OOM is global, do not handle */
		if (!memcg)
		return false;

		if (!handle)
		goto cleanup;

		owait.memcg = memcg;
		owait.wait.flags = 0;
		owait.wait.func = memcg_oom_wake_function;
		owait.wait.private = current;
		INIT_LIST_HEAD(&owait.wait.entry);

		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
		mem_cgroup_mark_under_oom(memcg);

		locked = mem_cgroup_oom_trylock(memcg);

		if (locked)
		mem_cgroup_oom_notify(memcg);

		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);

		if (locked)
		mem_cgroup_oom_unlock(memcg);
		cleanup:
		current->memcg_in_oom = NULL;
		css_put(&memcg->css);
		return true;
		}


		bool memcg1_oom_prepare(struct mem_cgroup memcg, bool locked)
		{
		/*
		* We are in the middle of the charge context here, so we
		* don't want to block when potentially sitting on a callstack
		* that holds all kinds of filesystem and mm locks.
		*
		* cgroup1 allows disabling the OOM killer and waiting for outside
		* handling until the charge can succeed; remember the context and put
		* the task to sleep at the end of the page fault when all locks are
		* released.
		*
		* On the other hand, in-kernel OOM killer allows for an async victim
		* memory reclaim (oom_reaper) and that means that we are not solely
		* relying on the oom victim to make a forward progress and we can
		* invoke the oom killer here.
		*
		* Please note that mem_cgroup_out_of_memory might fail to find a
		* victim and then we have to bail out from the charge path.
		*/
		if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		}
		return false;
		}

		mem_cgroup_mark_under_oom(memcg);

		*locked = mem_cgroup_oom_trylock(memcg);

		if (*locked)
		mem_cgroup_oom_notify(memcg);

		mem_cgroup_unmark_under_oom(memcg);

		return true;
		}

		void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked)
		{
		if (locked)
		mem_cgroup_oom_unlock(memcg);
		}

		static int __init memcg1_init(void)
		{
		int node;

mm/memcontrol-v1.h

+2 −1

Original line number	Diff line number	Diff line
		@@ -87,9 +87,10 @@ enum res_type {
		bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
		enum mem_cgroup_events_target target);
		unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
		void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
		ssize_t memcg_write_event_control(struct kernfs_open_file *of,
		char *buf, size_t nbytes, loff_t off);

		bool memcg1_oom_prepare(struct mem_cgroup memcg, bool locked);
		void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);

		#endif /* __MM_MEMCONTROL_V1_H */

mm/memcontrol.c

+2 −214

Original line number	Diff line number	Diff line
		@@ -1615,130 +1615,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
		return ret;
		}

		#ifdef CONFIG_LOCKDEP
		static struct lockdep_map memcg_oom_lock_dep_map = {
		.name = "memcg_oom_lock",
		};
		#endif

		DEFINE_SPINLOCK(memcg_oom_lock);

		/*
		* Check OOM-Killer is already running under our hierarchy.
		* If someone is running, return false.
		*/
		static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
		{
		struct mem_cgroup iter, failed = NULL;

		spin_lock(&memcg_oom_lock);

		for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
		/*
		* this subtree of our hierarchy is already locked
		* so we cannot give a lock.
		*/
		failed = iter;
		mem_cgroup_iter_break(memcg, iter);
		break;
		} else
		iter->oom_lock = true;
		}

		if (failed) {
		/*
		* OK, we failed to lock the whole subtree so we have
		* to clean up what we set up to the failing subtree
		*/
		for_each_mem_cgroup_tree(iter, memcg) {
		if (iter == failed) {
		mem_cgroup_iter_break(memcg, iter);
		break;
		}
		iter->oom_lock = false;
		}
		} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

		spin_unlock(&memcg_oom_lock);

		return !failed;
		}

		static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		spin_lock(&memcg_oom_lock);
		mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
		for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
		spin_unlock(&memcg_oom_lock);
		}

		static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		spin_lock(&memcg_oom_lock);
		for_each_mem_cgroup_tree(iter, memcg)
		iter->under_oom++;
		spin_unlock(&memcg_oom_lock);
		}

		static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
		{
		struct mem_cgroup *iter;

		/*
		* Be careful about under_oom underflows because a child memcg
		* could have been added after mem_cgroup_mark_under_oom.
		*/
		spin_lock(&memcg_oom_lock);
		for_each_mem_cgroup_tree(iter, memcg)
		if (iter->under_oom > 0)
		iter->under_oom--;
		spin_unlock(&memcg_oom_lock);
		}

		static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

		struct oom_wait_info {
		struct mem_cgroup *memcg;
		wait_queue_entry_t wait;
		};

		static int memcg_oom_wake_function(wait_queue_entry_t *wait,
		unsigned mode, int sync, void *arg)
		{
		struct mem_cgroup wake_memcg = (struct mem_cgroup )arg;
		struct mem_cgroup *oom_wait_memcg;
		struct oom_wait_info *oom_wait_info;

		oom_wait_info = container_of(wait, struct oom_wait_info, wait);
		oom_wait_memcg = oom_wait_info->memcg;

		if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
		!mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
		return autoremove_wake_function(wait, mode, sync, arg);
		}

		void memcg_oom_recover(struct mem_cgroup *memcg)
		{
		/*
		* For the following lockless ->under_oom test, the only required
		* guarantee is that it must see the state asserted by an OOM when
		* this function is called as a result of userland actions
		* triggered by the notification of the OOM. This is trivially
		* achieved by invoking mem_cgroup_mark_under_oom() before
		* triggering notification.
		*/
		if (memcg && memcg->under_oom)
		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
		}

		/*
		* Returns true if successfully killed one or more processes. Though in some
		* corner cases it can return true even without killing any process.
		@@ -1752,104 +1628,16 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)

		memcg_memory_event(memcg, MEMCG_OOM);

		/*
		* We are in the middle of the charge context here, so we
		* don't want to block when potentially sitting on a callstack
		* that holds all kinds of filesystem and mm locks.
		*
		* cgroup1 allows disabling the OOM killer and waiting for outside
		* handling until the charge can succeed; remember the context and put
		* the task to sleep at the end of the page fault when all locks are
		* released.
		*
		* On the other hand, in-kernel OOM killer allows for an async victim
		* memory reclaim (oom_reaper) and that means that we are not solely
		* relying on the oom victim to make a forward progress and we can
		* invoke the oom killer here.
		*
		* Please note that mem_cgroup_out_of_memory might fail to find a
		* victim and then we have to bail out from the charge path.
		*/
		if (READ_ONCE(memcg->oom_kill_disable)) {
		if (current->in_user_fault) {
		css_get(&memcg->css);
		current->memcg_in_oom = memcg;
		}
		if (!memcg1_oom_prepare(memcg, &locked))
		return false;
		}

		mem_cgroup_mark_under_oom(memcg);

		locked = mem_cgroup_oom_trylock(memcg);

		if (locked)
		mem_cgroup_oom_notify(memcg);

		mem_cgroup_unmark_under_oom(memcg);
		ret = mem_cgroup_out_of_memory(memcg, mask, order);

		if (locked)
		mem_cgroup_oom_unlock(memcg);
		memcg1_oom_finish(memcg, locked);

		return ret;
		}

		/**
		* mem_cgroup_oom_synchronize - complete memcg OOM handling
		* @handle: actually kill/wait or just clean up the OOM state
		*
		* This has to be called at the end of a page fault if the memcg OOM
		* handler was enabled.
		*
		* Memcg supports userspace OOM handling where failed allocations must
		* sleep on a waitqueue until the userspace task resolves the
		* situation. Sleeping directly in the charge context with all kinds
		* of locks held is not a good idea, instead we remember an OOM state
		* in the task and mem_cgroup_oom_synchronize() has to be called at
		* the end of the page fault to complete the OOM handling.
		*
		* Returns %true if an ongoing memcg OOM situation was detected and
		* completed, %false otherwise.
		*/
		bool mem_cgroup_oom_synchronize(bool handle)
		{
		struct mem_cgroup *memcg = current->memcg_in_oom;
		struct oom_wait_info owait;
		bool locked;

		/* OOM is global, do not handle */
		if (!memcg)
		return false;

		if (!handle)
		goto cleanup;

		owait.memcg = memcg;
		owait.wait.flags = 0;
		owait.wait.func = memcg_oom_wake_function;
		owait.wait.private = current;
		INIT_LIST_HEAD(&owait.wait.entry);

		prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
		mem_cgroup_mark_under_oom(memcg);

		locked = mem_cgroup_oom_trylock(memcg);

		if (locked)
		mem_cgroup_oom_notify(memcg);

		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);

		if (locked)
		mem_cgroup_oom_unlock(memcg);
		cleanup:
		current->memcg_in_oom = NULL;
		css_put(&memcg->css);
		return true;
		}

		/**
		* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
		* @victim: task to be killed by the OOM killer