summaryrefslogtreecommitdiff
path: root/include/linux/memcontrol.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/memcontrol.h')
-rw-r--r--include/linux/memcontrol.h277
1 files changed, 211 insertions, 66 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 977edd3b7bd8..d0b036123c6a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,19 +23,16 @@
#include <linux/page-flags.h>
struct mem_cgroup;
+struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
- MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- MEMCG_SWAP,
+ MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
- /* XXX: why are these zone and not node counters? */
- MEMCG_KERNEL_STACK_KB,
+ MEMCG_PERCPU_B,
MEMCG_NR_STAT,
};
@@ -45,17 +42,12 @@ enum memcg_memory_event {
MEMCG_MAX,
MEMCG_OOM,
MEMCG_OOM_KILL,
+ MEMCG_SWAP_HIGH,
MEMCG_SWAP_MAX,
MEMCG_SWAP_FAIL,
MEMCG_NR_MEMORY_EVENTS,
};
-enum mem_cgroup_protection {
- MEMCG_PROT_NONE,
- MEMCG_PROT_LOW,
- MEMCG_PROT_MIN,
-};
-
struct mem_cgroup_reclaim_cookie {
pg_data_t *pgdat;
unsigned int generation;
@@ -73,8 +65,8 @@ struct mem_cgroup_id {
/*
* Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
+ * it will be incremented by the number of pages. This counter is used
+ * to trigger some periodic events. This is straightforward and better
* than using jiffies etc. to handle periodic memcg event.
*/
enum mem_cgroup_events_target {
@@ -195,6 +187,22 @@ struct memcg_cgwb_frn {
};
/*
+ * Bucket for arbitrarily byte-sized objects charged to a memory
+ * cgroup. The bucket can be reparented in one piece when the cgroup
+ * is destroyed, without having to round up the individual references
+ * of all live memory objects in the wild.
+ */
+struct obj_cgroup {
+ struct percpu_ref refcnt;
+ struct mem_cgroup *memcg;
+ atomic_t nr_charged_bytes;
+ union {
+ struct list_head list;
+ struct rcu_head rcu;
+ };
+};
+
+/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -215,9 +223,6 @@ struct mem_cgroup {
struct page_counter kmem;
struct page_counter tcpmem;
- /* Upper bound of normal memory consumption range */
- unsigned long high;
-
/* Range enforcement for interrupt charges */
struct work_struct high_work;
@@ -305,7 +310,8 @@ struct mem_cgroup {
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
- struct list_head kmem_caches;
+ struct obj_cgroup __rcu *objcg;
+ struct list_head objcg_list; /* list of inherited objcgs */
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -334,6 +340,13 @@ struct mem_cgroup {
extern struct mem_cgroup *root_mem_cgroup;
+static __always_inline bool memcg_stat_item_in_bytes(int idx)
+{
+ if (idx == MEMCG_PERCPU_B)
+ return true;
+ return vmstat_item_in_bytes(idx);
+}
+
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
return (memcg == root_mem_cgroup);
@@ -344,12 +357,49 @@ static inline bool mem_cgroup_disabled(void)
return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
if (mem_cgroup_disabled())
return 0;
+ /*
+ * There is no reclaim protection applied to a targeted reclaim.
+ * We are special casing this specific case here because
+ * mem_cgroup_protected calculation is not robust enough to keep
+ * the protection invariant for calculated effective values for
+ * parallel reclaimers with different reclaim target. This is
+ * especially a problem for tail memcgs (as they have pages on LRU)
+ * which would want to have effective values 0 for targeted reclaim
+ * but a different value for external reclaim.
+ *
+ * Example
+ * Let's have global and A's reclaim in parallel:
+ * |
+ * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
+ * |\
+ * | C (low = 1G, usage = 2.5G)
+ * B (low = 1G, usage = 0.5G)
+ *
+ * For the global reclaim
+ * A.elow = A.low
+ * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
+ * C.elow = min(C.usage, C.low)
+ *
+ * With the effective values resetting we have A reclaim
+ * A.elow = 0
+ * B.elow = B.low
+ * C.elow = C.low
+ *
+ * If the global reclaim races with A's reclaim then
+ * B.elow = C.elow = 0 because children_low_usage > A.elow)
+ * is possible and reclaiming B would be violating the protection.
+ *
+ */
+ if (root == memcg)
+ return 0;
+
if (in_low_reclaim)
return READ_ONCE(memcg->memory.emin);
@@ -357,19 +407,39 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
READ_ONCE(memcg->memory.elow));
}
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg);
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg);
+
+static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg)
+{
+ /*
+ * The root memcg doesn't account charges, and doesn't support
+ * protection.
+ */
+ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg);
+
+}
+
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_supports_protection(memcg))
+ return false;
+
+ return READ_ONCE(memcg->memory.elow) >=
+ page_counter_read(&memcg->memory);
+}
+
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
+{
+ if (!mem_cgroup_supports_protection(memcg))
+ return false;
+
+ return READ_ONCE(memcg->memory.emin) >=
+ page_counter_read(&memcg->memory);
+}
+
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -429,6 +499,33 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}
+static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
+{
+ return percpu_ref_tryget(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_get(struct obj_cgroup *objcg)
+{
+ percpu_ref_get(&objcg->refcnt);
+}
+
+static inline void obj_cgroup_put(struct obj_cgroup *objcg)
+{
+ percpu_ref_put(&objcg->refcnt);
+}
+
+/*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+ return READ_ONCE(objcg->memcg);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -533,7 +630,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
struct mem_cgroup_per_node *mz;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- return mz->lru_zone_size[zone_idx][lru];
+ return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
}
void mem_cgroup_handle_over_high(void);
@@ -570,7 +667,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_SWAP
-extern int do_swap_account;
+extern bool cgroup_memory_noswap;
#endif
struct mem_cgroup *lock_page_memcg(struct page *page);
@@ -692,11 +789,34 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return x;
}
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val);
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
int val);
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val);
+
void mod_memcg_obj_state(void *p, int idx, int val);
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+ int val)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_lruvec_slab_state(p, idx, val);
+ local_irq_restore(flags);
+}
+
+static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx, int val)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+ local_irq_restore(flags);
+}
+
static inline void mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
@@ -710,16 +830,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
static inline void __mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!page->mem_cgroup) {
+ if (!head->mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
return;
}
- lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat);
+ lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@@ -837,47 +958,32 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
{
}
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
+static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg,
bool in_low_reclaim)
{
return 0;
}
-static inline enum mem_cgroup_protection mem_cgroup_protected(
- struct mem_cgroup *root, struct mem_cgroup *memcg)
-{
- return MEMCG_PROT_NONE;
-}
-
-static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
+static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
- *memcgp = NULL;
- return 0;
}
-static inline int mem_cgroup_try_charge_delay(struct page *page,
- struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
+static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg)
{
- *memcgp = NULL;
- return 0;
+ return false;
}
-static inline void mem_cgroup_commit_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool lrucare, bool compound)
+static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
{
+ return false;
}
-static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool compound)
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
+ return 0;
}
static inline void mem_cgroup_uncharge(struct page *page)
@@ -1094,6 +1200,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
}
+static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+ enum node_stat_item idx, int val)
+{
+}
+
static inline void __mod_lruvec_state(struct lruvec *lruvec,
enum node_stat_item idx, int val)
{
@@ -1126,6 +1237,14 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx,
__mod_node_page_state(page_pgdat(page), idx, val);
}
+static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx,
+ int val)
+{
+ struct page *page = virt_to_head_page(p);
+
+ mod_node_page_state(page_pgdat(page), idx, val);
+}
+
static inline void mod_memcg_obj_state(void *p, int idx, int val)
{
}
@@ -1279,6 +1398,19 @@ static inline void dec_lruvec_page_state(struct page *page,
mod_lruvec_page_state(page, idx, -1);
}
+static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = lruvec_memcg(lruvec);
+ if (!memcg)
+ return NULL;
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
@@ -1365,9 +1497,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
}
#endif
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
-void memcg_kmem_put_cache(struct kmem_cache *cachep);
-
#ifdef CONFIG_MEMCG_KMEM
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
unsigned int nr_pages);
@@ -1375,8 +1504,12 @@ void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);
+struct obj_cgroup *get_obj_cgroup_from_current(void);
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
+
extern struct static_key_false memcg_kmem_enabled_key;
-extern struct workqueue_struct *memcg_kmem_cache_wq;
extern int memcg_nr_cache_ids;
void memcg_get_cache_ids(void);
@@ -1392,7 +1525,19 @@ void memcg_put_cache_ids(void);
static inline bool memcg_kmem_enabled(void)
{
- return static_branch_unlikely(&memcg_kmem_enabled_key);
+ return static_branch_likely(&memcg_kmem_enabled_key);
+}
+
+static inline bool memcg_kmem_bypass(void)
+{
+ if (in_interrupt())
+ return true;
+
+ /* Allow remote memcg charging in kthread contexts. */
+ if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+ !current->active_memcg)
+ return true;
+ return false;
}
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,