slubのコードを読む：__memcg_kmem_get_cacheの処理を読む - φ(・・*)ゞｳｰﾝ　カーネルとか弄ったりのメモ

この記事はLinux Advent Calendar 2014の3日目の記事です。

kmem_cache_zalloc()、名前だけ見ると単純そうな気がするけど割とそうでもないのでした。この関数は最初のほうは別の関数へのラッパー程度になっていますが、memcg_kmem_get_cache()、__memcg_kmem_get_cache()辺りでちゃんとした処理になります。

これらは kmem_cache_zalloc()から呼ばれます。do_kmem_cache_create()が kmem_cache_zalloc()を呼んでいるので、まずはここからスタートします。一番目の引数kmem_cacheはstruct kmem_cache *な変数でmm/slab_common.cで定義されていますが、mm/slab.hの方でextern付きで宣言されています。これはカーネルの初期化のタイミングで設定される変数だと思います。変数名と構造体名が一緒なのでlxrで使っている箇所を探しづらい・・・

136 static struct kmem_cache *
137 do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
138                      unsigned long flags, void (*ctor)(void *),
139                      struct mem_cgroup *memcg, struct kmem_cache *root_cache)
140 {
141         struct kmem_cache *s;
142         int err;
143 
144         err = -ENOMEM;
145         s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
146         if (!s)
147                 goto out;

kmem_cache_zalloc()はこのような関数でGFPフラグに__GFP_ZEROをつけてて納得の処理です。

627 static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
628 {
629         return kmem_cache_alloc(k, flags | __GFP_ZERO);
630 }
631

kmem_cache_alloc()はこうです。ここは単にslab_alloc()を呼ぶだけです。

2457 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2458 {
2459         void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2460 
2461         trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
2462                                 s->size, gfpflags);
2463 
2464         return ret;
2465 }

次のslab_alloc()はslab_alloc_node()を呼びます。これもNUMAがあるから適切なnodeからメモリを確保したいよねという感じですね。

2451 static __always_inline void *slab_alloc(struct kmem_cache *s,
2452                 gfp_t gfpflags, unsigned long addr)
2453 {
2454         return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2455 }

slab_alloc_node()の最初のほうでmemcg_kmem_get_cache()が呼ばれます。slab_pre_alloc_hook()はここでは気にしなくてもOKです。

2373 static __always_inline void *slab_alloc_node(struct kmem_cache *s,
2374                 gfp_t gfpflags, int node, unsigned long addr)
2375 {
2376         void **object;
2377         struct kmem_cache_cpu *c;
2378         struct page *page;
2379         unsigned long tid;
2380 
2381         if (slab_pre_alloc_hook(s, gfpflags))
2382                 return NULL;
2383 
2384         s = memcg_kmem_get_cache(s, gfpflags);
2385 redo:

そしてmemcg_kmem_get_cache()です。一つ一つのif文の意味はあとにして、ここで何をするかというと、cachepを返すパターンというのはkmem_cache_zalloc()の第1引数を返すということで、グローバルに設定されているkmem_cache変数をスラブの管理に使う感じですね。そうじゃない場合に__memcg_kmem_get_cache()が呼ばれます。

534 static __always_inline struct kmem_cache *
535 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
536 {
537         if (!memcg_kmem_enabled())
538                 return cachep;
539         if (gfp & __GFP_NOFAIL)
540                 return cachep;
541         if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
542                 return cachep;
543         if (unlikely(fatal_signal_pending(current)))
544                 return cachep;
545 
546         return __memcg_kmem_get_cache(cachep, gfp);
547 }

__memcg_kmem_get_cache()を見ていきます。最初は飛ばして・・・

3249 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3250                                           gfp_t gfp)
3251 {
3252         struct mem_cgroup *memcg;
3253         struct kmem_cache *memcg_cachep;
3254 
3255         VM_BUG_ON(!cachep->memcg_params);
3256         VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3257

まず、current->mmがNULLな場合というのはcurrentプロセスがカーネルスレッドの場合で、この場合はcachepを返す。もしくは current->memcg_kmem_skip_accountが0じゃない場合のところですが、これは後々出てくるんだけど、memcg_schedule_register_cache()が内部でkmalloc()を使います。この時に（kmallocはスラブキャッシュから適切なサイズのメモリを返します）取得しようとしたサイズにあうキャッシュが存在しなかった場合、キャッシュの作成処理が必要なのですが、そうするとスラブキャッシュの作成処理でまた__memcg_kmem_get_cache()が呼ばれてくるので、その時はcachepを返しているようです。

3258         if (!current->mm || current->memcg_kmem_skip_account)
3259                 return cachep;
3260

ここはstruct task_structのcurrentからmem_cgroupを取得するだけ。

3261         rcu_read_lock();
3262         memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3263

memory cgroupでメモリの使用量をアカウンティングできるかどうかのチェック。

3264         if (!memcg_can_account_kmem(memcg))
3265                 goto out;
3266

cache_from_memcg_idx() ここはstruct kmem_cache構造体が持っているmemory cgroupからキャッシュがあるか調べて見つかったらそれをreturnする。

3267         memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3268         if (likely(memcg_cachep)) {
3269                 cachep = memcg_cachep;
3270                 goto out;
3271         }
3272

cssとはcgroup subsys stateの模様。渡しているのはstruct cgroup_subsys_state *。

3273         /* The corresponding put will be done in the workqueue. */
3274         if (!css_tryget_online(&memcg->css))
3275                 goto out;
3276         rcu_read_unlock();
3277

ここでの最後の処理はmemcg_schedule_register_cache()を呼んでキャッシュの登録。

3278         /*
3279          * If we are in a safe context (can wait, and not in interrupt
3280          * context), we could be be predictable and return right away.
3281          * This would guarantee that the allocation being performed
3282          * already belongs in the new cache.
3283          *
3284          * However, there are some clashes that can arrive from locking.
3285          * For instance, because we acquire the slab_mutex while doing
3286          * memcg_create_kmem_cache, this means no further allocation
3287          * could happen with the slab_mutex held. So it's better to
3288          * defer everything.
3289          */
3290         memcg_schedule_register_cache(memcg, cachep);
3291         return cachep;
3292 out:
3293         rcu_read_unlock();
3294         return cachep;
3295 }

memcg_schedule_register_cache()を見る前にcache_from_memcg_idx()を先に。ここはほぼ見ての通りで、params->memcg_caches[idx]がcachepに設定される。

152 static inline struct kmem_cache *
153 cache_from_memcg_idx(struct kmem_cache *s, int idx)
154 {
155         struct kmem_cache *cachep;
156         struct memcg_cache_params *params;
157 
158         if (!s->memcg_params)
159                 return NULL;
160 
161         rcu_read_lock();
162         params = rcu_dereference(s->memcg_params);
163         cachep = params->memcg_caches[idx];
164         rcu_read_unlock();
165 
166         /*
167          * Make sure we will access the up-to-date value. The code updating
168          * memcg_caches issues a write barrier to match this (see
169          * memcg_register_cache()).
170          */
171         smp_read_barrier_depends();
172         return cachep;
173 }

ここでデータ構造を確認しておくと、struct kmem_cache構造体にmemcg_paramsというメンバ（struct memcg_cache_paramsへのポインタ）があって、cache_from_memcg_idx()で見ているのはグローバル変数のkmem_cacheに設定されているmemcg_params。

 87 #ifdef CONFIG_MEMCG_KMEM
 88         struct memcg_cache_params *memcg_params;

struct memcg_cache_paramsはこのような構造体。

531 struct memcg_cache_params {
532         bool is_root_cache;
533         union {
534                 struct {
535                         struct rcu_head rcu_head;
536                         struct kmem_cache *memcg_caches[0];
537                 };
538                 struct {
539                         struct mem_cgroup *memcg;
540                         struct list_head list;
541                         struct kmem_cache *root_cache;
542                         atomic_t nr_pages;
543                 };
544         };
545 };

で、memcg_schedule_register_cache()に進む。ここは__memcg_schedule_register_cache()の前後でアカウンティングのストップとリジュームがありますねという程度。なぜこれをやっているかはコメント参照。kmalloc云々というのは先程の__memcg_kmem_get_cache()でアカウンティングできるかどうか調べていたところに関係してます。

3200 static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3201                                           struct kmem_cache *cachep)
3202 {
3203         /*
3204          * We need to stop accounting when we kmalloc, because if the
3205          * corresponding kmalloc cache is not yet created, the first allocation
3206          * in __memcg_schedule_register_cache will recurse.
3207          *
3208          * However, it is better to enclose the whole function. Depending on
3209          * the debugging options enabled, INIT_WORK(), for instance, can
3210          * trigger an allocation. This too, will make us recurse. Because at
3211          * this point we can't allow ourselves back into memcg_kmem_get_cache,
3212          * the safest choice is to do it like this, wrapping the whole function.
3213          */
3214         memcg_stop_kmem_account();
3215         __memcg_schedule_register_cache(memcg, cachep);
3216         memcg_resume_kmem_account();
3217 }
3218

そして__memcg_schedule_register_cache()。ここでやりたいのはワークキューに突っ込むデータを作って時が来たらmemcg_register_cache_func()を実行するというもの。

3182 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
3183                                             struct kmem_cache *cachep)
3184 {
3185         struct memcg_register_cache_work *cw;
3186 
3187         cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3188         if (cw == NULL) {
3189                 css_put(&memcg->css);
3190                 return;
3191         }
3192 
3193         cw->memcg = memcg;
3194         cw->cachep = cachep;
3195 
3196         INIT_WORK(&cw->work, memcg_register_cache_func);
3197         schedule_work(&cw->work);
3198 }

memcg_register_cache_func()はこれで、主要なのはmemcg_register_cache()。

3164 static void memcg_register_cache_func(struct work_struct *w)
3165 {
3166         struct memcg_register_cache_work *cw =
3167                 container_of(w, struct memcg_register_cache_work, work);
3168         struct mem_cgroup *memcg = cw->memcg;
3169         struct kmem_cache *cachep = cw->cachep;
3170 
3171         mutex_lock(&memcg_slab_mutex);
3172         memcg_register_cache(memcg, cachep);
3173         mutex_unlock(&memcg_slab_mutex);
3174 
3175         css_put(&memcg->css);
3176         kfree(cw);
3177 }

memcg_register_cache()はこのような関数。

3024 static void memcg_register_cache(struct mem_cgroup *memcg,
3025                                  struct kmem_cache *root_cache)
3026 {
3027         static char memcg_name_buf[NAME_MAX + 1]; /* protected by
3028                                                      memcg_slab_mutex */
3029         struct kmem_cache *cachep;
3030         int id;
3031 
3032         lockdep_assert_held(&memcg_slab_mutex);
3033

memory cgroupは構造体にidを振って管理しているらしい。

3034         id = memcg_cache_id(memcg);
3035

またcache_from_memcg_idx()を呼んでいる。ここで渡しているのは今までも使っていたcachep（この関数内ではroot_cacheという引数名）。ここで見つかればそれで良しということか。戻り値そのものは捨てている。

3036         /*
3037          * Since per-memcg caches are created asynchronously on first
3038          * allocation (see memcg_kmem_get_cache()), several threads can try to
3039          * create the same cache, but only one of them may succeed.
3040          */
3041         if (cache_from_memcg_idx(root_cache, id))
3042                 return;
3043

これは本当にcgroupの名前をmemcg_name_bufに取得するだけか。

3044         cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);

memcg_create_kmem_cache()はその名の通りの処理でしょう。詳細は後ほど。

3045         cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
3046         /*
3047          * If we could not create a memcg cache, do not complain, because
3048          * that's not critical at all as we can always proceed with the root
3049          * cache.
3050          */
3051         if (!cachep)
3052                 return;
3053

キャッシュが作れたらリストに登録。

3054         list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3055 
3056         /*
3057          * Since readers won't lock (see cache_from_memcg_idx()), we need a
3058          * barrier here to ensure nobody will see the kmem_cache partially
3059          * initialized.
3060          */
3061         smp_wmb();
3062

最後にmemcg_paramsのmemcg_cachesのid番目にcachepを設定する。

3063         BUG_ON(root_cache->memcg_params->memcg_caches[id]);
3064         root_cache->memcg_params->memcg_caches[id] = cachep;
3065 }

memcg_create_kmem_cache()です。やっているのは、キャッシュの名前を作ってdo_kmem_cache_create()を呼ぶこと。そして、また今日の日記の最初からスタートという流れですね。

275 struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
276                                            struct kmem_cache *root_cache,
277                                            const char *memcg_name)
278 {
279         struct kmem_cache *s = NULL;
280         char *cache_name;
281 
282         get_online_cpus();
283         get_online_mems();
284 
285         mutex_lock(&slab_mutex);
286 
287         cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
288                                memcg_cache_id(memcg), memcg_name);
289         if (!cache_name)
290                 goto out_unlock;
291 
292         s = do_kmem_cache_create(cache_name, root_cache->object_size,
293                                  root_cache->size, root_cache->align,
294                                  root_cache->flags, root_cache->ctor,
295                                  memcg, root_cache);
296         if (IS_ERR(s)) {
297                 kfree(cache_name);
298                 s = NULL;
299         }
300 
301 out_unlock:
302         mutex_unlock(&slab_mutex);
303 
304         put_online_mems();
305         put_online_cpus();
306 
307         return s;
308 }