linux/drivers/block/loop.h
Dan Schatzberg 87579e9b7d loop: use worker per cgroup instead of kworker
Patch series "Charge loop device i/o to issuing cgroup", v14.

The loop device runs all i/o to the backing file on a separate kworker
thread which results in all i/o being charged to the root cgroup.  This
allows a loop device to be used to trivially bypass resource limits and
other policy.  This patch series fixes this gap in accounting.

A simple script to demonstrate this behavior on cgroupv2 machine:

'''
#!/bin/bash
set -e

CGROUP=/sys/fs/cgroup/test.slice
LOOP_DEV=/dev/loop0

if [[ ! -d $CGROUP ]]
then
    sudo mkdir $CGROUP
fi

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit to tmpfs -> OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
dd if=/dev/zero of=/tmp/file bs=1M count=256" || true

grep oom_kill $CGROUP/memory.events

# Set a memory limit, write more than that limit through loopback
# device -> no OOM kill
sudo unshare -m bash -c "
echo \$\$ > $CGROUP/cgroup.procs;
echo 0 > $CGROUP/memory.swap.max;
echo 64M > $CGROUP/memory.max;
mount -t tmpfs -o size=512m tmpfs /tmp;
truncate -s 512m /tmp/backing_file
losetup $LOOP_DEV /tmp/backing_file
dd if=/dev/zero of=$LOOP_DEV bs=1M count=256;
losetup -D $LOOP_DEV" || true

grep oom_kill $CGROUP/memory.events
'''

Naively charging cgroups could result in priority inversions through the
single kworker thread in the case where multiple cgroups are
reading/writing to the same loop device.  This patch series does some
minor modification to the loop driver so that each cgroup can make forward
progress independently to avoid this inversion.

With this patch series applied, the above script triggers OOM kills when
writing through the loop device as expected.

This patch (of 3):

Existing uses of loop device may have multiple cgroups reading/writing to
the same device.  Simply charging resources for I/O to the backing file
could result in priority inversion where one cgroup gets synchronously
blocked, holding up all other I/O to the loop device.

In order to avoid this priority inversion, we use a single workqueue where
each work item is a "struct loop_worker" which contains a queue of struct
loop_cmds to issue.  The loop device maintains a tree mapping blk css_id
-> loop_worker.  This allows each cgroup to independently make forward
progress issuing I/O to the backing file.

There is also a single queue for I/O associated with the rootcg which can
be used in cases of extreme memory shortage where we cannot allocate a
loop_worker.

The locking for the tree and queues is fairly heavy handed - we acquire a
per-loop-device spinlock any time either is accessed.  The existing
implementation serializes all I/O through a single thread anyways, so I
don't believe this is any worse.

[colin.king@canonical.com: fixes]

Link: https://lkml.kernel.org/r/20210610173944.1203706-1-schatzberg.dan@gmail.com
Link: https://lkml.kernel.org/r/20210610173944.1203706-2-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-06-29 10:53:50 -07:00

101 lines
2.5 KiB
C

/*
* loop.h
*
* Written by Theodore Ts'o, 3/29/93.
*
* Copyright 1993 by Theodore Ts'o. Redistribution of this file is
* permitted under the GNU General Public License.
*/
#ifndef _LINUX_LOOP_H
#define _LINUX_LOOP_H
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <uapi/linux/loop.h>
/* Possible states of device */
enum {
Lo_unbound,
Lo_bound,
Lo_rundown,
Lo_deleting,
};
struct loop_func_table;
struct loop_device {
int lo_number;
atomic_t lo_refcnt;
loff_t lo_offset;
loff_t lo_sizelimit;
int lo_flags;
int (*transfer)(struct loop_device *, int cmd,
struct page *raw_page, unsigned raw_off,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block);
char lo_file_name[LO_NAME_SIZE];
char lo_crypt_name[LO_NAME_SIZE];
char lo_encrypt_key[LO_KEY_SIZE];
int lo_encrypt_key_size;
struct loop_func_table *lo_encryption;
__u32 lo_init[2];
kuid_t lo_key_owner; /* Who set the key */
int (*ioctl)(struct loop_device *, int cmd,
unsigned long arg);
struct file * lo_backing_file;
struct block_device *lo_device;
void *key_data;
gfp_t old_gfp_mask;
spinlock_t lo_lock;
int lo_state;
spinlock_t lo_work_lock;
struct workqueue_struct *workqueue;
struct work_struct rootcg_work;
struct list_head rootcg_cmd_list;
struct list_head idle_worker_list;
struct rb_root worker_tree;
struct timer_list timer;
bool use_dio;
bool sysfs_inited;
struct request_queue *lo_queue;
struct blk_mq_tag_set tag_set;
struct gendisk *lo_disk;
struct mutex lo_mutex;
};
struct loop_cmd {
struct list_head list_entry;
bool use_aio; /* use AIO interface to handle I/O */
atomic_t ref; /* only for aio */
long ret;
struct kiocb iocb;
struct bio_vec *bvec;
struct cgroup_subsys_state *css;
};
/* Support for loadable transfer modules */
struct loop_func_table {
int number; /* filter type */
int (*transfer)(struct loop_device *lo, int cmd,
struct page *raw_page, unsigned raw_off,
struct page *loop_page, unsigned loop_off,
int size, sector_t real_block);
int (*init)(struct loop_device *, const struct loop_info64 *);
/* release is called from loop_unregister_transfer or clr_fd */
int (*release)(struct loop_device *);
int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
struct module *owner;
};
int loop_register_transfer(struct loop_func_table *funcs);
int loop_unregister_transfer(int number);
#endif