Commit 47118316 authored by J. R. Okajima's avatar J. R. Okajima
Browse files

aufs: virtual or vertical directory 1/2, intro



This commit is just to prepare for the succeeding commit, and split to
suppress the size of a single commit.
Signed-off-by: default avatarJ. R. Okajima <hooanon05g@gmail.com>
parent 4dcf89d5
......@@ -128,6 +128,36 @@ As well as XINO files, aufs has a feature to truncate/refresh XIB to
reduce the number of consumed disk blocks for these files.
Virtual or Vertical Dir, and Readdir in Userspace
----------------------------------------------------------------------
In order to support multiple layers (branches), the aufs readdir operation
constructs a virtual dir block in memory. For readdir, aufs calls
vfs_readdir() internally for each dir on the branches, merges their entries
while eliminating the whiteout-ed ones, and attaches the result to the file
(dir) object. So the file object keeps its entry list until it is closed. The
entry list will be updated when the file position is zero and the list has
become obsolete. This decision is made in aufs automatically.
The dynamically allocated memory block for the name of entries has a
unit of 512 bytes (by default) and stores the names contiguously (no
padding). Another block for each entry is handled by kmem_cache too.
While building dir blocks, aufs creates a hash list and judges whether
the entry is whiteout-ed by its upper branch or already listed.
The merged result is cached in the corresponding inode object and
maintained by a customizable life-time option.
Some people may say it can be a security hole or invite a DoS attack
since the opened and once readdir-ed dir (file object) holds its entry
list and puts pressure on system memory. But I'd say it is similar
to files under /proc or /sys. The virtual files in them also hold a
memory page (generally) while they are opened. When an idea to reduce
memory for them is introduced, it will be applied to aufs too.
For those who really hate this situation, I've developed a readdir(3)
library which performs this merging in userspace. You just need to set
the LD_PRELOAD environment variable, and aufs will not consume any memory in
kernel space for readdir(3).
Workqueue
----------------------------------------------------------------------
Aufs sometimes requires privilege access to a branch. For instance,
......
......@@ -16,7 +16,7 @@ aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \
dinfo.o dentry.o \
dynop.o \
finfo.o file.o \
dir.o \
dir.o vdir.o \
iinfo.o inode.o i_op.o
# all are boolean
......
......@@ -55,6 +55,45 @@ char *au_plevel = KERN_DEBUG;
/* ---------------------------------------------------------------------- */
void au_dpri_whlist(struct au_nhash *whlist)
{
unsigned long ul, n;
struct hlist_head *head;
struct au_vdir_wh *pos;
n = whlist->nh_num;
head = whlist->nh_head;
for (ul = 0; ul < n; ul++) {
hlist_for_each_entry(pos, head, wh_hash)
dpri("b%d, %.*s, %d\n",
pos->wh_bindex,
pos->wh_str.len, pos->wh_str.name,
pos->wh_str.len);
head++;
}
}
void au_dpri_vdir(struct au_vdir *vdir)
{
unsigned long ul;
union au_vdir_deblk_p p;
unsigned char *o;
if (!vdir || IS_ERR(vdir)) {
dpri("err %ld\n", PTR_ERR(vdir));
return;
}
dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %llu\n",
vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk,
vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version);
for (ul = 0; ul < vdir->vd_nblk; ul++) {
p.deblk = vdir->vd_deblk[ul];
o = p.deblk;
dpri("[%lu]: %p\n", ul, o);
}
}
static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn,
struct dentry *wh)
{
......@@ -206,6 +245,9 @@ static int do_pri_file(aufs_bindex_t bindex, struct file *file)
void au_dpri_file(struct file *file)
{
struct au_finfo *finfo;
struct au_fidir *fidir;
struct au_hfile *hfile;
aufs_bindex_t bindex;
int err;
err = do_pri_file(-1, file);
......@@ -219,7 +261,16 @@ void au_dpri_file(struct file *file)
return;
if (finfo->fi_btop < 0)
return;
do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
fidir = finfo->fi_hdir;
if (!fidir)
do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
else
for (bindex = finfo->fi_btop;
bindex >= 0 && bindex <= fidir->fd_bbot;
bindex++) {
hfile = fidir->fd_hfile + bindex;
do_pri_file(bindex, hfile ? hfile->hf_file : NULL);
}
}
static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br)
......
......@@ -97,6 +97,10 @@ struct dentry;
#ifdef CONFIG_AUFS_DEBUG
extern struct mutex au_dbg_mtx;
extern char *au_plevel;
struct au_nhash;
void au_dpri_whlist(struct au_nhash *whlist);
struct au_vdir;
void au_dpri_vdir(struct au_vdir *vdir);
struct inode;
void au_dpri_inode(struct inode *inode);
void au_dpri_dalias(struct inode *inode);
......@@ -111,6 +115,20 @@ void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line);
void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen);
void au_dbg_verify_kthread(void);
/*
 * Dump the whiteout hash list @w: while holding au_dbg_mtx, print the
 * argument expression itself (stringified) via AuDbg() and then the list
 * contents via au_dpri_whlist().
 */
#define AuDbgWhlist(w) do { \
mutex_lock(&au_dbg_mtx); \
AuDbg(#w "\n"); \
au_dpri_whlist(w); \
mutex_unlock(&au_dbg_mtx); \
} while (0)
/*
 * Dump the virtual dir @v: while holding au_dbg_mtx, print the argument
 * expression itself (stringified) via AuDbg() and then the vdir contents
 * via au_dpri_vdir().
 */
#define AuDbgVdir(v) do { \
mutex_lock(&au_dbg_mtx); \
AuDbg(#v "\n"); \
au_dpri_vdir(v); \
mutex_unlock(&au_dbg_mtx); \
} while (0)
#define AuDbgInode(i) do { \
mutex_lock(&au_dbg_mtx); \
AuDbg(#i "\n"); \
......@@ -156,6 +174,8 @@ AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry)
AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen)
AuStubVoid(au_dbg_verify_kthread, void)
/*
 * No-op variants of the whlist/vdir dump macros; the argument is not
 * evaluated.
 * NOTE(review): presumably the !CONFIG_AUFS_DEBUG branch -- the enclosing
 * #else is outside this hunk, confirm.
 */
#define AuDbgWhlist(w) do {} while (0)
#define AuDbgVdir(v) do {} while (0)
#define AuDbgInode(i) do {} while (0)
#define AuDbgDAlias(i) do {} while (0)
#define AuDbgDentry(d) do {} while (0)
......
......@@ -39,6 +39,49 @@ void au_sub_nlink(struct inode *dir, struct inode *h_dir)
set_nlink(dir, nlink);
}
/*
 * Estimate a buffer size for the entries of a directory.
 *
 * @file:   opened aufs directory, or NULL
 * @dentry: aufs directory dentry; used only when @file is NULL
 *
 * Sums the sizes of the lower directories, taken from the lower files of
 * @file when it is given and from the positive lower dentries of @dentry
 * otherwise.  The summation stops as soon as the total reaches
 * KMALLOC_MAX_SIZE.
 *
 * Returns the total rounded up to a power of two and capped at
 * KMALLOC_MAX_SIZE.  A result smaller than NAME_MAX (including the case
 * where nothing contributed any size) is replaced by AUFS_RDBLK_DEF.
 */
loff_t au_dir_size(struct file *file, struct dentry *dentry)
{
	loff_t sz;
	aufs_bindex_t bindex, bbot;
	struct file *h_file;
	struct dentry *h_dentry;

	sz = 0;
	if (file) {
		AuDebugOn(!d_is_dir(file->f_path.dentry));

		bbot = au_fbbot_dir(file);
		for (bindex = au_fbtop(file);
		     bindex <= bbot && sz < KMALLOC_MAX_SIZE;
		     bindex++) {
			h_file = au_hf_dir(file, bindex);
			if (h_file && file_inode(h_file))
				sz += vfsub_f_size_read(h_file);
		}
	} else {
		AuDebugOn(!dentry);
		AuDebugOn(!d_is_dir(dentry));

		bbot = au_dbtaildir(dentry);
		for (bindex = au_dbtop(dentry);
		     bindex <= bbot && sz < KMALLOC_MAX_SIZE;
		     bindex++) {
			h_dentry = au_h_dptr(dentry, bindex);
			if (h_dentry && d_is_positive(h_dentry))
				sz += i_size_read(d_inode(h_dentry));
		}
	}

	/*
	 * roundup_pow_of_two() is undefined for zero, and the sum is zero
	 * when no lower directory reports a size.  Skip the rounding in
	 * that case; the NAME_MAX test below then selects the default.
	 */
	if (sz && sz < KMALLOC_MAX_SIZE)
		sz = roundup_pow_of_two(sz);
	if (sz > KMALLOC_MAX_SIZE)
		sz = KMALLOC_MAX_SIZE;
	else if (sz < NAME_MAX) {
		/* the default block must be able to hold the longest name */
		BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
		sz = AUFS_RDBLK_DEF;
	}
	return sz;
}
struct au_dir_ts_arg {
struct dentry *dentry;
aufs_bindex_t brid;
......
......@@ -16,10 +16,67 @@
/* ---------------------------------------------------------------------- */
/* need to be faster and smaller */
/*
 * Hash table descriptor: nh_head points to an array of nh_num list heads
 * (buckets).
 */
struct au_nhash {
unsigned int nh_num;
struct hlist_head *nh_head;
};
/*
 * Length-prefixed entry name.  The name bytes follow the one-byte length
 * directly (zero-length trailing array), so a name is at most 255 bytes.
 * The struct is packed.
 */
struct au_vdir_destr {
unsigned char len;
unsigned char name[0];
} __packed;
/*
 * Hash-list node which refers to an entry name through a pointer instead
 * of embedding it.  Carries an rcu_head.
 * NOTE(review): presumably used to detect already-listed names while
 * merging branches -- the users are not part of this change, confirm.
 */
struct au_vdir_dehstr {
struct hlist_node hash;
struct au_vdir_destr *str;
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
/*
 * One directory entry: inode number, entry type and the name.
 * de_str ends in a zero-length array, so it has to stay the last member.
 */
struct au_vdir_de {
ino_t de_ino;
unsigned char de_type;
/* caution: packed */
struct au_vdir_destr de_str;
} __packed;
/*
 * One whiteout entry, chained on a hash bucket via wh_hash: the branch
 * index it belongs to and the name.
 * wh_str ends in a zero-length array, so it has to stay the last member.
 */
struct au_vdir_wh {
struct hlist_node wh_hash;
aufs_bindex_t wh_bindex;
/* caution: packed */
struct au_vdir_destr wh_str;
} __packed;
/*
 * Position inside a dir-entry block, viewable either as a raw byte
 * pointer or as a pointer to an entry.
 */
union au_vdir_deblk_p {
unsigned char *deblk;
struct au_vdir_de *de;
};
/*
 * Virtual directory: an array of vd_nblk dir-entry blocks plus
 * bookkeeping.  Carries an rcu_head.
 */
struct au_vdir {
/* array of vd_nblk block pointers */
unsigned char **vd_deblk;
unsigned long vd_nblk;
/* NOTE(review): presumably block index and position of the last entry -- confirm */
struct {
unsigned long ul;
union au_vdir_deblk_p p;
} vd_last;
/* NOTE(review): presumably compared against the dir version to detect staleness -- confirm */
u64 vd_version;
/* NOTE(review): presumably the size in bytes of each block (cf. the rdblk option) -- confirm */
unsigned int vd_deblk_sz;
/* NOTE(review): presumably a jiffies timestamp for the rdcache life-time -- confirm */
unsigned long vd_jiffy;
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
/* ---------------------------------------------------------------------- */
/* dir.c */
void au_add_nlink(struct inode *dir, struct inode *h_dir);
void au_sub_nlink(struct inode *dir, struct inode *h_dir);
loff_t au_dir_size(struct file *file, struct dentry *dentry);
void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc);
/* vdir.c */
void au_vdir_free(struct au_vdir *vdir);
#endif /* __KERNEL__ */
#endif /* __AUFS_DIR_H__ */
......@@ -198,6 +198,7 @@ int au_iinfo_init(struct inode *inode)
iinfo->ii_generation.ig_generation = au_sigen(sb);
iinfo->ii_btop = -1;
iinfo->ii_bbot = -1;
iinfo->ii_vdir = NULL;
return 0;
}
return -ENOMEM;
......@@ -251,6 +252,9 @@ void au_iinfo_fin(struct inode *inode)
}
iinfo = au_ii(inode);
if (iinfo->ii_vdir)
au_vdir_free(iinfo->ii_vdir);
bindex = iinfo->ii_btop;
if (bindex >= 0) {
hi = au_hinode(iinfo, bindex);
......
......@@ -52,6 +52,7 @@ struct au_iigen {
__u32 ig_generation, ig_flags;
};
struct au_vdir;
struct au_iinfo {
struct au_iigen ii_generation;
struct super_block *ii_hsb1; /* no get/put */
......@@ -60,6 +61,7 @@ struct au_iinfo {
aufs_bindex_t ii_btop, ii_bbot;
__u32 ii_higen;
struct au_hinode *ii_hinode;
struct au_vdir *ii_vdir;
};
struct au_icntnr {
......@@ -349,6 +351,12 @@ static inline aufs_bindex_t au_ibbot(struct inode *inode)
return au_ii(inode)->ii_bbot;
}
static inline struct au_vdir *au_ivdir(struct inode *inode)
{
IiMustAnyLock(inode);
return au_ii(inode)->ii_vdir;
}
static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex)
{
IiMustAnyLock(inode);
......@@ -367,6 +375,12 @@ static inline void au_set_ibbot(struct inode *inode, aufs_bindex_t bindex)
au_ii(inode)->ii_bbot = bindex;
}
static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir)
{
IiMustWriteLock(inode);
au_ii(inode)->ii_vdir = vdir;
}
static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex)
{
IiMustAnyLock(inode);
......
......@@ -101,6 +101,10 @@ static int __init au_cache_init(void)
au_cache[AuCache_FINFO] = AuCacheCtor(au_finfo,
au_fi_init_once);
if (au_cache[AuCache_FINFO])
au_cache[AuCache_VDIR] = AuCache(au_vdir);
if (au_cache[AuCache_VDIR])
au_cache[AuCache_DEHSTR] = AuCache(au_vdir_dehstr);
if (au_cache[AuCache_DEHSTR])
return 0;
au_cache_fin();
......
......@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include "debug.h"
#include "dentry.h"
#include "dir.h"
#include "file.h"
#include "inode.h"
......@@ -106,6 +107,8 @@ enum {
AuCache_DINFO,
AuCache_ICNTNR,
AuCache_FINFO,
AuCache_VDIR,
AuCache_DEHSTR,
AuCache_HNOTIFY, /* must be last */
AuCache_Last
};
......@@ -139,6 +142,8 @@ extern struct kmem_cache *au_cache[AuCache_Last];
AuCacheFuncs(dinfo, DINFO);
AuCacheFuncs(icntnr, ICNTNR);
AuCacheFuncs(finfo, FINFO);
AuCacheFuncs(vdir, VDIR);
AuCacheFuncs(vdir_dehstr, DEHSTR);
#ifdef CONFIG_AUFS_HNOTIFY
AuCacheFuncs(hnotify, HNOTIFY);
#endif
......
......@@ -17,6 +17,8 @@
enum {
Opt_br,
Opt_add,
Opt_rdcache, Opt_rdblk, Opt_rdhash,
Opt_rdblk_def, Opt_rdhash_def,
Opt_xino, Opt_noxino,
Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino,
Opt_trunc_xino_path, Opt_itrunc_xino,
......@@ -60,6 +62,12 @@ static match_table_t options = {
{Opt_dio, "dio"},
{Opt_nodio, "nodio"},
{Opt_rdcache, "rdcache=%d"},
{Opt_rdblk, "rdblk=%d"},
{Opt_rdblk_def, "rdblk=def"},
{Opt_rdhash, "rdhash=%d"},
{Opt_rdhash_def, "rdhash=def"},
{Opt_wbr_create, "create=%s"},
{Opt_wbr_create, "create_policy=%s"},
{Opt_wbr_copyup, "cpup=%s"},
......@@ -436,6 +444,21 @@ static void dump_opts(struct au_opts *opts)
u.add->bindex, u.add->pathname, u.add->perm,
u.add->path.dentry);
break;
case Opt_rdcache:
AuDbg("rdcache %d\n", opt->rdcache);
break;
case Opt_rdblk:
AuDbg("rdblk %u\n", opt->rdblk);
break;
case Opt_rdblk_def:
AuDbg("rdblk_def\n");
break;
case Opt_rdhash:
AuDbg("rdhash %u\n", opt->rdhash);
break;
case Opt_rdhash_def:
AuDbg("rdhash_def\n");
break;
case Opt_xino:
u.xino = &opt->xino;
AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file);
......@@ -723,6 +746,49 @@ int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
opt->type = token;
break;
case Opt_rdcache:
if (unlikely(match_int(&a->args[0], &n))) {
pr_err("bad integer in %s\n", opt_str);
break;
}
if (unlikely(n > AUFS_RDCACHE_MAX)) {
pr_err("rdcache must be smaller than %d\n",
AUFS_RDCACHE_MAX);
break;
}
opt->rdcache = n;
err = 0;
opt->type = token;
break;
case Opt_rdblk:
if (unlikely(match_int(&a->args[0], &n)
|| n < 0
|| n > KMALLOC_MAX_SIZE)) {
pr_err("bad integer in %s\n", opt_str);
break;
}
if (unlikely(n && n < NAME_MAX)) {
pr_err("rdblk must be larger than %d\n",
NAME_MAX);
break;
}
opt->rdblk = n;
err = 0;
opt->type = token;
break;
case Opt_rdhash:
if (unlikely(match_int(&a->args[0], &n)
|| n < 0
|| n * sizeof(struct hlist_head)
> KMALLOC_MAX_SIZE)) {
pr_err("bad integer in %s\n", opt_str);
break;
}
opt->rdhash = n;
err = 0;
opt->type = token;
break;
case Opt_trunc_xino:
case Opt_notrunc_xino:
case Opt_noxino:
......@@ -733,6 +799,8 @@ int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
case Opt_list_plink:
case Opt_dio:
case Opt_nodio:
case Opt_rdblk_def:
case Opt_rdhash_def:
err = 0;
opt->type = token;
break;
......@@ -891,6 +959,23 @@ static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
break;
case Opt_rdcache:
sbinfo->si_rdcache
= msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC);
break;
case Opt_rdblk:
sbinfo->si_rdblk = opt->rdblk;
break;
case Opt_rdblk_def:
sbinfo->si_rdblk = AUFS_RDBLK_DEF;
break;
case Opt_rdhash:
sbinfo->si_rdhash = opt->rdhash;
break;
case Opt_rdhash_def:
sbinfo->si_rdhash = AUFS_RDHASH_DEF;
break;
case Opt_trunc_xino:
au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
break;
......
......@@ -120,6 +120,9 @@ struct au_opt {
struct au_opt_xino xino;
struct au_opt_xino_itrunc xino_itrunc;
struct au_opt_add add;
int rdcache;
unsigned int rdblk;
unsigned int rdhash;
int udba;
struct au_opt_wbr_create wbr_create;
int wbr_copyup;
......
......@@ -73,6 +73,10 @@ int au_si_alloc(struct super_block *sb)
mutex_init(&sbinfo->si_xib_mtx);
/* leave si_xib_last_pindex and si_xib_next_bit */
sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC);
sbinfo->si_rdblk = AUFS_RDBLK_DEF;
sbinfo->si_rdhash = AUFS_RDHASH_DEF;
for (i = 0; i < AuPlink_NHASH; i++)
INIT_HLIST_BL_HEAD(sbinfo->si_plink + i);
init_waitqueue_head(&sbinfo->si_plink_wq);
......
......@@ -112,6 +112,11 @@ struct au_sbinfo {
atomic_t si_xigen_next;
#endif
/* vdir parameters */
unsigned long si_rdcache; /* max cache time in jiffies */
unsigned int si_rdblk; /* deblk size */
unsigned int si_rdhash; /* hash size */
/* pseudo_link list */
struct hlist_bl_head si_plink[AuPlink_NHASH];
wait_queue_head_t si_plink_wq;
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2005-2019 Junjiro R. Okajima
*/
/*
* virtual or vertical directory
*/
#include "aufs.h"
/*
 * Release a virtual dir: every dir-entry block, then the array of block
 * pointers, and finally the vdir object itself.
 * vd_nblk is consumed as the loop counter, so @vdir must not be used
 * afterwards.
 */
void au_vdir_free(struct au_vdir *vdir)
{
	unsigned char **blk;

	for (blk = vdir->vd_deblk; vdir->vd_nblk--; blk++)
		au_kfree_try_rcu(*blk);

	au_kfree_try_rcu(vdir->vd_deblk);
	au_cache_free_vdir(vdir);
}
......@@ -73,6 +73,10 @@ typedef int16_t aufs_bindex_t;
#define AUFS_XINO_DEFPATH "/tmp/" AUFS_XINO_FNAME
#define AUFS_XINO_DEF_SEC 30 /* seconds */
#define AUFS_XINO_DEF_TRUNC 45 /* percentage */
#define AUFS_RDCACHE_DEF 10 /* seconds */
#define AUFS_RDCACHE_MAX 3600 /* seconds */
#define AUFS_RDBLK_DEF 512 /* bytes */
#define AUFS_RDHASH_DEF 32
#define AUFS_WKQ_NAME AUFS_NAME "d"
#define AUFS_MFS_DEF_SEC 30 /* seconds */
#define AUFS_MFS_MAX_SEC 3600 /* seconds */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment