Commit 8d319094 authored by J. R. Okajima's avatar J. R. Okajima
Browse files

aufs: file op, mmap



For details, read the document in this commit.
I don't like this approach, but there is no other way currently. But it
seems that UnionMount is trying to add siblings of f_dentry and d_inode for
linux-4.0 or later. It may become another light for aufs too.

The finfo object which has ever been mmapped is excluded from
refreshing (based upon fi_mmapped). Otherwise we may corrupt the process
memory space.
Signed-off-by: default avatarJ. R. Okajima <hooanon05g@gmail.com>
parent beeeb01e
# Copyright (C) 2005-2019 Junjiro R. Okajima
mmap(2) -- File Memory Mapping
----------------------------------------------------------------------
In aufs, the file-mapped pages are handled by a branch fs directly, no
interaction with aufs. It means aufs_mmap() calls the branch fs's
->mmap().
This approach is simple and good, but there is one problem.
Under /proc, several entries show the mmapped files by its path (with
device and inode number), and the printed path will be the path on the
branch fs's instead of virtual aufs's.
This is not a problem in most cases, but some utilities such as lsof(1)
(and their users) may expect the path on aufs.
To address this issue, aufs adds a new member called vm_prfile in struct
vm_area_struct (and struct vm_region). The original vm_file points to
the file on the branch fs in order to handle everything correctly as
usual. The new vm_prfile points to a virtual file in aufs, and the
show-functions in procfs refer to vm_prfile if it is set.
Also we need to maintain several other places which touch vm_file,
such as
- fork()/clone() copies vma and the reference count of vm_file is
incremented.
- merging vma maintains the ref count too.
This is not a good approach. It just fakes the printed path. But it
leaves all behaviour around f_mapping unchanged. This is surely an
advantage.
Actually aufs had adopted another complicated approach which calls
generic_file_mmap() and handles struct vm_operations_struct. In this
approach, aufs met a hard problem and I could not solve it without
switching the approach.
There may be one more approach, which is
- bind-mount the branch-root onto the aufs-root internally
- grab the new vfsmount (ie. struct mount)
- lazy-umount the branch-root internally
- in open(2) the aufs-file, open the branch-file with the hidden
vfsmount (instead of the original branch's vfsmount)
- ideally this "bind-mount and lazy-umount" should be done atomically,
but it may be possible from userspace by the mount helper.
Adding the internal hidden vfsmount and using it in opening a file, the
file path under /proc will be printed correctly. This approach looks
smarter, but is not possible I am afraid.
- aufs-root may be bind-mounted later. When it happens, another hidden
vfsmount will be required.
- it is hard to get the chance to bind-mount and lazy-umount
+ in kernel-space, FS can have vfsmount in open(2) via
file->f_path, and aufs can know its vfsmount. But several locks are
already acquired, and if aufs tries to bind-mount and lazy-umount
here, then it may cause a deadlock.
+ in user-space, bind-mount doesn't invoke the mount helper.
- since /proc shows dev and ino, aufs has to give vma these info. it
means a new member vm_prinode will be necessary. this is essentially
equivalent to vm_prfile described above.
I have to give up this "looks-smarter" approach.
......@@ -233,8 +233,8 @@ static int do_pri_file(aufs_bindex_t bindex, struct file *file)
&& !IS_ERR_OR_NULL(file->f_path.dentry)
&& au_test_aufs(file->f_path.dentry->d_sb)
&& au_fi(file))
snprintf(a, sizeof(a), ", gen %d",
au_figen(file));
snprintf(a, sizeof(a), ", gen %d, mmapped %d",
au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
bindex, file->f_mode, file->f_flags, (long)file_count(file),
file->f_version, file->f_pos, a);
......
......@@ -7,6 +7,8 @@
* file and vm operations
*/
#include <linux/fs_stack.h>
#include <linux/mman.h>
#include "aufs.h"
int au_do_open_nondir(struct file *file, int flags)
......@@ -24,6 +26,7 @@ int au_do_open_nondir(struct file *file, int flags)
dentry = file->f_path.dentry;
finfo = au_fi(file);
memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
atomic_set(&finfo->fi_mmapped, 0);
bindex = au_dbtop(dentry);
h_file = au_h_open(dentry, bindex, flags, file);
if (IS_ERR(h_file))
......@@ -81,9 +84,180 @@ int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
/* ---------------------------------------------------------------------- */
/*
 * Optional argument block for the au_write_pre() function.
 * The caller fills in the input part; au_write_pre() fills in the output part
 * when it succeeds.
 */
struct au_write_pre {
/* input */
/* passed through to au_reval_and_lock_fdi() as its last argument */
unsigned int lsc;
/* output */
/* i_blocks of the inode behind the returned branch file */
blkcnt_t blks;
/* value of au_fbtop() for the aufs file */
aufs_bindex_t btop;
};
/*
 * Revalidate and lock @file, then hand back its top branch file.
 *
 * @file:     the aufs file
 * @do_ready: when set, au_ready_to_write() is called first and the pin it
 *            took is released again before returning
 * @wpre:     optional; supplies ->lsc for the locking and receives ->btop
 *            and ->blks
 *
 * Returns the branch file with an extra reference taken by get_file(), or an
 * ERR_PTR() value.
 *
 * On success this returns with iinfo still write-locked, so callers should
 * finish with au_write_post(), or with iinfo_write_unlock() + fput().
 */
static struct file *au_write_pre(struct file *file, int do_ready,
				 struct au_write_pre *wpre)
{
	struct au_pin pin;
	struct dentry *dentry;
	struct file *h_file;
	unsigned int lsc;
	int err;

	lsc = wpre ? wpre->lsc : 0;
	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1, lsc);
	if (unlikely(err))
		return ERR_PTR(err);

	dentry = file->f_path.dentry;
	if (do_ready) {
		err = au_ready_to_write(file, -1, &pin);
		if (unlikely(err)) {
			di_write_unlock(dentry);
			h_file = ERR_PTR(err);
			goto out_fi;
		}
	}

	di_downgrade_lock(dentry, /*flags*/0);
	h_file = au_hf_top(file);
	get_file(h_file);
	if (wpre) {
		wpre->btop = au_fbtop(file);
		wpre->blks = file_inode(h_file)->i_blocks;
	}
	if (do_ready)
		au_unpin(&pin);
	di_read_unlock(dentry, /*flags*/0);

out_fi:
	/* finfo is released on both the success and the failure path */
	fi_write_unlock(file);
	return h_file;
}
/*
* The locking order around current->mmap_sem.
* - in most and regular cases
* file I/O syscall -- aufs_read() or something
* -- si_rwsem for read -- mmap_sem
* (Note that [fdi]i_rwsem are released before mmap_sem).
* - in mmap case
* mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
* This AB-BA order is definitely bad, but is not a problem since "si_rwsem for
* read" allows multiple processes to acquire it and [fdi]i_rwsem are not held
* in file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
* It means that when aufs acquires si_rwsem for write, the process should never
* acquire mmap_sem.
*
* Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
* problem either since no directory can be mmap-ed.
* The same reasoning applies to aufs_readlink() too.
*/
/*
 * Everything down to the matching #endif is compiled out.
 * These helpers are only referenced by the commented-out
 * security_mmap_file() call in aufs_mmap(), which would pass
 * au_prot_conv(vma->vm_flags) and au_flag_conv(vma->vm_flags).
 */
#if 0 /* stop calling security_file_mmap() */
/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
/*
 * NOTE(review): presumably translates the VM_<b> bit of @f into PROT_<b>;
 * confirm against the definition of _calc_vm_trans().
 */
#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b)
/*
 * Architecture specific part of the prot conversion.
 * Converts the SAO bit on CONFIG_PPC64 and returns 0 everywhere else.
 * The AuDebugOn() tests check that arch_calc_vm_prot_bits(-1) reports exactly
 * VM_SAO on ppc64 and nothing on other architectures.
 */
static unsigned long au_arch_prot_conv(unsigned long flags)
{
/* currently ppc64 only */
#ifdef CONFIG_PPC64
/* cf. linux/arch/powerpc/include/asm/mman.h */
AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
return AuConv_VM_PROT(flags, SAO);
#else
AuDebugOn(arch_calc_vm_prot_bits(-1));
return 0;
#endif
}
/*
 * Combine the READ, WRITE and EXEC conversions of @flags with the
 * architecture specific part from au_arch_prot_conv().
 */
static unsigned long au_prot_conv(unsigned long flags)
{
return AuConv_VM_PROT(flags, READ)
| AuConv_VM_PROT(flags, WRITE)
| AuConv_VM_PROT(flags, EXEC)
| au_arch_prot_conv(flags);
}
/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
/*
 * NOTE(review): presumably translates the VM_<b> bit of @f into MAP_<b>;
 * confirm against the definition of _calc_vm_trans().
 */
#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b)
/* Combine the GROWSDOWN, DENYWRITE and LOCKED conversions of @flags. */
static unsigned long au_flag_conv(unsigned long flags)
{
return AuConv_VM_MAP(flags, GROWSDOWN)
| AuConv_VM_MAP(flags, DENYWRITE)
| AuConv_VM_MAP(flags, LOCKED);
}
#endif
/*
 * ->mmap() for an aufs file.
 *
 * The mapping itself is delegated to the branch file: vma->vm_file is
 * switched to the branch file before call_mmap(), and the aufs file is
 * recorded via au_vm_prfile_set() on success. On failure vma->vm_file is
 * switched back to the aufs file and the mmapped counter is dropped again.
 *
 * lockdep is turned off around the aufs lock operations here; see the
 * locking order comment above about mmap_sem.
 *
 * Returns 0 on success or a negative error.
 */
static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
{
	int err;
	/* only a writable and shared mapping needs au_ready_to_write() */
	const unsigned char need_ready
		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
	struct inode *inode;
	struct super_block *sb;
	struct file *h_file;

	AuDbgVmRegion(file, vma);

	inode = file_inode(file);
	sb = inode->i_sb;

	lockdep_off();
	si_read_lock(sb, AuLock_NOPLMW);
	h_file = au_write_pre(file, need_ready, /*wpre*/NULL);
	lockdep_on();
	if (IS_ERR(h_file)) {
		err = PTR_ERR(h_file);
		goto out;
	}

	au_set_mmapped(file);
	au_vm_file_reset(vma, h_file);

	/*
	 * we cannot call security_mmap_file() here since it may acquire
	 * mmap_sem or i_mutex.
	 *
	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
	 *			 au_flag_conv(vma->vm_flags));
	 */
	err = call_mmap(h_file, vma);
	if (err) {
		/* undo what was prepared for the branch file */
		au_unset_mmapped(file);
		au_vm_file_reset(vma, file);
	} else {
		au_vm_prfile_set(vma, file);
		fsstack_copy_attr_atime(inode, file_inode(h_file));
	}

	/* au_write_pre() left iinfo write-locked and holds a ref on h_file */
	lockdep_off();
	ii_write_unlock(inode);
	lockdep_on();
	fput(h_file);

out:
	lockdep_off();
	si_read_unlock(sb);
	lockdep_on();
	AuTraceErr(err);
	return err;
}
/* ---------------------------------------------------------------------- */
/* file operations table wiring up the *_nondir handlers and aufs_mmap() */
const struct file_operations aufs_file_fop = {
	.owner		= THIS_MODULE,

	.open		= aufs_open_nondir,
	.release	= aufs_release_nondir,

	.mmap		= aufs_mmap
};
......@@ -431,7 +431,7 @@ static void au_do_refresh_dir(struct file *file)
}
p = fidir->fd_hfile;
if (!d_unlinked(file->f_path.dentry)) {
if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
bbot = au_sbbot(sb);
for (finfo->fi_btop = 0; finfo->fi_btop <= bbot;
finfo->fi_btop++, p++)
......@@ -492,7 +492,8 @@ static int refresh_file(struct file *file, int (*reopen)(struct file *file))
err = 0;
need_reopen = 1;
err = au_file_refresh_by_inode(file, &need_reopen);
if (!au_test_mmapped(file))
err = au_file_refresh_by_inode(file, &need_reopen);
if (finfo->fi_hdir)
/* harmless if err */
au_fidir_realloc(finfo, nbr, /*may_shrink*/1);
......
......@@ -14,6 +14,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm_types.h>
#include "rwsem.h"
struct au_branch;
......@@ -43,7 +44,10 @@ struct au_finfo {
aufs_bindex_t fi_btop;
/* do not union them */
struct au_hfile fi_htop; /* for non-dir */
struct { /* for non-dir */
struct au_hfile fi_htop;
atomic_t fi_mmapped;
};
struct au_fidir *fi_hdir; /* for dir only */
struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
......@@ -216,5 +220,74 @@ static inline unsigned int au_figen(struct file *f)
return atomic_read(&au_fi(f)->fi_generation);
}
static inline void au_set_mmapped(struct file *f)
{
if (atomic_inc_return(&au_fi(f)->fi_mmapped))
return;
pr_warn("fi_mmapped wrapped around\n");
while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
;
}
static inline void au_unset_mmapped(struct file *f)
{
atomic_dec(&au_fi(f)->fi_mmapped);
}
static inline int au_test_mmapped(struct file *f)
{
return atomic_read(&au_fi(f)->fi_mmapped);
}
/*
 * Make vma->vm_file point to @file.
 * A reference is taken on @file before the reference on the previous
 * vm_file is dropped.
 */
static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
				       struct file *file)
{
	struct file *prev = vma->vm_file;

	get_file(file);
	vma->vm_file = file;
	fput(prev);
}
#ifdef CONFIG_MMU
#define AuDbgVmRegion(file, vma) do {} while (0)
static inline void au_vm_file_reset(struct vm_area_struct *vma,
struct file *file)
{
au_do_vm_file_reset(vma, file);
}
#else
#define AuDbgVmRegion(file, vma) \
AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
static inline void au_vm_file_reset(struct vm_area_struct *vma,
struct file *file)
{
struct file *f;
au_do_vm_file_reset(vma, file);
f = vma->vm_region->vm_file;
get_file(file);
vma->vm_region->vm_file = file;
fput(f);
}
#endif /* CONFIG_MMU */
/* handle vma->vm_prfile */
static inline void au_vm_prfile_set(struct vm_area_struct *vma,
struct file *file)
{
get_file(file);
vma->vm_prfile = file;
#ifndef CONFIG_MMU
get_file(file);
vma->vm_region->vm_prfile = file;
#endif
}
#endif /* __KERNEL__ */
#endif /* __AUFS_FILE_H__ */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment