aufs: file op, mmap

For details, read the document in this commit. I don't like this approach, but there is no other way currently. But it seems that UnionMount is trying add siblings of f_dentry and d_inode for linux-4.0 or later. It may become another light for aufs too. The finfo object which has ever mmapped is excluded from refreshing (based upon fi_mmapped). Otherwise we may corrupt the process memory space. Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>

aufs: file op, mmap
For details, read the document in this commit. I don't like this approach, but there is no other way currently. But it seems that UnionMount is trying add siblings of f_dentry and d_inode for linux-4.0 or later. It may become another light for aufs too. The finfo object which has ever mmapped is excluded from refreshing (based upon fi_mmapped). Otherwise we may corrupt the process memory space. Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>
8d319094 · J. R. Okajima · beeeb01e · 8d319094 · 8d319094 · 8d319094
Commit 8d319094 authored Mar 30, 2015 by J. R. Okajima
--- a/Documentation/filesystems/aufs/design/06mmap.txt
+++ b/Documentation/filesystems/aufs/design/06mmap.txt
+
+# Copyright (C) 2005-2019 Junjiro R. Okajima
+
+mmap(2) -- File Memory Mapping
+----------------------------------------------------------------------
+In aufs, the file-mapped pages are handled by a branch fs directly, no
+interaction with aufs. It means aufs_mmap() calls the branch fs's
+->mmap().
+This approach is simple and good, but there is one problem.
+Under /proc, several entries show the mmapped files by its path (with
+device and inode number), and the printed path will be the path on the
+branch fs's instead of virtual aufs's.
+This is not a problem in most cases, but some utilities lsof(1) (and its
+user) may expect the path on aufs.
+
+To address this issue, aufs adds a new member called vm_prfile in struct
+vm_area_struct (and struct vm_region). The original vm_file points to
+the file on the branch fs in order to handle everything correctly as
+usual. The new vm_prfile points to a virtual file in aufs, and the
+show-functions in procfs refers to vm_prfile if it is set.
+Also we need to maintain several other places where touching vm_file
+such like
+- fork()/clone() copies vma and the reference count of vm_file is
+  incremented.
+- merging vma maintains the ref count too.
+
+This is not a good approach. It just fakes the printed path. But it
+leaves all behaviour around f_mapping unchanged. This is surely an
+advantage.
+Actually aufs had adopted another complicated approach which calls
+generic_file_mmap() and handles struct vm_operations_struct. In this
+approach, aufs met a hard problem and I could not solve it without
+switching the approach.
+
+There may be one more another approach which is
+- bind-mount the branch-root onto the aufs-root internally
+- grab the new vfsmount (ie. struct mount)
+- lazy-umount the branch-root internally
+- in open(2) the aufs-file, open the branch-file with the hidden
+  vfsmount (instead of the original branch's vfsmount)
+- ideally this "bind-mount and lazy-umount" should be done atomically,
+  but it may be possible from userspace by the mount helper.
+
+Adding the internal hidden vfsmount and using it in opening a file, the
+file path under /proc will be printed correctly. This approach looks
+smarter, but is not possible I am afraid.
+- aufs-root may be bind-mount later. when it happens, another hidden
+  vfsmount will be required.
+- it is hard to get the chance to bind-mount and lazy-umount
+  + in kernel-space, FS can have vfsmount in open(2) via
+    file->f_path, and aufs can know its vfsmount. But several locks are
+    already acquired, and if aufs tries to bind-mount and lazy-umount
+    here, then it may cause a deadlock.
+  + in user-space, bind-mount doesn't invoke the mount helper.
+- since /proc shows dev and ino, aufs has to give vma these info. it
+  means a new member vm_prinode will be necessary. this is essentially
+  equivalent to vm_prfile described above.
+
+I have to give up this "looks-smater" approach.
--- a/fs/aufs/debug.c
+++ b/fs/aufs/debug.c
@@ -233,8 +233,8 @@ static int do_pri_file(aufs_bindex_t bindex, struct file *file)
 	    && !IS_ERR_OR_NULL(file->f_path.dentry)
 	    && au_test_aufs(file->f_path.dentry->d_sb)
 	    && au_fi(file))
-		snprintf(a, sizeof(a), ", gen %d",
-			 au_figen(file));
+		snprintf(a, sizeof(a), ", gen %d, mmapped %d",
+			 au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
 	dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
 	     bindex, file->f_mode, file->f_flags, (long)file_count(file),
 	     file->f_version, file->f_pos, a);

--- a/fs/aufs/f_op.c
+++ b/fs/aufs/f_op.c
@@ -7,6 +7,8 @@
 * file and vm operations
 */

+#include <linux/fs_stack.h>
+#include <linux/mman.h>
 #include "aufs.h"

 int au_do_open_nondir(struct file *file, int flags)
@@ -24,6 +26,7 @@ int au_do_open_nondir(struct file *file, int flags)
 	dentry = file->f_path.dentry;
 	finfo = au_fi(file);
 	memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
+	atomic_set(&finfo->fi_mmapped, 0);
 	bindex = au_dbtop(dentry);
 	h_file = au_h_open(dentry, bindex, flags, file);
 	if (IS_ERR(h_file))
@@ -81,9 +84,180 @@ int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)

 /* ---------------------------------------------------------------------- */

+struct au_write_pre {
+	/* input */
+	unsigned int lsc;
+
+	/* output */
+	blkcnt_t blks;
+	aufs_bindex_t btop;
+};
+
+/*
+ * return with iinfo is write-locked
+ * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
+ * end
+ */
+static struct file *au_write_pre(struct file *file, int do_ready,
+				 struct au_write_pre *wpre)
+{
+	struct file *h_file;
+	struct dentry *dentry;
+	int err;
+	unsigned int lsc;
+	struct au_pin pin;
+
+	lsc = 0;
+	if (wpre)
+		lsc = wpre->lsc;
+	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1, lsc);
+	h_file = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	if (do_ready) {
+		err = au_ready_to_write(file, -1, &pin);
+		if (unlikely(err)) {
+			h_file = ERR_PTR(err);
+			di_write_unlock(dentry);
+			goto out_fi;
+		}
+	}
+
+	di_downgrade_lock(dentry, /*flags*/0);
+	if (wpre)
+		wpre->btop = au_fbtop(file);
+	h_file = au_hf_top(file);
+	get_file(h_file);
+	if (wpre)
+		wpre->blks = file_inode(h_file)->i_blocks;
+	if (do_ready)
+		au_unpin(&pin);
+	di_read_unlock(dentry, /*flags*/0);
+
+out_fi:
+	fi_write_unlock(file);
+out:
+	return h_file;
+}
+
+/*
+ * The locking order around current->mmap_sem.
+ * - in most and regular cases
+ *   file I/O syscall -- aufs_read() or something
+ *	-- si_rwsem for read -- mmap_sem
+ *	(Note that [fdi]i_rwsem are released before mmap_sem).
+ * - in mmap case
+ *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
+ * This AB-BA order is definitely bad, but is not a problem since "si_rwsem for
+ * read" allows multiple processes to acquire it and [fdi]i_rwsem are not held
+ * in file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
+ * It means that when aufs acquires si_rwsem for write, the process should never
+ * acquire mmap_sem.
+ *
+ * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
+ * problem either since any directory is not able to be mmap-ed.
+ * The similar scenario is applied to aufs_readlink() too.
+ */
+
+#if 0 /* stop calling security_file_mmap() */
+/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
+#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)
+
+static unsigned long au_arch_prot_conv(unsigned long flags)
+{
+	/* currently ppc64 only */
+#ifdef CONFIG_PPC64
+	/* cf. linux/arch/powerpc/include/asm/mman.h */
+	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
+	return AuConv_VM_PROT(flags, SAO);
+#else
+	AuDebugOn(arch_calc_vm_prot_bits(-1));
+	return 0;
+#endif
+}
+
+static unsigned long au_prot_conv(unsigned long flags)
+{
+	return AuConv_VM_PROT(flags, READ)
+		| AuConv_VM_PROT(flags, WRITE)
+		| AuConv_VM_PROT(flags, EXEC)
+		| au_arch_prot_conv(flags);
+}
+
+/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
+#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)
+
+static unsigned long au_flag_conv(unsigned long flags)
+{
+	return AuConv_VM_MAP(flags, GROWSDOWN)
+		| AuConv_VM_MAP(flags, DENYWRITE)
+		| AuConv_VM_MAP(flags, LOCKED);
+}
+#endif
+
+static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err;
+	const unsigned char wlock
+		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
+	struct super_block *sb;
+	struct file *h_file;
+	struct inode *inode;
+
+	AuDbgVmRegion(file, vma);
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	lockdep_off();
+	si_read_lock(sb, AuLock_NOPLMW);
+
+	h_file = au_write_pre(file, wlock, /*wpre*/NULL);
+	lockdep_on();
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = 0;
+	au_set_mmapped(file);
+	au_vm_file_reset(vma, h_file);
+	/*
+	 * we cannot call security_mmap_file() here since it may acquire
+	 * mmap_sem or i_mutex.
+	 *
+	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
+	 *			 au_flag_conv(vma->vm_flags));
+	 */
+	if (!err)
+		err = call_mmap(h_file, vma);
+	if (!err) {
+		au_vm_prfile_set(vma, file);
+		fsstack_copy_attr_atime(inode, file_inode(h_file));
+		goto out_fput; /* success */
+	}
+	au_unset_mmapped(file);
+	au_vm_file_reset(vma, file);
+
+out_fput:
+	lockdep_off();
+	ii_write_unlock(inode);
+	lockdep_on();
+	fput(h_file);
+out:
+	lockdep_off();
+	si_read_unlock(sb);
+	lockdep_on();
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
 const struct file_operations aufs_file_fop = {
 	.owner		= THIS_MODULE,

+	.mmap		= aufs_mmap,
 	.open		= aufs_open_nondir,
 	.release	= aufs_release_nondir
 };
--- a/fs/aufs/file.c
+++ b/fs/aufs/file.c
@@ -431,7 +431,7 @@ static void au_do_refresh_dir(struct file *file)
 	}

 	p = fidir->fd_hfile;
-	if (!d_unlinked(file->f_path.dentry)) {
+	if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
 		bbot = au_sbbot(sb);
 		for (finfo->fi_btop = 0; finfo->fi_btop <= bbot;
 		     finfo->fi_btop++, p++)
@@ -492,7 +492,8 @@ static int refresh_file(struct file *file, int (*reopen)(struct file *file))

 	err = 0;
 	need_reopen = 1;
-	err = au_file_refresh_by_inode(file, &need_reopen);
+	if (!au_test_mmapped(file))
+		err = au_file_refresh_by_inode(file, &need_reopen);
 	if (finfo->fi_hdir)
 		/* harmless if err */
 		au_fidir_realloc(finfo, nbr, /*may_shrink*/1);

--- a/fs/aufs/file.h
+++ b/fs/aufs/file.h
@@ -14,6 +14,7 @@

 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/mm_types.h>
 #include "rwsem.h"

 struct au_branch;
@@ -43,7 +44,10 @@ struct au_finfo {
 	aufs_bindex_t		fi_btop;

 	/* do not union them */
-	struct au_hfile		fi_htop;	/* for non-dir */
+	struct {				/* for non-dir */
+		struct au_hfile			fi_htop;
+		atomic_t			fi_mmapped;
+	};
 	struct au_fidir		*fi_hdir;	/* for dir only */
 	struct rcu_head		rcu;
 } ____cacheline_aligned_in_smp;
@@ -216,5 +220,74 @@ static inline unsigned int au_figen(struct file *f)
 	return atomic_read(&au_fi(f)->fi_generation);
 }

+static inline void au_set_mmapped(struct file *f)
+{
+	if (atomic_inc_return(&au_fi(f)->fi_mmapped))
+		return;
+	pr_warn("fi_mmapped wrapped around\n");
+	while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
+		;
+}
+
+static inline void au_unset_mmapped(struct file *f)
+{
+	atomic_dec(&au_fi(f)->fi_mmapped);
+}
+
+static inline int au_test_mmapped(struct file *f)
+{
+	return atomic_read(&au_fi(f)->fi_mmapped);
+}
+
+/* customize vma->vm_file */
+
+static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
+				       struct file *file)
+{
+	struct file *f;
+
+	f = vma->vm_file;
+	get_file(file);
+	vma->vm_file = file;
+	fput(f);
+}
+
+#ifdef CONFIG_MMU
+#define AuDbgVmRegion(file, vma) do {} while (0)
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	au_do_vm_file_reset(vma, file);
+}
+#else
+#define AuDbgVmRegion(file, vma) \
+	AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	struct file *f;
+
+	au_do_vm_file_reset(vma, file);
+	f = vma->vm_region->vm_file;
+	get_file(file);
+	vma->vm_region->vm_file = file;
+	fput(f);
+}
+#endif /* CONFIG_MMU */
+
+/* handle vma->vm_prfile */
+static inline void au_vm_prfile_set(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	get_file(file);
+	vma->vm_prfile = file;
+#ifndef CONFIG_MMU
+	get_file(file);
+	vma->vm_region->vm_prfile = file;
+#endif
+}
+
 #endif /* __KERNEL__ */
 #endif /* __AUFS_FILE_H__ */