Skip to content

Commit b3e19d9

Browse files
author
Nick Piggin
committed
fs: scale mntget/mntput
The problem that this patch aims to fix is vfsmount refcounting scalability. We need to take a reference on the vfsmount for every successful path lookup, and those lookups often go to the same mount point.

The fundamental difficulty is that a "simple" reference count can never be made scalable, because any time a reference is dropped, we must check whether that was the last reference. To do that requires communication with all other CPUs that may have taken a reference count.

We can make refcounts more scalable in a couple of ways, involving keeping distributed counters, and checking for the global-zero condition less frequently.

- check the global sum once every interval (this will delay zero detection for some interval, so it's probably a showstopper for vfsmounts).

- keep a local count and only take the global sum when the local count reaches 0 (this is difficult for vfsmounts, because we can't hold preempt off for the life of a reference, so a counter would need to be per-thread or tied strongly to a particular CPU, which requires more locking).

- keep a local difference of increments and decrements, which allows us to sum the total difference and hence find the refcount when summing all CPUs. Then, keep a single integer "long" refcount for slow and long lasting references, and only take the global sum of local counters when the long refcount is 0.

This last scheme is what I implemented here. Attached mounts and process root and working directory references are "long" references, and everything else is a short reference.

This allows scalable vfsmount references during path walking over mounted subtrees and unattached (lazy umounted) mounts with processes still running in them.

This results in one fewer atomic op in the fastpath: mntget is now just a per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock and non-atomic decrement in the common case. However, the code is otherwise bigger and heavier, so single threaded performance is basically a wash.
Signed-off-by: Nick Piggin <[email protected]>
1 parent c6653a8 commit b3e19d9

File tree

13 files changed

+283
-98
lines changed

13 files changed

+283
-98
lines changed

arch/ia64/kernel/perfmon.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,7 @@ pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt)
15421542
* any operations on the root directory. However, we need a non-trivial
15431543
* d_name - pfm: will go nicely and kill the special-casing in procfs.
15441544
*/
1545-
static struct vfsmount *pfmfs_mnt;
1545+
static struct vfsmount *pfmfs_mnt __read_mostly;
15461546

15471547
static int __init
15481548
init_pfm_fs(void)

drivers/mtd/mtdchar.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1201,7 +1201,7 @@ static int __init init_mtdchar(void)
12011201
static void __exit cleanup_mtdchar(void)
12021202
{
12031203
unregister_mtd_user(&mtdchar_notifier);
1204-
mntput(mtd_inode_mnt);
1204+
mntput_long(mtd_inode_mnt);
12051205
unregister_filesystem(&mtd_inodefs_type);
12061206
__unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd");
12071207
}

fs/anon_inodes.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ static int __init anon_inode_init(void)
232232
return 0;
233233

234234
err_mntput:
235-
mntput(anon_inode_mnt);
235+
mntput_long(anon_inode_mnt);
236236
err_unregister_filesystem:
237237
unregister_filesystem(&anon_inode_fs_type);
238238
err_exit:

fs/fs_struct.c

+16-10
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
1717
write_seqcount_begin(&fs->seq);
1818
old_root = fs->root;
1919
fs->root = *path;
20-
path_get(path);
20+
path_get_long(path);
2121
write_seqcount_end(&fs->seq);
2222
spin_unlock(&fs->lock);
2323
if (old_root.dentry)
24-
path_put(&old_root);
24+
path_put_long(&old_root);
2525
}
2626

2727
/*
@@ -36,12 +36,12 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
3636
write_seqcount_begin(&fs->seq);
3737
old_pwd = fs->pwd;
3838
fs->pwd = *path;
39-
path_get(path);
39+
path_get_long(path);
4040
write_seqcount_end(&fs->seq);
4141
spin_unlock(&fs->lock);
4242

4343
if (old_pwd.dentry)
44-
path_put(&old_pwd);
44+
path_put_long(&old_pwd);
4545
}
4646

4747
void chroot_fs_refs(struct path *old_root, struct path *new_root)
@@ -59,13 +59,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
5959
write_seqcount_begin(&fs->seq);
6060
if (fs->root.dentry == old_root->dentry
6161
&& fs->root.mnt == old_root->mnt) {
62-
path_get(new_root);
62+
path_get_long(new_root);
6363
fs->root = *new_root;
6464
count++;
6565
}
6666
if (fs->pwd.dentry == old_root->dentry
6767
&& fs->pwd.mnt == old_root->mnt) {
68-
path_get(new_root);
68+
path_get_long(new_root);
6969
fs->pwd = *new_root;
7070
count++;
7171
}
@@ -76,13 +76,13 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
7676
} while_each_thread(g, p);
7777
read_unlock(&tasklist_lock);
7878
while (count--)
79-
path_put(old_root);
79+
path_put_long(old_root);
8080
}
8181

8282
void free_fs_struct(struct fs_struct *fs)
8383
{
84-
path_put(&fs->root);
85-
path_put(&fs->pwd);
84+
path_put_long(&fs->root);
85+
path_put_long(&fs->pwd);
8686
kmem_cache_free(fs_cachep, fs);
8787
}
8888

@@ -115,7 +115,13 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
115115
spin_lock_init(&fs->lock);
116116
seqcount_init(&fs->seq);
117117
fs->umask = old->umask;
118-
get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
118+
119+
spin_lock(&old->lock);
120+
fs->root = old->root;
121+
path_get_long(&fs->root);
122+
fs->pwd = old->pwd;
123+
path_get_long(&fs->pwd);
124+
spin_unlock(&old->lock);
119125
}
120126
return fs;
121127
}

fs/internal.h

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ extern int copy_mount_string(const void __user *, char **);
6363

6464
extern void free_vfsmnt(struct vfsmount *);
6565
extern struct vfsmount *alloc_vfsmnt(const char *);
66+
extern unsigned int mnt_get_count(struct vfsmount *mnt);
6667
extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
6768
extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
6869
struct vfsmount *);

fs/namei.c

+24
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,18 @@ void path_get(struct path *path)
367367
}
368368
EXPORT_SYMBOL(path_get);
369369

370+
/**
371+
* path_get_long - get a long reference to a path
372+
* @path: path to take the long reference on
373+
*
374+
* Given a path, take a long reference on its vfsmount via mntget_long() and increment the reference count of its dentry via dget(). Intended to be paired with path_put_long().
375+
*/
376+
void path_get_long(struct path *path)
377+
{
378+
mntget_long(path->mnt);
379+
dget(path->dentry);
380+
}
381+
370382
/**
371383
* path_put - put a reference to a path
372384
* @path: path to put the reference to
@@ -380,6 +392,18 @@ void path_put(struct path *path)
380392
}
381393
EXPORT_SYMBOL(path_put);
382394

395+
/**
396+
* path_put_long - put a long reference to a path
397+
* @path: path whose long reference is being dropped
398+
*
399+
* Given a path, drop the dentry reference via dput() and then drop the long vfsmount reference via mntput_long(). Intended to be paired with path_get_long().
400+
*/
401+
void path_put_long(struct path *path)
402+
{
403+
dput(path->dentry);
404+
mntput_long(path->mnt);
405+
}
406+
383407
/**
384408
* nameidata_drop_rcu - drop this nameidata out of rcu-walk
385409
* @nd: nameidata pathwalk data to drop

0 commit comments

Comments
 (0)