Lennert Buytenhek
2011-04-04 10:06:31 UTC
Hi!
Recently, I ran into the problem of needing an efficient way of doing
incremental backups of virtual machine images. I looked around a bit,
finding the patches that people have written to make rsync work on
block devices, but none of what was out there really suited my needs,
so I ended up hacking up something myself.
My first idea was to write a FUSE module to interpose between a block
device and a virtual representation of that block device in a FUSE
filesystem, relaying all I/O requests to the underlying block device,
but keeping a separate database of per-sector or per-block (or some
other granularity) mtimes. A separate tool would then compare that
database to that of a remote host, and perform an rsync-like operation
to bring the remote host in sync. While this would work, it seemed
overly complex for the job, and it would end up being a half-assed
reimplementation of rsync plus a filesystem.
So then I figured, since *NIX filesystems already track mtimes on a
per-inode basis, why not just keep the master copy of the block device
data in a set of, say, 1 MiB files, and have a FUSE module emulate a
larger file out of this, redirecting the I/Os to the component files?
This way, you don't need to track mtimes yourself (the underlying
filesystem will do that for you), and you can use standard tools like
rsync to synchronise the component files to another host.
The attached FUSE module implements this idea. To get started, do
something like this
cd /some/where
mkdir parts
mkdir parts/volume
for part in `seq 0 99`
do
dd if=/dev/zero of=parts/volume/$part bs=1024k count=1
done
echo 100 > parts/volume/num_parts
echo 1048576 > parts/volume/part_size
mkdir mount
~/bin/blockfs parts/ mount/
In mount/, a file 'volume' will then appear, which should be 100MiB
in size, and which you can dd over, create filesystems on, loopback
mount, etc. The underlying data for the volume will be stored in
the 100 1 MiB component files in parts/volume/, with reads and writes to
the volume file being redirected to those component files.
When a write happens, blockfs will first read the original data for
that byte range from the component file, and if that matches, it won't
perform the write, to avoid updating the mtime on the component file.
With modern filesystems like ext4 and xfs, having many large files in
a single directory shouldn't incur a performance penalty like it would
on other filesystems, e.g. on ext3 without extents or htree, so this
approach should be okay.
There are some things left to be done:
- There should really be some kind of caching in blockfs -- not for
data, but for things like metadata and component file descriptors.
This is currently not done, as it doesn't seem possible to do things
like setting timers or asking the fuse main loop to poll on some
file descriptors for you (e.g. inotify fds for the component directory).
At some point I want to look into integrating the fuse main loop
with ivykis (http://libivykis.sourceforge.net/man3/ivykis.3.html)
to be able to address this.
- There should be an option for suspending I/O while a backup of the
backing store is made, so that you don't end up with a backup copy
of your block device that has had writes to it reordered.
- Establish some kind of rule of thumb for what the optimal component
file size is. This probably depends on hardware specs, method of
synchronising the data to the slave cop{y,ies}, write pattern and
write intensity, etc, but it should be possible to figure out a sane
default (or a set of defaults) to recommend.
- The name -- 'blockfs' doesn't really cover the functionality of this
module very well. Any better ideas?
Any comments otherwise?
cheers,
Lennert
=== blockfs.c
#define PACKAGE_VERSION "0.1"
#define _GNU_SOURCE
#define FUSE_USE_VERSION 26
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <pthread.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/* Descriptor of the backing directory; opened in main() and used as the openat() base for volume directories. */
static int backing_dir_fd;
/* Held by iter_volumes() for the whole duration of a scan of the backing directory. */
static pthread_mutex_t readdir_lock;
/*
 * Report whether the directory referred to by dirfd looks like a volume
 * directory, i.e. whether both "num_parts" and "part_size" can be
 * stat'ed inside it.  Returns 1 if so, 0 otherwise.
 */
static int is_volume_dir(int dirfd)
{
	static const char *const required[] = { "num_parts", "part_size" };
	struct stat st;
	size_t idx;

	for (idx = 0; idx < sizeof(required) / sizeof(required[0]); idx++) {
		if (fstatat(dirfd, required[idx], &st, 0) < 0)
			return 0;
	}

	return 1;
}
/*
 * Open the directory `volume` relative to the backing directory and check
 * that it is a volume directory.
 *
 * Returns an open directory descriptor (to be closed by the caller) on
 * success, -ENOENT if the directory cannot be opened, or -EINVAL if it
 * lacks the volume property files.
 */
static int open_volume_dir(const char *volume)
{
	const int oflags = O_RDONLY | O_DIRECTORY | O_NOATIME;
	int vol_fd = openat(backing_dir_fd, volume, oflags);

	if (vol_fd < 0)
		return -ENOENT;

	if (is_volume_dir(vol_fd))
		return vol_fd;

	close(vol_fd);
	return -EINVAL;
}
/*
 * Call iter(cookie, name, dirfd) once for every volume directory found
 * directly inside the backing directory.
 *
 * Skipped are: entries whose type is known and is not a directory,
 * entries whose name starts with '.', and entries that open_volume_dir()
 * rejects.  dirfd is only valid during the callback; it is closed here
 * afterwards.  The whole scan runs with readdir_lock held.
 *
 * If the backing directory cannot be duplicated or opened as a stream,
 * the callback is never invoked and no error is reported.
 */
static void
iter_volumes(void *cookie, void (*iter)(void *cookie, char *volume, int dirfd))
{
	struct dirent *ent;
	DIR *dirp;
	int fd;

	pthread_mutex_lock(&readdir_lock);

	/* closedir() closes the stream's descriptor, so scan a duplicate. */
	fd = dup(backing_dir_fd);
	if (fd < 0)
		goto out;

	dirp = fdopendir(fd);
	if (dirp == NULL) {
		close(fd);
		goto out;
	}

	/* The duplicate shares its file offset with backing_dir_fd. */
	rewinddir(dirp);

	/*
	 * readdir() instead of the deprecated readdir_r(): the stream is
	 * private to this call and the scan is serialised by readdir_lock,
	 * and readdir() has no fixed-size entry buffer to overflow.
	 */
	while ((ent = readdir(dirp)) != NULL) {
		int dirfd;

		if (ent->d_type != DT_DIR && ent->d_type != DT_UNKNOWN)
			continue;

		if (ent->d_name[0] == '.')
			continue;

		dirfd = open_volume_dir(ent->d_name);
		if (dirfd < 0)
			continue;

		iter(cookie, ent->d_name, dirfd);

		close(dirfd);
	}

	closedir(dirp);

out:
	pthread_mutex_unlock(&readdir_lock);
}
/*
 * Read an unsigned decimal number from the file `name` inside the
 * directory dirfd and store it in *_value.
 *
 * Returns 0 on success.  Returns -1, leaving *_value untouched, if the
 * file cannot be opened or does not start with a number.
 */
static int read_property(int dirfd, char *name, unsigned int *_value)
{
	unsigned int parsed;
	FILE *stream;
	int ok;
	int prop_fd = openat(dirfd, name, O_RDONLY | O_NOATIME);

	if (prop_fd < 0)
		return -1;

	stream = fdopen(prop_fd, "r");
	if (stream == NULL) {
		close(prop_fd);
		return -1;
	}

	ok = (fscanf(stream, "%u", &parsed) == 1);
	if (ok)
		*_value = parsed;

	/* fclose() also closes prop_fd. */
	fclose(stream);

	return ok ? 0 : -1;
}
/*
 * iter_volumes() callback: bump the unsigned int counter that cookie
 * points to.  The volume name and descriptor are not used.
 */
static void count_volumes_cb(void *cookie, char *volume, int dirfd)
{
	unsigned int *counter = cookie;

	*counter += 1;
}
/* Return the number of volume directories iter_volumes() currently finds. */
static int count_volumes(void)
{
	unsigned int total;

	total = 0;
	iter_volumes(&total, count_volumes_cb);

	return total;
}
/*
 * Format the file name of component file number `part` into buf, which
 * holds buf_size bytes.  The name is the part number in decimal.
 *
 * Returns what snprintf() returns: the length the full name needs
 * (excluding the terminating NUL), which exceeds buf_size - 1 if the
 * name was truncated.
 */
static int mkpartname(char *buf, int buf_size, unsigned int part)
{
	/* %u, not %d: part is unsigned and must not print as negative. */
	return snprintf(buf, buf_size, "%u", part);
}
/*
 * Open component file number `part` inside the volume directory dirfd.
 * O_NOATIME is always added to `flags`.  Returns whatever openat()
 * returns: a descriptor, or -1 with errno set.
 */
static int open_part(int dirfd, unsigned int part, int flags)
{
	char name[32];
	int oflags = flags | O_NOATIME;

	mkpartname(name, sizeof(name), part);

	return openat(dirfd, name, oflags);
}
/* Accumulator filled in by blockfs_getattr_cb() while the parts of a volume are stat'ed. */
struct blockfs_getattr_info {
/* Size every part is compared against; set by the caller before iterating. */
unsigned int part_size;
/* Number of parts whose st_size differed from part_size. */
int part_size_mismatches;
/* Latest access, modification and status-change times seen among the parts so far. */
struct timespec st_atim;
struct timespec st_mtim;
struct timespec st_ctim;
};
/*
 * Three-way comparison of two timestamps: returns 1 if *a is later than
 * *b, -1 if it is earlier, and 0 if both are equal.  Seconds are compared
 * first, nanoseconds break ties.
 */
static int timespec_cmp(struct timespec *a, struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec) ? 1 : -1;

	if (a->tv_nsec != b->tv_nsec)
		return (a->tv_nsec > b->tv_nsec) ? 1 : -1;

	return 0;
}
/*
 * iter_parts_stat() callback used by blockfs_getattr().
 *
 * _info points to a struct blockfs_getattr_info.  The three timestamps
 * in it are raised to those of this part when the part's are later, and
 * the mismatch counter is bumped if the part's size differs from the
 * expected part size.
 */
static void blockfs_getattr_cb(void *_info, unsigned int part, struct stat *buf)
{
	struct blockfs_getattr_info *acc = _info;

	if (timespec_cmp(&buf->st_atim, &acc->st_atim) > 0)
		acc->st_atim = buf->st_atim;

	if (timespec_cmp(&buf->st_mtim, &acc->st_mtim) > 0)
		acc->st_mtim = buf->st_mtim;

	if (timespec_cmp(&buf->st_ctim, &acc->st_ctim) > 0)
		acc->st_ctim = buf->st_ctim;

	if (buf->st_size != acc->part_size)
		acc->part_size_mismatches += 1;
}
/*
 * stat() the component files 0 .. num_parts - 1 inside the volume
 * directory dirfd and call iter(cookie, part, &statbuf) for each one.
 *
 * Returns 0 if every part could be stat'ed.  On the first failure the
 * error is reported with perror(), iteration stops and -1 is returned.
 *
 * dirfd stays owned by the caller and is never closed here: every caller
 * in this file closes it itself, so closing it here as well would be a
 * double close, which in a threaded program can hit a descriptor that
 * another thread has just been handed.
 */
static int
iter_parts_stat(int dirfd, unsigned int num_parts, void *cookie,
		void (*iter)(void *cookie, unsigned int part, struct stat *buf))
{
	unsigned int i;

	for (i = 0; i < num_parts; i++) {
		char partname[32];
		struct stat buf;

		mkpartname(partname, sizeof(partname), i);

		if (fstatat(dirfd, partname, &buf, 0) < 0) {
			perror("fstatat");
			return -1;
		}

		iter(cookie, i, &buf);
	}

	return 0;
}
/*
 * FUSE getattr handler.
 *
 * "/" is reported as a directory with the mode, owner and timestamps of
 * the backing directory and a link count of 2 plus the number of volumes.
 *
 * Any other path is treated as a volume name (the path minus its leading
 * slash) and reported as a regular file of num_parts * part_size bytes,
 * with the owner and the read/write permission bits of the volume
 * directory and the latest timestamps found among its parts.
 *
 * Returns 0 on success, -ENOENT if the path is empty or the volume cannot
 * be opened, and -EINVAL if a property cannot be read, a part cannot be
 * stat'ed, or a part's size differs from part_size.
 */
static int blockfs_getattr(const char *path, struct stat *stbuf)
{
	struct blockfs_getattr_info info;
	struct stat backing;
	unsigned int num_parts;
	unsigned int part_size;
	int dirfd;
	int ret;

	if (path[0] == 0)
		return -ENOENT;

	memset(stbuf, 0, sizeof(struct stat));

	if (strcmp(path, "/") == 0) {
		if (fstat(backing_dir_fd, &backing) < 0)
			return -ENOENT;

		stbuf->st_mode = backing.st_mode;
		stbuf->st_nlink = 2 + count_volumes();
		stbuf->st_uid = backing.st_uid;
		stbuf->st_gid = backing.st_gid;
		stbuf->st_size = 1024;
		stbuf->st_blksize = 1024;
		stbuf->st_blocks = 1;
		stbuf->st_atim = backing.st_atim;
		stbuf->st_mtim = backing.st_mtim;
		stbuf->st_ctim = backing.st_ctim;

		return 0;
	}

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	/* Every failure from here on is reported as -EINVAL. */
	ret = -EINVAL;

	if (read_property(dirfd, "num_parts", &num_parts) < 0)
		goto out;

	if (read_property(dirfd, "part_size", &part_size) < 0)
		goto out;

	if (fstat(dirfd, &backing) < 0) {
		perror("fstat");
		goto out;
	}

	stbuf->st_mode = S_IFREG | (backing.st_mode & 0666);
	stbuf->st_nlink = 1;
	stbuf->st_uid = backing.st_uid;
	stbuf->st_gid = backing.st_gid;
	stbuf->st_size = (uint64_t)num_parts * part_size;
	stbuf->st_blksize = part_size;
	stbuf->st_blocks = ((uint64_t)num_parts * part_size + 511) / 512;

	memset(&info, 0, sizeof(info));
	info.part_size = part_size;
	if (iter_parts_stat(dirfd, num_parts, &info, blockfs_getattr_cb) ||
	    info.part_size_mismatches)
		goto out;

	stbuf->st_atim = info.st_atim;
	stbuf->st_mtim = info.st_mtim;
	stbuf->st_ctim = info.st_ctim;
	ret = 0;

out:
	close(dirfd);
	return ret;
}
/*
 * FUSE truncate handler.  Nothing is resized: the request is ignored and
 * reported as successful (return value 0).
 */
static int blockfs_truncate(const char *path, off_t length)
{
	(void)path;
	(void)length;

	return 0;
}
/*
 * FUSE open handler: check that every component file of the volume is
 * accessible in the way the open flags ask for.
 *
 * Read-only opens need read access to every part.  Any open that allows
 * writing needs read AND write access, because part_write() opens parts
 * O_RDWR and reads the old contents before deciding whether to write.
 *
 * Returns 0 on success, -ENOENT if the path is empty or the volume cannot
 * be opened, -EINVAL if a property cannot be read, and -EPERM if a part
 * fails the access check.
 */
static int blockfs_open(const char *path, struct fuse_file_info *fi)
{
	int dirfd;
	unsigned int num_parts;
	unsigned int part_size;
	unsigned int i;
	int mode;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* The value is unused here; reading it only validates the volume. */
	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	mode = R_OK;
	if ((fi->flags & O_ACCMODE) != O_RDONLY)
		mode |= W_OK;

	for (i = 0; i < num_parts; i++) {
		char partname[32];

		mkpartname(partname, sizeof(partname), i);

		if (faccessat(dirfd, partname, mode, 0) < 0) {
			close(dirfd);
			return -EPERM;
		}
	}

	close(dirfd);

	return 0;
}
/*
 * Read up to num bytes at offset off of component file `part` (inside
 * the volume directory dirfd) into buf.
 *
 * Returns the number of bytes read, or a negative errno value if pread()
 * fails.  If the part cannot be opened, 0 is returned, which the caller
 * sees as a short read rather than as an error.
 */
static int
part_read(int dirfd, unsigned int part, off_t off, char *buf, int num)
{
	int part_fd = open_part(dirfd, part, O_RDONLY);
	int result;

	if (part_fd < 0)
		return 0;

	result = pread(part_fd, buf, num, off);
	if (result < 0) {
		/* Capture errno before close() can overwrite it. */
		result = -errno;
	}

	close(part_fd);

	return result;
}
/*
 * FUSE read handler: read `size` bytes at `offset` of the volume named
 * by `path` into buf, splitting the request at part boundaries and
 * forwarding each piece to part_read().
 *
 * Returns the number of bytes read, which is short if a part read comes
 * up short or the request runs past the last part.  If the very first
 * part read fails, its negative errno value is returned instead.
 * Returns -ENOENT if the path is empty or the volume cannot be opened,
 * and -EINVAL if a property cannot be read or part_size is 0.
 */
static int blockfs_read(const char *path, char *buf, size_t size,
			off_t offset, struct fuse_file_info *fi)
{
	int dirfd;
	int numread;
	unsigned int num_parts;
	unsigned int part_size;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* A zero part size would divide by zero below. */
	if (part_size == 0) {
		close(dirfd);
		return -EINVAL;
	}

	numread = 0;
	while (size) {
		uint64_t part;
		off_t part_off;
		size_t room;
		size_t toread;
		int ret;

		part = offset / part_size;
		/* Past the last part: treat as end of data. */
		if (part >= num_parts)
			break;

		part_off = offset % part_size;

		/* Never cross into the next part in a single read. */
		room = part_size - (size_t)part_off;
		toread = size < room ? size : room;

		ret = part_read(dirfd, (unsigned int)part, part_off, buf, toread);
		if (ret < 0) {
			/* Report the error only if nothing was read yet. */
			if (numread == 0)
				numread = ret;
			break;
		}

		buf += ret;
		size -= ret;
		offset += ret;
		numread += ret;

		if ((size_t)ret != toread)
			break;
	}

	close(dirfd);

	return numread;
}
/*
 * Write num bytes from buf at offset off of component file `part`
 * (inside the volume directory dirfd), unless the file already holds
 * exactly those bytes there.
 *
 * The current contents are read first; the write is only issued when the
 * read came up short or the data differs, so that an unchanged part is
 * not written to.
 *
 * Returns num if the data was already in place, otherwise the result of
 * pwrite() (bytes written, or a negative errno value on failure).  If
 * the part cannot be opened, 0 is returned.
 */
static int
part_write(int dirfd, unsigned int part, off_t off, const char *buf, int num)
{
	char current[num];
	int part_fd;
	int result;

	part_fd = open_part(dirfd, part, O_RDWR);
	if (part_fd < 0)
		return 0;

	result = pread(part_fd, current, num, off);
	if (result == num && memcmp(buf, current, num) == 0) {
		/* Identical data is already there: skip the write. */
		close(part_fd);
		return result;
	}

	result = pwrite(part_fd, buf, num, off);
	if (result < 0)
		result = -errno;

	close(part_fd);

	return result;
}
/*
 * FUSE write handler: write `size` bytes from buf at `offset` of the
 * volume named by `path`, splitting the request at part boundaries and
 * forwarding each piece to part_write().
 *
 * Returns the number of bytes written, which is short if a part write
 * comes up short or the request runs past the last part.  If the very
 * first part write fails, its negative errno value is returned instead.
 * Returns -ENOENT if the path is empty or the volume cannot be opened,
 * and -EINVAL if a property cannot be read or part_size is 0.
 */
static int blockfs_write(const char *path, const char *buf, size_t size,
			 off_t offset, struct fuse_file_info *fi)
{
	int dirfd;
	int numwritten;
	unsigned int num_parts;
	unsigned int part_size;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* A zero part size would divide by zero below. */
	if (part_size == 0) {
		close(dirfd);
		return -EINVAL;
	}

	numwritten = 0;
	while (size) {
		uint64_t part;
		off_t part_off;
		size_t room;
		size_t towrite;
		int ret;

		part = offset / part_size;
		/* Past the last part: nothing more can be written. */
		if (part >= num_parts)
			break;

		part_off = offset % part_size;

		/* Never cross into the next part in a single write. */
		room = part_size - (size_t)part_off;
		towrite = size < room ? size : room;

		ret = part_write(dirfd, (unsigned int)part, part_off, buf, towrite);
		if (ret < 0) {
			/* Report the error only if nothing was written yet. */
			if (numwritten == 0)
				numwritten = ret;
			break;
		}

		buf += ret;
		size -= ret;
		offset += ret;
		numwritten += ret;

		if ((size_t)ret != towrite)
			break;
	}

	close(dirfd);

	return numwritten;
}
/*
 * iter_parts_stat() callback used for statfs: num points to an array of
 * two uint64_t counters, and the size of this part is added to the
 * second one.
 */
static void
blockfs_statfs_part_cb(void *num, unsigned int part, struct stat *buf)
{
	uint64_t *totals = num;

	totals[1] += buf->st_size;
}
/*
 * iter_volumes() callback used for statfs: num points to an array of two
 * uint64_t counters.  The first is bumped once per volume; the sizes of
 * the volume's parts are added to the second (via
 * blockfs_statfs_part_cb) if the volume's num_parts can be read.
 */
static void blockfs_statfs_volume_cb(void *num, char *volume, int dirfd)
{
	uint64_t *totals = num;
	unsigned int num_parts;

	totals[0]++;

	if (read_property(dirfd, "num_parts", &num_parts) < 0)
		return;

	iter_parts_stat(dirfd, num_parts, num, blockfs_statfs_part_cb);
}
/*
 * FUSE statfs handler.
 *
 * Reports a filesystem with 4096-byte blocks that is completely full:
 * the block count is the summed size of all part files divided by 4096,
 * the file count is one more than the number of volumes, and no free
 * blocks or inodes are advertised.  Always returns 0.
 */
static int blockfs_statfs(const char *path, struct statvfs *buf)
{
	enum { STATFS_BLOCK_SIZE = 4096 };
	/* [0]: file count, starting at 1; [1]: total bytes in part files. */
	uint64_t totals[2] = { 1, 0 };

	iter_volumes(totals, blockfs_statfs_volume_cb);

	buf->f_bsize = STATFS_BLOCK_SIZE;
	buf->f_frsize = STATFS_BLOCK_SIZE;

	buf->f_blocks = totals[1] / STATFS_BLOCK_SIZE;
	buf->f_bfree = 0;
	buf->f_bavail = 0;

	buf->f_files = totals[0];
	buf->f_ffree = 0;
	buf->f_favail = 0;

	buf->f_fsid = 0;
	buf->f_flag = 0;
	buf->f_namemax = PATH_MAX;

	return 0;
}
/*
 * FUSE fsync handler.  None of the arguments are looked at: the handler
 * just calls sync() and reports success.
 *
 * FIXME: this should flush only the parts of the volume concerned.
 */
static int blockfs_fsync(const char *path, int datasync,
			 struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	sync();

	return 0;
}
/* Context handed through iter_volumes() to blockfs_readdir_cb(); initialised positionally, so field order matters. */
struct blockfs_readdir_info {
/* Directory buffer passed to the FUSE readdir handler. */
void *buf;
/* FUSE callback that adds one entry to buf. */
fuse_fill_dir_t filler;
};
static void blockfs_readdir_cb(void *_info, char *name, int dirfd)
{
struct blockfs_readdir_info *info = _info;
info->filler(info->buf, name, NULL, 0);
}
/*
 * FUSE readdir handler.  Only "/" is a directory: its listing consists
 * of ".", ".." and one entry per volume.  Any other path yields -ENOENT.
 */
static int blockfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
			   off_t offset, struct fuse_file_info *fi)
{
	static const char *const dot_entries[] = { ".", ".." };
	struct blockfs_readdir_info info;
	size_t idx;

	if (strcmp(path, "/"))
		return -ENOENT;

	for (idx = 0; idx < sizeof(dot_entries) / sizeof(dot_entries[0]); idx++)
		filler(buf, dot_entries[idx], NULL, 0);

	info.buf = buf;
	info.filler = filler;
	iter_volumes(&info, blockfs_readdir_cb);

	return 0;
}
/* FUSE operation table: the handlers this filesystem implements. */
static struct fuse_operations blockfs_oper = {
	/* Filesystem and directory level. */
	.statfs		= blockfs_statfs,
	.readdir	= blockfs_readdir,
	.getattr	= blockfs_getattr,

	/* Volume file level. */
	.open		= blockfs_open,
	.read		= blockfs_read,
	.write		= blockfs_write,
	.truncate	= blockfs_truncate,
	.fsync		= blockfs_fsync,
};
/* Keys the option table maps command-line options to; blockfs_opt_proc() dispatches on them. */
enum {
/* -h / --help */
KEY_HELP,
/* -V / --version */
KEY_VERSION,
};
static struct fuse_opt blockfs_opts[] = {
FUSE_OPT_KEY("-h", KEY_HELP),
FUSE_OPT_KEY("--help", KEY_HELP),
FUSE_OPT_KEY("-V", KEY_VERSION),
FUSE_OPT_KEY("--version", KEY_VERSION),
};
/* Print the blockfs-specific usage text for `progname` to stderr. */
static void usage(const char *progname)
{
	static const char usage_fmt[] =
"Usage: %s backingdir mountpoint [options]\n"
"\n"
"General options:\n"
" -h --help print help\n"
" -V --version print version\n"
"\n";

	fprintf(stderr, usage_fmt, progname);
}
/* Path of the backing directory: the first non-option argument, or NULL if none was seen yet. */
static char *backing_dir;

/*
 * Option callback for fuse_opt_parse().
 *
 * The first non-option argument is taken as the backing directory and
 * removed from the argument list (return 0); later non-option arguments
 * and unrecognised options are kept for FUSE (return 1).  -h/--help and
 * -V/--version print the blockfs text, let fuse_main() print its own
 * help or version text, and exit.
 *
 * Returns -1 if the backing directory name cannot be copied; without
 * that check the next non-option argument (the mountpoint) would
 * silently be taken as the backing directory.
 */
static int blockfs_opt_proc(void *data, const char *arg, int key,
			    struct fuse_args *outargs)
{
	if (key == FUSE_OPT_KEY_NONOPT) {
		if (backing_dir == NULL) {
			backing_dir = strdup(arg);
			if (backing_dir == NULL) {
				perror("strdup");
				return -1;
			}
			return 0;
		}
		return 1;
	}

	if (key == KEY_HELP) {
		usage(outargs->argv[0]);
		fuse_opt_add_arg(outargs, "-ho");
		fuse_main(outargs->argc, outargs->argv, &blockfs_oper, NULL);
		exit(1);
	}

	if (key == KEY_VERSION) {
		/* The banner used to name a different program ("MERGE"). */
		fprintf(stderr, "blockfs version: %s\n", PACKAGE_VERSION);
		fuse_opt_add_arg(outargs, "--version");
		fuse_main(outargs->argc, outargs->argv, &blockfs_oper, NULL);
		exit(0);
	}

	return 1;
}
/*
 * Entry point: parse the command line, open the backing directory,
 * set up the readdir lock and hand control to fuse_main().
 *
 * Exits with status 1 if option parsing fails, no backing directory was
 * given, or the backing directory cannot be opened; otherwise returns
 * whatever fuse_main() returns.
 */
int main(int argc, char *argv[])
{
	struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
	int parse_rc;

	parse_rc = fuse_opt_parse(&args, NULL, blockfs_opts, blockfs_opt_proc);
	if (parse_rc == -1)
		exit(1);

	if (!backing_dir) {
		fprintf(stderr, "missing backing dir\n");
		fprintf(stderr, "see '%s -h' for usage\n", argv[0]);
		exit(1);
	}

	backing_dir_fd = open(backing_dir, O_RDONLY | O_DIRECTORY | O_NOATIME);
	if (backing_dir_fd < 0) {
		perror("open");
		return 1;
	}

	pthread_mutex_init(&readdir_lock, NULL);

	return fuse_main(args.argc, args.argv, &blockfs_oper, NULL);
}
Recently, I ran into the problem of needing an efficient way of doing
incremental backups of virtual machine images. I looked around a bit,
finding the patches that people have written to make rsync work on
block devices, but none of what was out there really suited my needs,
so I ended up hacking up something myself.
My first idea was to write a FUSE module to interpose between a block
device and a virtual representation of that block device in a FUSE
filesystem, relaying all I/O requests to the underlying block device,
but keeping a separate database of per-sector or per-block (or some
other granularity) mtimes. A separate tool would then compare that
database to that of a remote host, and perform an rsync-like operation
to bring the remote host in sync. While this would work, it seemed
overly complex for the job, and it would end up being a half-assed
reimplementation of rsync plus a filesystem.
So then I figured, since *NIX filesystems already track mtimes on a
per-inode basis, why not just keep the master copy of the block device
data in a set of, say, 1 MiB files, and have a FUSE module emulate a
larger file out of this, redirecting the I/Os to the component files?
This way, you don't need to track mtimes yourself (the underlying
filesystem will do that for you), and you can use standard tools like
rsync to synchronise the component files to another host.
The attached FUSE module implements this idea. To get started, do
something like this
cd /some/where
mkdir parts
mkdir parts/volume
for part in `seq 0 99`
do
dd if=/dev/zero of=parts/volume/$part bs=1024k count=1
done
echo 100 > parts/volume/num_parts
echo 1048576 > parts/volume/part_size
mkdir mount
~/bin/blockfs parts/ mount/
In mount/, a file 'volume' will then appear, which should be 100MiB
in size, and which you can dd over, create filesystems on, loopback
mount, etc. The underlying data for the volume will be stored in
the 100 1 MiB component files in parts/volume/, with reads and writes to
the volume file being redirected to those component files.
When a write happens, blockfs will first read the original data for
that byte range from the component file, and if that matches, it won't
perform the write, to avoid updating the mtime on the component file.
With modern filesystems like ext4 and xfs, having many large files in
a single directory shouldn't incur a performance penalty like it would
on other filesystems, e.g. on ext3 without extents or htree, so this
approach should be okay.
There are some things left to be done:
- There should really be some kind of caching in blockfs -- not for
data, but for things like metadata and component file descriptors.
This is currently not done, as it doesn't seem possible to do things
like setting timers or asking the fuse main loop to poll on some
file descriptors for you (e.g. inotify fds for the component directory).
At some point I want to look into integrating the fuse main loop
with ivykis (http://libivykis.sourceforge.net/man3/ivykis.3.html)
to be able to address this.
- There should be an option for suspending I/O while a backup of the
backing store is made, so that you don't end up with a backup copy
of your block device that has had writes to it reordered.
- Establish some kind of rule of thumb for what the optimal component
file size is. This probably depends on hardware specs, method of
synchronising the data to the slave cop{y,ies}, write pattern and
write intensity, etc, but it should be possible to figure out a sane
default (or a set of defaults) to recommend.
- The name -- 'blockfs' doesn't really cover the functionality of this
module very well. Any better ideas?
Any comments otherwise?
cheers,
Lennert
=== blockfs.c
#define PACKAGE_VERSION "0.1"
#define _GNU_SOURCE
#define FUSE_USE_VERSION 26
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fuse.h>
#include <pthread.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/* Descriptor of the backing directory; opened in main() and used as the openat() base for volume directories. */
static int backing_dir_fd;
/* Held by iter_volumes() for the whole duration of a scan of the backing directory. */
static pthread_mutex_t readdir_lock;
/*
 * Report whether the directory referred to by dirfd looks like a volume
 * directory, i.e. whether both "num_parts" and "part_size" can be
 * stat'ed inside it.  Returns 1 if so, 0 otherwise.
 */
static int is_volume_dir(int dirfd)
{
	static const char *const required[] = { "num_parts", "part_size" };
	struct stat st;
	size_t idx;

	for (idx = 0; idx < sizeof(required) / sizeof(required[0]); idx++) {
		if (fstatat(dirfd, required[idx], &st, 0) < 0)
			return 0;
	}

	return 1;
}
/*
 * Open the directory `volume` relative to the backing directory and check
 * that it is a volume directory.
 *
 * Returns an open directory descriptor (to be closed by the caller) on
 * success, -ENOENT if the directory cannot be opened, or -EINVAL if it
 * lacks the volume property files.
 */
static int open_volume_dir(const char *volume)
{
	const int oflags = O_RDONLY | O_DIRECTORY | O_NOATIME;
	int vol_fd = openat(backing_dir_fd, volume, oflags);

	if (vol_fd < 0)
		return -ENOENT;

	if (is_volume_dir(vol_fd))
		return vol_fd;

	close(vol_fd);
	return -EINVAL;
}
/*
 * Call iter(cookie, name, dirfd) once for every volume directory found
 * directly inside the backing directory.
 *
 * Skipped are: entries whose type is known and is not a directory,
 * entries whose name starts with '.', and entries that open_volume_dir()
 * rejects.  dirfd is only valid during the callback; it is closed here
 * afterwards.  The whole scan runs with readdir_lock held.
 *
 * If the backing directory cannot be duplicated or opened as a stream,
 * the callback is never invoked and no error is reported.
 */
static void
iter_volumes(void *cookie, void (*iter)(void *cookie, char *volume, int dirfd))
{
	struct dirent *ent;
	DIR *dirp;
	int fd;

	pthread_mutex_lock(&readdir_lock);

	/* closedir() closes the stream's descriptor, so scan a duplicate. */
	fd = dup(backing_dir_fd);
	if (fd < 0)
		goto out;

	dirp = fdopendir(fd);
	if (dirp == NULL) {
		close(fd);
		goto out;
	}

	/* The duplicate shares its file offset with backing_dir_fd. */
	rewinddir(dirp);

	/*
	 * readdir() instead of the deprecated readdir_r(): the stream is
	 * private to this call and the scan is serialised by readdir_lock,
	 * and readdir() has no fixed-size entry buffer to overflow.
	 */
	while ((ent = readdir(dirp)) != NULL) {
		int dirfd;

		if (ent->d_type != DT_DIR && ent->d_type != DT_UNKNOWN)
			continue;

		if (ent->d_name[0] == '.')
			continue;

		dirfd = open_volume_dir(ent->d_name);
		if (dirfd < 0)
			continue;

		iter(cookie, ent->d_name, dirfd);

		close(dirfd);
	}

	closedir(dirp);

out:
	pthread_mutex_unlock(&readdir_lock);
}
/*
 * Read an unsigned decimal number from the file `name` inside the
 * directory dirfd and store it in *_value.
 *
 * Returns 0 on success.  Returns -1, leaving *_value untouched, if the
 * file cannot be opened or does not start with a number.
 */
static int read_property(int dirfd, char *name, unsigned int *_value)
{
	unsigned int parsed;
	FILE *stream;
	int ok;
	int prop_fd = openat(dirfd, name, O_RDONLY | O_NOATIME);

	if (prop_fd < 0)
		return -1;

	stream = fdopen(prop_fd, "r");
	if (stream == NULL) {
		close(prop_fd);
		return -1;
	}

	ok = (fscanf(stream, "%u", &parsed) == 1);
	if (ok)
		*_value = parsed;

	/* fclose() also closes prop_fd. */
	fclose(stream);

	return ok ? 0 : -1;
}
/*
 * iter_volumes() callback: bump the unsigned int counter that cookie
 * points to.  The volume name and descriptor are not used.
 */
static void count_volumes_cb(void *cookie, char *volume, int dirfd)
{
	unsigned int *counter = cookie;

	*counter += 1;
}
/* Return the number of volume directories iter_volumes() currently finds. */
static int count_volumes(void)
{
	unsigned int total;

	total = 0;
	iter_volumes(&total, count_volumes_cb);

	return total;
}
/*
 * Format the file name of component file number `part` into buf, which
 * holds buf_size bytes.  The name is the part number in decimal.
 *
 * Returns what snprintf() returns: the length the full name needs
 * (excluding the terminating NUL), which exceeds buf_size - 1 if the
 * name was truncated.
 */
static int mkpartname(char *buf, int buf_size, unsigned int part)
{
	/* %u, not %d: part is unsigned and must not print as negative. */
	return snprintf(buf, buf_size, "%u", part);
}
/*
 * Open component file number `part` inside the volume directory dirfd.
 * O_NOATIME is always added to `flags`.  Returns whatever openat()
 * returns: a descriptor, or -1 with errno set.
 */
static int open_part(int dirfd, unsigned int part, int flags)
{
	char name[32];
	int oflags = flags | O_NOATIME;

	mkpartname(name, sizeof(name), part);

	return openat(dirfd, name, oflags);
}
/* Accumulator filled in by blockfs_getattr_cb() while the parts of a volume are stat'ed. */
struct blockfs_getattr_info {
/* Size every part is compared against; set by the caller before iterating. */
unsigned int part_size;
/* Number of parts whose st_size differed from part_size. */
int part_size_mismatches;
/* Latest access, modification and status-change times seen among the parts so far. */
struct timespec st_atim;
struct timespec st_mtim;
struct timespec st_ctim;
};
/*
 * Three-way comparison of two timestamps: returns 1 if *a is later than
 * *b, -1 if it is earlier, and 0 if both are equal.  Seconds are compared
 * first, nanoseconds break ties.
 */
static int timespec_cmp(struct timespec *a, struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return (a->tv_sec > b->tv_sec) ? 1 : -1;

	if (a->tv_nsec != b->tv_nsec)
		return (a->tv_nsec > b->tv_nsec) ? 1 : -1;

	return 0;
}
/*
 * iter_parts_stat() callback used by blockfs_getattr().
 *
 * _info points to a struct blockfs_getattr_info.  The three timestamps
 * in it are raised to those of this part when the part's are later, and
 * the mismatch counter is bumped if the part's size differs from the
 * expected part size.
 */
static void blockfs_getattr_cb(void *_info, unsigned int part, struct stat *buf)
{
	struct blockfs_getattr_info *acc = _info;

	if (timespec_cmp(&buf->st_atim, &acc->st_atim) > 0)
		acc->st_atim = buf->st_atim;

	if (timespec_cmp(&buf->st_mtim, &acc->st_mtim) > 0)
		acc->st_mtim = buf->st_mtim;

	if (timespec_cmp(&buf->st_ctim, &acc->st_ctim) > 0)
		acc->st_ctim = buf->st_ctim;

	if (buf->st_size != acc->part_size)
		acc->part_size_mismatches += 1;
}
/*
 * stat() the component files 0 .. num_parts - 1 inside the volume
 * directory dirfd and call iter(cookie, part, &statbuf) for each one.
 *
 * Returns 0 if every part could be stat'ed.  On the first failure the
 * error is reported with perror(), iteration stops and -1 is returned.
 *
 * dirfd stays owned by the caller and is never closed here: every caller
 * in this file closes it itself, so closing it here as well would be a
 * double close, which in a threaded program can hit a descriptor that
 * another thread has just been handed.
 */
static int
iter_parts_stat(int dirfd, unsigned int num_parts, void *cookie,
		void (*iter)(void *cookie, unsigned int part, struct stat *buf))
{
	unsigned int i;

	for (i = 0; i < num_parts; i++) {
		char partname[32];
		struct stat buf;

		mkpartname(partname, sizeof(partname), i);

		if (fstatat(dirfd, partname, &buf, 0) < 0) {
			perror("fstatat");
			return -1;
		}

		iter(cookie, i, &buf);
	}

	return 0;
}
/*
 * FUSE getattr handler.
 *
 * "/" is reported as a directory with the mode, owner and timestamps of
 * the backing directory and a link count of 2 plus the number of volumes.
 *
 * Any other path is treated as a volume name (the path minus its leading
 * slash) and reported as a regular file of num_parts * part_size bytes,
 * with the owner and the read/write permission bits of the volume
 * directory and the latest timestamps found among its parts.
 *
 * Returns 0 on success, -ENOENT if the path is empty or the volume cannot
 * be opened, and -EINVAL if a property cannot be read, a part cannot be
 * stat'ed, or a part's size differs from part_size.
 */
static int blockfs_getattr(const char *path, struct stat *stbuf)
{
	struct blockfs_getattr_info info;
	struct stat backing;
	unsigned int num_parts;
	unsigned int part_size;
	int dirfd;
	int ret;

	if (path[0] == 0)
		return -ENOENT;

	memset(stbuf, 0, sizeof(struct stat));

	if (strcmp(path, "/") == 0) {
		if (fstat(backing_dir_fd, &backing) < 0)
			return -ENOENT;

		stbuf->st_mode = backing.st_mode;
		stbuf->st_nlink = 2 + count_volumes();
		stbuf->st_uid = backing.st_uid;
		stbuf->st_gid = backing.st_gid;
		stbuf->st_size = 1024;
		stbuf->st_blksize = 1024;
		stbuf->st_blocks = 1;
		stbuf->st_atim = backing.st_atim;
		stbuf->st_mtim = backing.st_mtim;
		stbuf->st_ctim = backing.st_ctim;

		return 0;
	}

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	/* Every failure from here on is reported as -EINVAL. */
	ret = -EINVAL;

	if (read_property(dirfd, "num_parts", &num_parts) < 0)
		goto out;

	if (read_property(dirfd, "part_size", &part_size) < 0)
		goto out;

	if (fstat(dirfd, &backing) < 0) {
		perror("fstat");
		goto out;
	}

	stbuf->st_mode = S_IFREG | (backing.st_mode & 0666);
	stbuf->st_nlink = 1;
	stbuf->st_uid = backing.st_uid;
	stbuf->st_gid = backing.st_gid;
	stbuf->st_size = (uint64_t)num_parts * part_size;
	stbuf->st_blksize = part_size;
	stbuf->st_blocks = ((uint64_t)num_parts * part_size + 511) / 512;

	memset(&info, 0, sizeof(info));
	info.part_size = part_size;
	if (iter_parts_stat(dirfd, num_parts, &info, blockfs_getattr_cb) ||
	    info.part_size_mismatches)
		goto out;

	stbuf->st_atim = info.st_atim;
	stbuf->st_mtim = info.st_mtim;
	stbuf->st_ctim = info.st_ctim;
	ret = 0;

out:
	close(dirfd);
	return ret;
}
/*
 * FUSE truncate handler.  Nothing is resized: the request is ignored and
 * reported as successful (return value 0).
 */
static int blockfs_truncate(const char *path, off_t length)
{
	(void)path;
	(void)length;

	return 0;
}
/*
 * FUSE open handler: check that every component file of the volume is
 * accessible in the way the open flags ask for.
 *
 * Read-only opens need read access to every part.  Any open that allows
 * writing needs read AND write access, because part_write() opens parts
 * O_RDWR and reads the old contents before deciding whether to write.
 *
 * Returns 0 on success, -ENOENT if the path is empty or the volume cannot
 * be opened, -EINVAL if a property cannot be read, and -EPERM if a part
 * fails the access check.
 */
static int blockfs_open(const char *path, struct fuse_file_info *fi)
{
	int dirfd;
	unsigned int num_parts;
	unsigned int part_size;
	unsigned int i;
	int mode;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* The value is unused here; reading it only validates the volume. */
	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	mode = R_OK;
	if ((fi->flags & O_ACCMODE) != O_RDONLY)
		mode |= W_OK;

	for (i = 0; i < num_parts; i++) {
		char partname[32];

		mkpartname(partname, sizeof(partname), i);

		if (faccessat(dirfd, partname, mode, 0) < 0) {
			close(dirfd);
			return -EPERM;
		}
	}

	close(dirfd);

	return 0;
}
/*
 * Read up to num bytes at offset off of component file `part` (inside
 * the volume directory dirfd) into buf.
 *
 * Returns the number of bytes read, or a negative errno value if pread()
 * fails.  If the part cannot be opened, 0 is returned, which the caller
 * sees as a short read rather than as an error.
 */
static int
part_read(int dirfd, unsigned int part, off_t off, char *buf, int num)
{
	int part_fd = open_part(dirfd, part, O_RDONLY);
	int result;

	if (part_fd < 0)
		return 0;

	result = pread(part_fd, buf, num, off);
	if (result < 0) {
		/* Capture errno before close() can overwrite it. */
		result = -errno;
	}

	close(part_fd);

	return result;
}
/*
 * FUSE read handler: read `size` bytes at `offset` of the volume named
 * by `path` into buf, splitting the request at part boundaries and
 * forwarding each piece to part_read().
 *
 * Returns the number of bytes read, which is short if a part read comes
 * up short or the request runs past the last part.  If the very first
 * part read fails, its negative errno value is returned instead.
 * Returns -ENOENT if the path is empty or the volume cannot be opened,
 * and -EINVAL if a property cannot be read or part_size is 0.
 */
static int blockfs_read(const char *path, char *buf, size_t size,
			off_t offset, struct fuse_file_info *fi)
{
	int dirfd;
	int numread;
	unsigned int num_parts;
	unsigned int part_size;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* A zero part size would divide by zero below. */
	if (part_size == 0) {
		close(dirfd);
		return -EINVAL;
	}

	numread = 0;
	while (size) {
		uint64_t part;
		off_t part_off;
		size_t room;
		size_t toread;
		int ret;

		part = offset / part_size;
		/* Past the last part: treat as end of data. */
		if (part >= num_parts)
			break;

		part_off = offset % part_size;

		/* Never cross into the next part in a single read. */
		room = part_size - (size_t)part_off;
		toread = size < room ? size : room;

		ret = part_read(dirfd, (unsigned int)part, part_off, buf, toread);
		if (ret < 0) {
			/* Report the error only if nothing was read yet. */
			if (numread == 0)
				numread = ret;
			break;
		}

		buf += ret;
		size -= ret;
		offset += ret;
		numread += ret;

		if ((size_t)ret != toread)
			break;
	}

	close(dirfd);

	return numread;
}
/*
 * Write num bytes from buf at offset off of component file `part`
 * (inside the volume directory dirfd), unless the file already holds
 * exactly those bytes there.
 *
 * The current contents are read first; the write is only issued when the
 * read came up short or the data differs, so that an unchanged part is
 * not written to.
 *
 * Returns num if the data was already in place, otherwise the result of
 * pwrite() (bytes written, or a negative errno value on failure).  If
 * the part cannot be opened, 0 is returned.
 */
static int
part_write(int dirfd, unsigned int part, off_t off, const char *buf, int num)
{
	char current[num];
	int part_fd;
	int result;

	part_fd = open_part(dirfd, part, O_RDWR);
	if (part_fd < 0)
		return 0;

	result = pread(part_fd, current, num, off);
	if (result == num && memcmp(buf, current, num) == 0) {
		/* Identical data is already there: skip the write. */
		close(part_fd);
		return result;
	}

	result = pwrite(part_fd, buf, num, off);
	if (result < 0)
		result = -errno;

	close(part_fd);

	return result;
}
/*
 * FUSE write handler: write `size` bytes from buf at `offset` of the
 * volume named by `path`, splitting the request at part boundaries and
 * forwarding each piece to part_write().
 *
 * Returns the number of bytes written, which is short if a part write
 * comes up short or the request runs past the last part.  If the very
 * first part write fails, its negative errno value is returned instead.
 * Returns -ENOENT if the path is empty or the volume cannot be opened,
 * and -EINVAL if a property cannot be read or part_size is 0.
 */
static int blockfs_write(const char *path, const char *buf, size_t size,
			 off_t offset, struct fuse_file_info *fi)
{
	int dirfd;
	int numwritten;
	unsigned int num_parts;
	unsigned int part_size;

	if (path[0] == 0)
		return -ENOENT;

	dirfd = open_volume_dir(path + 1);
	if (dirfd < 0)
		return -ENOENT;

	if (read_property(dirfd, "num_parts", &num_parts) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	if (read_property(dirfd, "part_size", &part_size) < 0) {
		close(dirfd);
		return -EINVAL;
	}

	/* A zero part size would divide by zero below. */
	if (part_size == 0) {
		close(dirfd);
		return -EINVAL;
	}

	numwritten = 0;
	while (size) {
		uint64_t part;
		off_t part_off;
		size_t room;
		size_t towrite;
		int ret;

		part = offset / part_size;
		/* Past the last part: nothing more can be written. */
		if (part >= num_parts)
			break;

		part_off = offset % part_size;

		/* Never cross into the next part in a single write. */
		room = part_size - (size_t)part_off;
		towrite = size < room ? size : room;

		ret = part_write(dirfd, (unsigned int)part, part_off, buf, towrite);
		if (ret < 0) {
			/* Report the error only if nothing was written yet. */
			if (numwritten == 0)
				numwritten = ret;
			break;
		}

		buf += ret;
		size -= ret;
		offset += ret;
		numwritten += ret;

		if ((size_t)ret != towrite)
			break;
	}

	close(dirfd);

	return numwritten;
}
/*
 * iter_parts_stat() callback used for statfs: num points to an array of
 * two uint64_t counters, and the size of this part is added to the
 * second one.
 */
static void
blockfs_statfs_part_cb(void *num, unsigned int part, struct stat *buf)
{
	uint64_t *totals = num;

	totals[1] += buf->st_size;
}
/*
 * iter_volumes() callback used for statfs: num points to an array of two
 * uint64_t counters.  The first is bumped once per volume; the sizes of
 * the volume's parts are added to the second (via
 * blockfs_statfs_part_cb) if the volume's num_parts can be read.
 */
static void blockfs_statfs_volume_cb(void *num, char *volume, int dirfd)
{
	uint64_t *totals = num;
	unsigned int num_parts;

	totals[0]++;

	if (read_property(dirfd, "num_parts", &num_parts) < 0)
		return;

	iter_parts_stat(dirfd, num_parts, num, blockfs_statfs_part_cb);
}
/*
 * FUSE statfs handler.
 *
 * Reports a filesystem with 4096-byte blocks that is completely full:
 * the block count is the summed size of all part files divided by 4096,
 * the file count is one more than the number of volumes, and no free
 * blocks or inodes are advertised.  Always returns 0.
 */
static int blockfs_statfs(const char *path, struct statvfs *buf)
{
	enum { STATFS_BLOCK_SIZE = 4096 };
	/* [0]: file count, starting at 1; [1]: total bytes in part files. */
	uint64_t totals[2] = { 1, 0 };

	iter_volumes(totals, blockfs_statfs_volume_cb);

	buf->f_bsize = STATFS_BLOCK_SIZE;
	buf->f_frsize = STATFS_BLOCK_SIZE;

	buf->f_blocks = totals[1] / STATFS_BLOCK_SIZE;
	buf->f_bfree = 0;
	buf->f_bavail = 0;

	buf->f_files = totals[0];
	buf->f_ffree = 0;
	buf->f_favail = 0;

	buf->f_fsid = 0;
	buf->f_flag = 0;
	buf->f_namemax = PATH_MAX;

	return 0;
}
/*
 * FUSE fsync handler.  None of the arguments are looked at: the handler
 * just calls sync() and reports success.
 *
 * FIXME: this should flush only the parts of the volume concerned.
 */
static int blockfs_fsync(const char *path, int datasync,
			 struct fuse_file_info *fi)
{
	(void)path;
	(void)datasync;
	(void)fi;

	sync();

	return 0;
}
/* Context handed through iter_volumes() to blockfs_readdir_cb(); initialised positionally, so field order matters. */
struct blockfs_readdir_info {
/* Directory buffer passed to the FUSE readdir handler. */
void *buf;
/* FUSE callback that adds one entry to buf. */
fuse_fill_dir_t filler;
};
static void blockfs_readdir_cb(void *_info, char *name, int dirfd)
{
struct blockfs_readdir_info *info = _info;
info->filler(info->buf, name, NULL, 0);
}
/*
 * FUSE readdir handler.  Only "/" is a directory: its listing consists
 * of ".", ".." and one entry per volume.  Any other path yields -ENOENT.
 */
static int blockfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
			   off_t offset, struct fuse_file_info *fi)
{
	static const char *const dot_entries[] = { ".", ".." };
	struct blockfs_readdir_info info;
	size_t idx;

	if (strcmp(path, "/"))
		return -ENOENT;

	for (idx = 0; idx < sizeof(dot_entries) / sizeof(dot_entries[0]); idx++)
		filler(buf, dot_entries[idx], NULL, 0);

	info.buf = buf;
	info.filler = filler;
	iter_volumes(&info, blockfs_readdir_cb);

	return 0;
}
/* FUSE operation table: the handlers this filesystem implements. */
static struct fuse_operations blockfs_oper = {
	/* Filesystem and directory level. */
	.statfs		= blockfs_statfs,
	.readdir	= blockfs_readdir,
	.getattr	= blockfs_getattr,

	/* Volume file level. */
	.open		= blockfs_open,
	.read		= blockfs_read,
	.write		= blockfs_write,
	.truncate	= blockfs_truncate,
	.fsync		= blockfs_fsync,
};
/* Keys the option table maps command-line options to; blockfs_opt_proc() dispatches on them. */
enum {
/* -h / --help */
KEY_HELP,
/* -V / --version */
KEY_VERSION,
};
static struct fuse_opt blockfs_opts[] = {
FUSE_OPT_KEY("-h", KEY_HELP),
FUSE_OPT_KEY("--help", KEY_HELP),
FUSE_OPT_KEY("-V", KEY_VERSION),
FUSE_OPT_KEY("--version", KEY_VERSION),
};
/* Print the blockfs-specific usage text for `progname` to stderr. */
static void usage(const char *progname)
{
	static const char usage_fmt[] =
"Usage: %s backingdir mountpoint [options]\n"
"\n"
"General options:\n"
" -h --help print help\n"
" -V --version print version\n"
"\n";

	fprintf(stderr, usage_fmt, progname);
}
/* Path of the backing directory: the first non-option argument, or NULL if none was seen yet. */
static char *backing_dir;

/*
 * Option callback for fuse_opt_parse().
 *
 * The first non-option argument is taken as the backing directory and
 * removed from the argument list (return 0); later non-option arguments
 * and unrecognised options are kept for FUSE (return 1).  -h/--help and
 * -V/--version print the blockfs text, let fuse_main() print its own
 * help or version text, and exit.
 *
 * Returns -1 if the backing directory name cannot be copied; without
 * that check the next non-option argument (the mountpoint) would
 * silently be taken as the backing directory.
 */
static int blockfs_opt_proc(void *data, const char *arg, int key,
			    struct fuse_args *outargs)
{
	if (key == FUSE_OPT_KEY_NONOPT) {
		if (backing_dir == NULL) {
			backing_dir = strdup(arg);
			if (backing_dir == NULL) {
				perror("strdup");
				return -1;
			}
			return 0;
		}
		return 1;
	}

	if (key == KEY_HELP) {
		usage(outargs->argv[0]);
		fuse_opt_add_arg(outargs, "-ho");
		fuse_main(outargs->argc, outargs->argv, &blockfs_oper, NULL);
		exit(1);
	}

	if (key == KEY_VERSION) {
		/* The banner used to name a different program ("MERGE"). */
		fprintf(stderr, "blockfs version: %s\n", PACKAGE_VERSION);
		fuse_opt_add_arg(outargs, "--version");
		fuse_main(outargs->argc, outargs->argv, &blockfs_oper, NULL);
		exit(0);
	}

	return 1;
}
/*
 * Entry point: parse the command line, open the backing directory,
 * set up the readdir lock and hand control to fuse_main().
 *
 * Exits with status 1 if option parsing fails, no backing directory was
 * given, or the backing directory cannot be opened; otherwise returns
 * whatever fuse_main() returns.
 */
int main(int argc, char *argv[])
{
	struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
	int parse_rc;

	parse_rc = fuse_opt_parse(&args, NULL, blockfs_opts, blockfs_opt_proc);
	if (parse_rc == -1)
		exit(1);

	if (!backing_dir) {
		fprintf(stderr, "missing backing dir\n");
		fprintf(stderr, "see '%s -h' for usage\n", argv[0]);
		exit(1);
	}

	backing_dir_fd = open(backing_dir, O_RDONLY | O_DIRECTORY | O_NOATIME);
	if (backing_dir_fd < 0) {
		perror("open");
		return 1;
	}

	pthread_mutex_init(&readdir_lock, NULL);

	return fuse_main(args.argc, args.argv, &blockfs_oper, NULL);
}