You've already forked linux-apfs
mirror of
https://github.com/linux-apfs/linux-apfs.git
synced 2026-05-01 15:00:59 -07:00
[LogFS] add new flash file system
This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel <joern@logfs.org>
This commit is contained in:
@@ -62,6 +62,8 @@ jfs.txt
|
||||
- info and mount options for the JFS filesystem.
|
||||
locks.txt
|
||||
- info on file locking implementations, flock() vs. fcntl(), etc.
|
||||
logfs.txt
|
||||
- info on the LogFS flash filesystem.
|
||||
mandatory-locking.txt
|
||||
- info on the Linux implementation of Sys V mandatory file locking.
|
||||
ncpfs.txt
|
||||
|
||||
@@ -0,0 +1,241 @@
|
||||
|
||||
The LogFS Flash Filesystem
|
||||
==========================
|
||||
|
||||
Specification
|
||||
=============
|
||||
|
||||
Superblocks
|
||||
-----------
|
||||
|
||||
Two superblocks exist at the beginning and end of the filesystem.
|
||||
Each superblock is 256 Bytes large, with another 3840 Bytes reserved
|
||||
for future purposes, making a total of 4096 Bytes.
|
||||
|
||||
Superblock locations may differ for MTD and block devices. On MTD the
|
||||
first non-bad block contains a superblock in the first 4096 Bytes and
|
||||
the last non-bad block contains a superblock in the last 4096 Bytes.
|
||||
On block devices, the first 4096 Bytes of the device contain the first
|
||||
superblock and the last aligned 4096 Byte-block contains the second
|
||||
superblock.
|
||||
|
||||
For the most part, the superblocks can be considered read-only. They
|
||||
are written only to correct errors detected within the superblocks,
|
||||
move the journal and change the filesystem parameters through tunefs.
|
||||
As a result, the superblock does not contain any fields that require
|
||||
constant updates, like the amount of free space, etc.
|
||||
|
||||
Segments
|
||||
--------
|
||||
|
||||
The space in the device is split up into equal-sized segments.
|
||||
Segments are the primary write unit of LogFS.  Within each segment,
|
||||
writes happen from front (low addresses) to back (high addresses).  If
|
||||
only a partial segment has been written, the segment number, the
|
||||
current position within and optionally a write buffer are stored in
|
||||
the journal.
|
||||
|
||||
Segments are erased as a whole. Therefore Garbage Collection may be
|
||||
required to completely free a segment before doing so.
|
||||
|
||||
Journal
|
||||
-------
|
||||
|
||||
The journal contains all global information about the filesystem that
|
||||
is subject to frequent change. At mount time, it has to be scanned
|
||||
for the most recent commit entry, which contains a list of pointers to
|
||||
all currently valid entries.
|
||||
|
||||
Object Store
|
||||
------------
|
||||
|
||||
All space except for the superblocks and journal is part of the object
|
||||
store. Each segment contains a segment header and a number of
|
||||
objects, each consisting of the object header and the payload.
|
||||
Objects are either inodes, directory entries (dentries), file data
|
||||
blocks or indirect blocks.
|
||||
|
||||
Levels
|
||||
------
|
||||
|
||||
Garbage collection (GC) may fail if all data is written
|
||||
indiscriminately.  One requirement of GC is that data is separated
|
||||
roughly according to the distance between the tree root and the data.
|
||||
Effectively that means all file data is on level 0, indirect blocks
|
||||
are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
|
||||
respectively. Inode file data is on level 6 for the inodes and 7-11
|
||||
for indirect blocks.
|
||||
|
||||
Each segment contains objects of a single level only. As a result,
|
||||
each level requires its own separate segment to be open for writing.
|
||||
|
||||
Inode File
|
||||
----------
|
||||
|
||||
All inodes are stored in a special file, the inode file. Single
|
||||
exception is the inode file's inode (master inode) which for obvious
|
||||
reasons is stored in the journal instead. Instead of data blocks, the
|
||||
leaf nodes of the inode files are inodes.
|
||||
|
||||
Aliases
|
||||
-------
|
||||
|
||||
Writes in LogFS are done by means of a wandering tree. A naïve
|
||||
implementation would require that for each write of a block, all
|
||||
parent blocks are written as well, since the block pointers have
|
||||
changed. Such an implementation would not be very efficient.
|
||||
|
||||
In LogFS, the block pointer changes are cached in the journal by means
|
||||
of alias entries. Each alias consists of its logical address - inode
|
||||
number, block index, level and child number (index into block) - and
|
||||
the changed data.  Any 8-byte word can be changed in this manner.
|
||||
|
||||
Currently aliases are used for block pointers, file size, file used
|
||||
bytes and the height of an inode's indirect tree.
|
||||
|
||||
Segment Aliases
|
||||
---------------
|
||||
|
||||
Related to regular aliases, these are used to handle bad blocks.
|
||||
Initially, bad blocks are handled by moving the affected segment
|
||||
content to a spare segment and noting this move in the journal with a
|
||||
segment alias, a simple (to, from) tuple.  GC will later empty this
|
||||
segment and the alias can be removed again. This is used on MTD only.
|
||||
|
||||
Vim
|
||||
---
|
||||
|
||||
By cleverly predicting the life time of data, it is possible to
|
||||
separate long-living data from short-living data and thereby reduce
|
||||
the GC overhead later.  Each type of distinct life expectancy (vim) can
|
||||
have a separate segment open for writing.  Each (level, vim) tuple can
|
||||
be open just once. If an open segment with unknown vim is encountered
|
||||
at mount time, it is closed and ignored henceforth.
|
||||
|
||||
Indirect Tree
|
||||
-------------
|
||||
|
||||
Inodes in LogFS are similar to FFS-style filesystems with direct and
|
||||
indirect block pointers. One difference is that LogFS uses a single
|
||||
indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
|
||||
A height field in the inode defines the height of the indirect tree
|
||||
and thereby the indirection of the pointer.
|
||||
|
||||
Another difference is the addressing of indirect blocks. In LogFS,
|
||||
the first 16 pointers in the first indirect block are left empty,
|
||||
corresponding to the 16 direct pointers in the inode. In ext2 (maybe
|
||||
others as well) the first pointer in the first indirect block
|
||||
corresponds to logical block 12, skipping the 12 direct pointers.
|
||||
So where ext2 is using arithmetic to better utilize space, LogFS keeps
|
||||
arithmetic simple and uses compression to save space.
|
||||
|
||||
Compression
|
||||
-----------
|
||||
|
||||
Both file data and metadata can be compressed. Compression for file
|
||||
data can be enabled with chattr +c and disabled with chattr -c. Doing
|
||||
so has no effect on existing data, but new data will be stored
|
||||
accordingly. New inodes will inherit the compression flag of the
|
||||
parent directory.
|
||||
|
||||
Metadata is always compressed. However, the space accounting ignores
|
||||
this and charges for the uncompressed size. Failing to do so could
|
||||
result in GC failures when, after moving some data, indirect blocks
|
||||
compress worse than previously. Even on a 100% full medium, GC may
|
||||
not consume any extra space, so the compression gains are lost space
|
||||
to the user.
|
||||
|
||||
However, they are not lost space to the filesystem internals. By
|
||||
cheating the user for those bytes, the filesystem gained some slack
|
||||
space and GC will run less often and faster.
|
||||
|
||||
Garbage Collection and Wear Leveling
|
||||
------------------------------------
|
||||
|
||||
Garbage collection is invoked whenever the number of free segments
|
||||
falls below a threshold. The best (known) candidate is picked based
|
||||
on the least amount of valid data contained in the segment. All
|
||||
remaining valid data is copied elsewhere, thereby invalidating it.
|
||||
|
||||
The GC code also checks for aliases and writes them back if their
|
||||
number gets too large.
|
||||
|
||||
Wear leveling is done by occasionally picking a suboptimal segment for
|
||||
garbage collection. If a stale segments erase count is significantly
|
||||
lower than the active segments' erase counts, it will be picked. Wear
|
||||
leveling is rate limited, so it will never monopolize the device for
|
||||
more than one segment worth at a time.
|
||||
|
||||
Values for "occasionally", "significantly lower" are compile time
|
||||
constants.
|
||||
|
||||
Hashed directories
|
||||
------------------
|
||||
|
||||
To satisfy efficient lookup(), directory entries are hashed and
|
||||
located based on the hash. In order to both support large directories
|
||||
and not be overly inefficient for small directories, several hash
|
||||
tables of increasing size are used. For each table, the hash value
|
||||
modulo the table size gives the table index.
|
||||
|
||||
Tables sizes are chosen to limit the number of indirect blocks with a
|
||||
fully populated table to 0, 1, 2 or 3 respectively. So the first
|
||||
table contains 16 entries, the second 512-16, etc.
|
||||
|
||||
The last table is special in several ways. First its size depends on
|
||||
the effective 32bit limit on telldir/seekdir cookies. Since logfs
|
||||
uses the upper half of the address space for indirect blocks, the size
|
||||
is limited to 2^31. Secondly the table contains hash buckets with 16
|
||||
entries each.
|
||||
|
||||
Using single-entry buckets would result in birthday "attacks". At
|
||||
just 2^16 used entries, hash collisions would be likely (P >= 0.5).
|
||||
My math skills are insufficient to do the combinatorics for the 17x
|
||||
collisions necessary to overflow a bucket, but testing showed that in
|
||||
10,000 runs the lowest directory fill before a bucket overflow was
|
||||
188,057,130 entries with an average of 315,149,915 entries. So for
|
||||
directory sizes of up to a million, bucket overflows should be
|
||||
virtually impossible under normal circumstances.
|
||||
|
||||
With carefully chosen filenames, it is obviously possible to cause an
|
||||
overflow with just 21 entries (4 higher tables + 16 entries + 1). So
|
||||
there may be a security concern if a malicious user has write access
|
||||
to a directory.
|
||||
|
||||
Open For Discussion
|
||||
===================
|
||||
|
||||
Device Address Space
|
||||
--------------------
|
||||
|
||||
A device address space is used for caching. Both block devices and
|
||||
MTD provide functions to either read a single page or write a segment.
|
||||
Partial segments may be written for data integrity, but where possible
|
||||
complete segments are written for performance on simple block device
|
||||
flash media.
|
||||
|
||||
Meta Inodes
|
||||
-----------
|
||||
|
||||
Inodes are stored in the inode file, which is just a regular file for
|
||||
most purposes. At umount time, however, the inode file needs to
|
||||
remain open until all dirty inodes are written. So
|
||||
generic_shutdown_super() may not close this inode, but shouldn't
|
||||
complain about remaining inodes due to the inode file either. Same
|
||||
goes for mapping inode of the device address space.
|
||||
|
||||
Currently logfs uses a hack that essentially copies part of fs/inode.c
|
||||
code over. A general solution would be preferred.
|
||||
|
||||
Indirect block mapping
|
||||
----------------------
|
||||
|
||||
With compression, the block device (or mapping inode) cannot be used
|
||||
to cache indirect blocks. Some other place is required. Currently
|
||||
logfs uses the top half of each inode's address space. The low 8TB
|
||||
(on 32bit) are filled with file data, the high 8TB are used for
|
||||
indirect blocks.
|
||||
|
||||
One problem is that 16TB files created on 64bit systems actually have
|
||||
data in the top 8TB. But files >16TB would cause problems anyway, so
|
||||
only the limit has changed.
|
||||
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
|
||||
source "fs/jffs2/Kconfig"
|
||||
# UBIFS File system configuration
|
||||
source "fs/ubifs/Kconfig"
|
||||
source "fs/logfs/Kconfig"
|
||||
source "fs/cramfs/Kconfig"
|
||||
source "fs/squashfs/Kconfig"
|
||||
source "fs/freevxfs/Kconfig"
|
||||
|
||||
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
|
||||
obj-$(CONFIG_UFS_FS) += ufs/
|
||||
obj-$(CONFIG_EFS_FS) += efs/
|
||||
obj-$(CONFIG_JFFS2_FS) += jffs2/
|
||||
obj-$(CONFIG_LOGFS) += logfs/
|
||||
obj-$(CONFIG_UBIFS_FS) += ubifs/
|
||||
obj-$(CONFIG_AFFS_FS) += affs/
|
||||
obj-$(CONFIG_ROMFS_FS) += romfs/
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
config LOGFS
|
||||
tristate "LogFS file system (EXPERIMENTAL)"
|
||||
depends on (MTD || BLOCK) && EXPERIMENTAL
|
||||
select ZLIB_INFLATE
|
||||
select ZLIB_DEFLATE
|
||||
select CRC32
|
||||
select BTREE
|
||||
help
|
||||
Flash filesystem aimed to scale efficiently to large devices.
|
||||
In comparison to JFFS2 it offers significantly faster mount
|
||||
times and potentially less RAM usage, although the latter has
|
||||
not been measured yet.
|
||||
|
||||
In its current state it is still very experimental and should
|
||||
not be used for other than testing purposes.
|
||||
|
||||
If unsure, say N.
|
||||
@@ -0,0 +1,13 @@
|
||||
obj-$(CONFIG_LOGFS) += logfs.o
|
||||
|
||||
logfs-y += compr.o
|
||||
logfs-y += dir.o
|
||||
logfs-y += file.o
|
||||
logfs-y += gc.o
|
||||
logfs-y += inode.o
|
||||
logfs-y += journal.o
|
||||
logfs-y += readwrite.o
|
||||
logfs-y += segment.o
|
||||
logfs-y += super.o
|
||||
logfs-$(CONFIG_BLOCK) += dev_bdev.o
|
||||
logfs-$(CONFIG_MTD) += dev_mtd.o
|
||||
@@ -0,0 +1,95 @@
|
||||
/*
|
||||
* fs/logfs/compr.c - compression routines
|
||||
*
|
||||
* As should be obvious for Linux kernel code, license is GPLv2
|
||||
*
|
||||
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
||||
*/
|
||||
#include "logfs.h"
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/zlib.h>
|
||||
|
||||
#define COMPR_LEVEL 3
|
||||
|
||||
static DEFINE_MUTEX(compr_mutex);
|
||||
static struct z_stream_s stream;
|
||||
|
||||
/*
 * logfs_compress - deflate a buffer with zlib
 * @in:		source buffer
 * @out:	destination buffer
 * @inlen:	number of bytes to compress
 * @outlen:	capacity of @out
 *
 * Returns the compressed length on success, -EIO on any zlib error.
 * Also returns -EIO when the result would not be smaller than the
 * input, so callers can fall back to storing the data uncompressed.
 * The single global stream is serialized by compr_mutex.
 */
int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
{
	int err, ret;

	ret = -EIO;
	mutex_lock(&compr_mutex);
	err = zlib_deflateInit(&stream, COMPR_LEVEL);
	if (err != Z_OK)
		goto error;

	stream.next_in = in;
	stream.avail_in = inlen;
	stream.total_in = 0;
	stream.next_out = out;
	stream.avail_out = outlen;
	stream.total_out = 0;

	err = zlib_deflate(&stream, Z_FINISH);
	if (err != Z_STREAM_END)
		goto error;

	err = zlib_deflateEnd(&stream);
	if (err != Z_OK)
		goto error;

	/* Compression that saves no space is treated as failure. */
	if (stream.total_out >= stream.total_in)
		goto error;

	ret = stream.total_out;
error:
	mutex_unlock(&compr_mutex);
	return ret;
}
|
||||
|
||||
/*
 * logfs_uncompress - inflate a zlib-compressed buffer
 * @in:		compressed source buffer
 * @out:	destination buffer
 * @inlen:	compressed length
 * @outlen:	capacity of @out (the expected uncompressed size)
 *
 * Returns 0 on success, -EIO on any zlib error.  The single global
 * stream is serialized by compr_mutex.
 */
int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
{
	int err, ret;

	ret = -EIO;
	mutex_lock(&compr_mutex);
	err = zlib_inflateInit(&stream);
	if (err != Z_OK)
		goto error;

	stream.next_in = in;
	stream.avail_in = inlen;
	stream.total_in = 0;
	stream.next_out = out;
	stream.avail_out = outlen;
	stream.total_out = 0;

	err = zlib_inflate(&stream, Z_FINISH);
	if (err != Z_STREAM_END)
		goto error;

	err = zlib_inflateEnd(&stream);
	if (err != Z_OK)
		goto error;

	ret = 0;
error:
	mutex_unlock(&compr_mutex);
	return ret;
}
|
||||
|
||||
/*
 * Allocate one zlib workspace large enough for both deflate and
 * inflate; both directions share it under compr_mutex.
 */
int __init logfs_compr_init(void)
{
	size_t deflate_size = zlib_deflate_workspacesize();
	size_t inflate_size = zlib_inflate_workspacesize();

	stream.workspace = vmalloc(max(deflate_size, inflate_size));
	return stream.workspace ? 0 : -ENOMEM;
}
|
||||
|
||||
/* Free the zlib workspace allocated by logfs_compr_init(). */
void logfs_compr_exit(void)
{
	vfree(stream.workspace);
}
|
||||
@@ -0,0 +1,263 @@
|
||||
/*
|
||||
* fs/logfs/dev_bdev.c - Device access methods for block devices
|
||||
*
|
||||
* As should be obvious for Linux kernel code, license is GPLv2
|
||||
*
|
||||
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
||||
*/
|
||||
#include "logfs.h"
|
||||
#include <linux/bio.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/buffer_head.h>
|
||||
|
||||
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
|
||||
|
||||
static void request_complete(struct bio *bio, int err)
|
||||
{
|
||||
complete((struct completion *)bio->bi_private);
|
||||
}
|
||||
|
||||
/*
 * Synchronously read or write one page at its device offset.
 * The bio and its single bio_vec live on the stack; request_complete()
 * signals @complete and this function sleeps until the I/O is done.
 * Returns 0 on success or -EIO if the bio did not end up uptodate.
 */
static int sync_request(struct page *page, struct block_device *bdev, int rw)
{
	struct bio bio;
	struct bio_vec bio_vec;
	struct completion complete;

	bio_init(&bio);
	bio.bi_io_vec = &bio_vec;
	bio_vec.bv_page = page;
	bio_vec.bv_len = PAGE_SIZE;
	bio_vec.bv_offset = 0;
	bio.bi_vcnt = 1;
	bio.bi_idx = 0;
	bio.bi_size = PAGE_SIZE;
	bio.bi_bdev = bdev;
	/* Device sector is derived from the page-cache index. */
	bio.bi_sector = page->index * (PAGE_SIZE >> 9);
	init_completion(&complete);
	bio.bi_private = &complete;
	bio.bi_end_io = request_complete;

	submit_bio(rw, &bio);
	/* Unplug so the request is issued immediately, not batched. */
	generic_unplug_device(bdev_get_queue(bdev));
	wait_for_completion(&complete);
	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
}
|
||||
|
||||
static int bdev_readpage(void *_sb, struct page *page)
|
||||
{
|
||||
struct super_block *sb = _sb;
|
||||
struct block_device *bdev = logfs_super(sb)->s_bdev;
|
||||
int err;
|
||||
|
||||
err = sync_request(page, bdev, READ);
|
||||
if (err) {
|
||||
ClearPageUptodate(page);
|
||||
SetPageError(page);
|
||||
} else {
|
||||
SetPageUptodate(page);
|
||||
ClearPageError(page);
|
||||
}
|
||||
unlock_page(page);
|
||||
return err;
|
||||
}
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(wq);
|
||||
|
||||
/*
 * Completion handler for segment writeback bios.  Walks the bio_vec
 * array backwards, ending writeback on each page, then wakes any
 * bdev_sync() waiter once the pending-write count reaches zero.
 * I/O errors are currently fatal (see FIXME below).
 */
static void writeseg_end_io(struct bio *bio, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct super_block *sb = bio->bi_private;
	struct logfs_super *super = logfs_super(sb);
	struct page *page;

	BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
	BUG_ON(err);
	BUG_ON(bio->bi_vcnt == 0);
	do {
		page = bvec->bv_page;
		/* Prefetch the next page's flags while finishing this one. */
		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		end_page_writeback(page);
	} while (bvec >= bio->bi_io_vec);
	bio_put(bio);
	if (atomic_dec_and_test(&super->s_pending_writes))
		wake_up(&wq);
}
|
||||
|
||||
/*
 * Write @nr_pages page-cache pages, starting at @index, to device
 * offset @ofs.  Pages are gathered into one bio when possible; the
 * block layer cannot split bios, so when max_pages is reached the
 * current bio is submitted and a fresh one started.  Every submitted
 * bio bumps s_pending_writes, undone by writeseg_end_io().
 */
static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
		size_t nr_pages)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct bio *bio;
	struct page *page;
	struct request_queue *q = bdev_get_queue(sb->s_bdev);
	unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
	int i;

	bio = bio_alloc(GFP_NOFS, max_pages);
	BUG_ON(!bio); /* FIXME: handle this */

	for (i = 0; i < nr_pages; i++) {
		if (i >= max_pages) {
			/* Block layer cannot split bios :( */
			bio->bi_vcnt = i;
			bio->bi_idx = 0;
			bio->bi_size = i * PAGE_SIZE;
			bio->bi_bdev = super->s_bdev;
			bio->bi_sector = ofs >> 9;
			bio->bi_private = sb;
			bio->bi_end_io = writeseg_end_io;
			atomic_inc(&super->s_pending_writes);
			submit_bio(WRITE, bio);

			/* Rebase offset/index for the remaining pages. */
			ofs += i * PAGE_SIZE;
			index += i;
			nr_pages -= i;
			i = 0;

			bio = bio_alloc(GFP_NOFS, max_pages);
			BUG_ON(!bio);
		}
		/* NOTE(review): the reference taken by find_lock_page is
		 * not visibly dropped here or in writeseg_end_io — verify
		 * against the rest of the writeback path. */
		page = find_lock_page(mapping, index + i);
		BUG_ON(!page);
		bio->bi_io_vec[i].bv_page = page;
		bio->bi_io_vec[i].bv_len = PAGE_SIZE;
		bio->bi_io_vec[i].bv_offset = 0;

		BUG_ON(PageWriteback(page));
		set_page_writeback(page);
		unlock_page(page);
	}
	/* Submit the final (possibly partial) bio. */
	bio->bi_vcnt = nr_pages;
	bio->bi_idx = 0;
	bio->bi_size = nr_pages * PAGE_SIZE;
	bio->bi_bdev = super->s_bdev;
	bio->bi_sector = ofs >> 9;
	bio->bi_private = sb;
	bio->bi_end_io = writeseg_end_io;
	atomic_inc(&super->s_pending_writes);
	submit_bio(WRITE, bio);
	return 0;
}
|
||||
|
||||
/*
 * Write a byte range back to the device, expanded to whole pages.
 * A zero length is legal: it happens when an object fit a segment
 * exactly, the segment was written per sync and subsequently closed.
 */
static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	int pad;

	BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);

	if (!len)
		return;

	/* Round the range out to page boundaries. */
	pad = ofs & (PAGE_SIZE - 1);
	ofs -= pad;
	len = PAGE_ALIGN(len + pad);

	__bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
	generic_unplug_device(bdev_get_queue(super->s_bdev));
}
|
||||
|
||||
/*
 * "Erase" a range on a block device: block devices need no physical
 * erase, so only the cached copies are reset to 0xFF to mimic freshly
 * erased flash.  @to and @len must be page-aligned.
 * Returns 0, or -EROFS on read-only mounts.
 *
 * Fix: use the existing @super local instead of re-evaluating
 * logfs_super(sb) for the flags check, consistent with mtd_write()
 * and the other helpers in this file.
 */
static int bdev_erase(struct super_block *sb, loff_t to, size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct page *page;
	pgoff_t index = to >> PAGE_SHIFT;
	int i, nr_pages = len >> PAGE_SHIFT;

	BUG_ON(to & (PAGE_SIZE - 1));
	BUG_ON(len & (PAGE_SIZE - 1));

	if (super->s_flags & LOGFS_SB_FLAG_RO)
		return -EROFS;

	for (i = 0; i < nr_pages; i++) {
		page = find_get_page(mapping, index + i);
		if (page) {
			/* NOTE(review): page is not locked while being
			 * memset — confirm callers serialize access. */
			memset(page_address(page), 0xFF, PAGE_SIZE);
			page_cache_release(page);
		}
	}
	return 0;
}
|
||||
|
||||
static void bdev_sync(struct super_block *sb)
|
||||
{
|
||||
struct logfs_super *super = logfs_super(sb);
|
||||
|
||||
wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
|
||||
}
|
||||
|
||||
/* On block devices the first superblock always lives in page 0. */
static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
{
	filler_t *filler = bdev_readpage;
	struct address_space *mapping;

	mapping = logfs_super(sb)->s_mapping_inode->i_mapping;
	*ofs = 0;
	return read_cache_page(mapping, 0, filler, sb);
}
|
||||
|
||||
/*
 * The second superblock occupies the last 4096-byte aligned block of
 * the device: mask the device size down to 4k alignment (~0xfffULL),
 * then step back one 4k block.
 */
static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	filler_t *filler = bdev_readpage;
	u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
	pgoff_t index = pos >> PAGE_SHIFT;

	*ofs = pos;
	return read_cache_page(mapping, index, filler, sb);
}
|
||||
|
||||
static int bdev_write_sb(struct super_block *sb, struct page *page)
|
||||
{
|
||||
struct block_device *bdev = logfs_super(sb)->s_bdev;
|
||||
|
||||
/* Nothing special to do for block devices. */
|
||||
return sync_request(page, bdev, WRITE);
|
||||
}
|
||||
|
||||
/* Release the exclusively opened block device at unmount time. */
static void bdev_put_device(struct super_block *sb)
{
	close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
}
|
||||
|
||||
/* Device operation table used when logfs is mounted on a block device. */
static const struct logfs_device_ops bd_devops = {
	.find_first_sb = bdev_find_first_sb,
	.find_last_sb = bdev_find_last_sb,
	.write_sb = bdev_write_sb,
	.readpage = bdev_readpage,
	.writeseg = bdev_writeseg,
	.erase = bdev_erase,
	.sync = bdev_sync,
	.put_device = bdev_put_device,
};
|
||||
|
||||
/*
 * Mount entry point for block devices.  If the named device turns out
 * to be an mtdblock device, it is closed again and the mount is
 * rerouted to the native MTD backend by device number.
 */
int logfs_get_sb_bdev(struct file_system_type *type, int flags,
		const char *devname, struct vfsmount *mnt)
{
	struct block_device *bdev;

	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	/* mtdblock: prefer the MTD path, which handles bad blocks. */
	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
		int mtdnr = MINOR(bdev->bd_dev);
		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
		return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
	}

	return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
}
|
||||
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* fs/logfs/dev_mtd.c - Device access methods for MTD
|
||||
*
|
||||
* As should be obvious for Linux kernel code, license is GPLv2
|
||||
*
|
||||
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
||||
*/
|
||||
#include "logfs.h"
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/mount.h>
#include <linux/sched.h>
|
||||
|
||||
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
|
||||
|
||||
/*
 * Read @len bytes at @ofs into @buf through the MTD layer.
 * Returns 0 on success or a negative errno; short reads are reported
 * as -EIO.  -EINVAL from the driver would indicate a logfs bug, hence
 * the BUG_ON.
 */
static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
{
	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
	size_t retlen;
	int ret;

	ret = mtd->read(mtd, ofs, len, &retlen, buf);
	BUG_ON(ret == -EINVAL);
	if (ret)
		return ret;

	/* Not sure if we should loop instead. */
	if (retlen != len)
		return -EIO;

	return 0;
}
|
||||
|
||||
/*
 * Write @len bytes from @buf to flash at @ofs.  @ofs must be aligned
 * to the device write size and @len must fit one page-cache page.
 * Returns 0 on success, -EROFS on read-only mounts, -EIO on failed or
 * short writes.
 *
 * Fix: drop the dead locals page_start/page_end, which were computed
 * but never used.
 */
static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
{
	struct logfs_super *super = logfs_super(sb);
	struct mtd_info *mtd = super->s_mtd;
	size_t retlen;
	int ret;

	if (super->s_flags & LOGFS_SB_FLAG_RO)
		return -EROFS;

	BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
	BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
	BUG_ON(len > PAGE_CACHE_SIZE);
	ret = mtd->write(mtd, ofs, len, &retlen, buf);
	if (ret || (retlen != len))
		return -EIO;

	return 0;
}
|
||||
|
||||
/*
|
||||
* For as long as I can remember (since about 2001) mtd->erase has been an
|
||||
* asynchronous interface lacking the first driver to actually use the
|
||||
* asynchronous properties. So just to prevent the first implementor of such
|
||||
* a thing from breaking logfs in 2350, we do the usual pointless dance to
|
||||
* declare a completion variable and wait for completion before returning
|
||||
* from mtd_erase(). What an excercise in futility!
|
||||
*/
|
||||
static void logfs_erase_callback(struct erase_info *ei)
|
||||
{
|
||||
complete((struct completion *)ei->priv);
|
||||
}
|
||||
|
||||
/*
 * Drop cached copies of an erased range: any page still present in
 * the mapping inode is overwritten with 0xFF so it matches the
 * freshly erased flash contents.  Always returns 0.
 *
 * Fix: @index was initialized at its declaration and then immediately
 * re-initialized by the for loop; drop the redundant initializer.
 */
static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct page *page;
	pgoff_t index;

	for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT;
			index++) {
		page = find_get_page(mapping, index);
		if (!page)
			continue;
		memset(page_address(page), 0xFF, PAGE_SIZE);
		page_cache_release(page);
	}
	return 0;
}
|
||||
|
||||
/*
 * Erase @len bytes at @ofs.  mtd->erase() is asynchronous, so an
 * on-stack completion is passed via ei.priv and waited for (see the
 * comment above logfs_erase_callback).  On success the page-cache
 * copies of the range are reset to 0xFF as well.
 * Returns 0, -EROFS on read-only mounts, or -EIO on erase failure.
 */
static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len)
{
	struct mtd_info *mtd = logfs_super(sb)->s_mtd;
	struct erase_info ei;
	DECLARE_COMPLETION_ONSTACK(complete);
	int ret;

	BUG_ON(len % mtd->erasesize);
	if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
		return -EROFS;

	memset(&ei, 0, sizeof(ei));
	ei.mtd = mtd;
	ei.addr = ofs;
	ei.len = len;
	ei.callback = logfs_erase_callback;
	ei.priv = (long)&complete;
	ret = mtd->erase(mtd, &ei);
	if (ret)
		return -EIO;

	wait_for_completion(&complete);
	if (ei.state != MTD_ERASE_DONE)
		return -EIO;
	return mtd_erase_mapping(sb, ofs, len);
}
|
||||
|
||||
static void mtd_sync(struct super_block *sb)
|
||||
{
|
||||
struct mtd_info *mtd = logfs_super(sb)->s_mtd;
|
||||
|
||||
if (mtd->sync)
|
||||
mtd->sync(mtd);
|
||||
}
|
||||
|
||||
/*
 * Page-cache filler for MTD-backed mounts.  -EUCLEAN (corrected
 * bitflips) is not treated as an error: the data is good, though the
 * segment should eventually be garbage collected (see FIXME).
 */
static int mtd_readpage(void *_sb, struct page *page)
{
	struct super_block *sb = _sb;
	int err;

	err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
			page_address(page));
	if (err == -EUCLEAN) {
		err = 0;
		/* FIXME: force GC this segment */
	}
	if (err) {
		ClearPageUptodate(page);
		SetPageError(page);
	} else {
		SetPageUptodate(page);
		ClearPageError(page);
	}
	unlock_page(page);
	return err;
}
|
||||
|
||||
/*
 * Locate the first superblock: the first 4096 bytes of the first
 * non-bad erase block.  Returns NULL if the driver cannot report bad
 * blocks or the entire device is bad.
 */
static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	filler_t *filler = mtd_readpage;
	struct mtd_info *mtd = super->s_mtd;

	if (!mtd->block_isbad)
		return NULL;

	*ofs = 0;
	/* Skip leading bad blocks. */
	while (mtd->block_isbad(mtd, *ofs)) {
		*ofs += mtd->erasesize;
		if (*ofs >= mtd->size)
			return NULL;
	}
	BUG_ON(*ofs & ~PAGE_MASK);
	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
}
|
||||
|
||||
/*
 * Locate the second superblock: the last 4096 bytes of the last
 * non-bad erase block.  Returns NULL if the driver cannot report bad
 * blocks or no usable block is found.
 *
 * Fix: the old loop checked "*ofs <= 0" after decrementing, but *ofs
 * is an unsigned u64, so the test could only ever catch exactly zero
 * and the decrement could wrap below block 0 unnoticed.  Guard before
 * the decrement instead, which also bails out once only block 0
 * (already excluded by the original code) would remain.
 */
static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	filler_t *filler = mtd_readpage;
	struct mtd_info *mtd = super->s_mtd;

	if (!mtd->block_isbad)
		return NULL;

	*ofs = mtd->size - mtd->erasesize;
	while (mtd->block_isbad(mtd, *ofs)) {
		if (*ofs <= mtd->erasesize)
			return NULL;
		*ofs -= mtd->erasesize;
	}
	/* Superblock sits in the final 4096 bytes of the block. */
	*ofs = *ofs + mtd->erasesize - 0x1000;
	BUG_ON(*ofs & ~PAGE_MASK);
	return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
}
|
||||
|
||||
/*
 * Write @nr_pages page-cache pages, starting at @index, to flash one
 * page at a time.  The pages must already exist in the mapping inode.
 * Returns 0 or the first write error.
 *
 * NOTE(review): @ofs is unused here; the flash offset comes from each
 * page's index instead — confirm the two always agree for callers.
 */
static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
		size_t nr_pages)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct page *page;
	int i, err;

	for (i = 0; i < nr_pages; i++) {
		page = find_lock_page(mapping, index + i);
		BUG_ON(!page);

		err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
				page_address(page));
		unlock_page(page);
		page_cache_release(page);
		if (err)
			return err;
	}
	return 0;
}
|
||||
|
||||
/*
 * Write a byte range back to flash, expanded to whole pages.
 * Silently does nothing on read-only mounts.
 */
static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	int head;

	if (super->s_flags & LOGFS_SB_FLAG_RO)
		return;

	if (len == 0) {
		/* This can happen when the object fit perfectly into a
		 * segment, the segment gets written per sync and subsequently
		 * closed.
		 */
		return;
	}
	/* Round the range down/up to page boundaries. */
	head = ofs & (PAGE_SIZE - 1);
	if (head) {
		ofs -= head;
		len += head;
	}
	len = PAGE_ALIGN(len);
	__mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
}
|
||||
|
||||
/* Drop the reference taken by get_mtd_device() at mount time. */
static void mtd_put_device(struct super_block *sb)
{
	put_mtd_device(logfs_super(sb)->s_mtd);
}
|
||||
|
||||
/*
 * Device operation table for mounts sitting directly on MTD.
 * Unlike bd_devops, no .write_sb method is set here.
 */
static const struct logfs_device_ops mtd_devops = {
	.find_first_sb = mtd_find_first_sb,
	.find_last_sb = mtd_find_last_sb,
	.readpage = mtd_readpage,
	.writeseg = mtd_writeseg,
	.erase = mtd_erase,
	.sync = mtd_sync,
	.put_device = mtd_put_device,
};
|
||||
|
||||
int logfs_get_sb_mtd(struct file_system_type *type, int flags,
|
||||
int mtdnr, struct vfsmount *mnt)
|
||||
{
|
||||
struct mtd_info *mtd;
|
||||
const struct logfs_device_ops *devops = &mtd_devops;
|
||||
|
||||
mtd = get_mtd_device(NULL, mtdnr);
|
||||
return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
|
||||
}
|
||||
+818
File diff suppressed because it is too large
Load Diff
+263
@@ -0,0 +1,263 @@
|
||||
/*
|
||||
* fs/logfs/file.c - prepare_write, commit_write and friends
|
||||
*
|
||||
* As should be obvious for Linux kernel code, license is GPLv2
|
||||
*
|
||||
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
||||
*/
|
||||
#include "logfs.h"
|
||||
#include <linux/sched.h>
|
||||
#include <linux/writeback.h>
|
||||
|
||||
/*
 * ->write_begin: grab and lock the target page and make sure the parts that
 * will not be overwritten hold valid data.  Three cases:
 *  - full-page write or already-uptodate page: nothing to prepare;
 *  - the page lies entirely at/beyond i_size: zero the untouched parts;
 *  - otherwise: read the page from the medium first.
 * On success the locked page is returned through *pagep.
 */
static int logfs_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct page *page;
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	/* Full overwrite or valid cache copy: no read needed. */
	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
		return 0;
	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
		unsigned end = start + len;

		/* Reading beyond i_size is simple: memset to zero */
		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
		return 0;
	}
	return logfs_readpage_nolock(page);
}
|
||||
|
||||
/*
 * ->write_end: commit @copied bytes written by the caller into the page
 * prepared by logfs_write_begin().  Updates i_size if the write extended the
 * file, then either dirties the page (if reserve space is available) or
 * writes it out synchronously.  Always unlocks and releases the page.
 * Returns the number of bytes committed or a negative error.
 */
static int logfs_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied, struct page *page,
		void *fsdata)
{
	struct inode *inode = mapping->host;
	pgoff_t index = page->index;
	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
	unsigned end = start + copied;
	int ret = 0;

	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
	BUG_ON(page->index > I3_BLOCKS);

	if (copied < len) {
		/*
		 * Short write of a non-initialized page.  Just tell userspace
		 * to retry the entire page.
		 */
		if (!PageUptodate(page)) {
			copied = 0;
			goto out;
		}
	}
	if (copied == 0)
		goto out; /* FIXME: do we need to update inode? */

	/* Extend i_size if the write went past the current end of file. */
	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
		mark_inode_dirty_sync(inode);
	}

	SetPageUptodate(page);
	if (!PageDirty(page)) {
		/* Prefer delayed writeout; fall back to a synchronous write
		 * when no reserve space can be taken for the page. */
		if (!get_page_reserve(inode, page))
			__set_page_dirty_nobuffers(page);
		else
			ret = logfs_write_buf(inode, page, WF_LOCK);
	}
out:
	unlock_page(page);
	page_cache_release(page);
	return ret ? ret : copied;
}
|
||||
|
||||
/* ->readpage entry point: read the page, then drop the page lock the VFS
 * handed us. */
int logfs_readpage(struct file *file, struct page *page)
{
	int err = logfs_readpage_nolock(page);

	unlock_page(page);
	return err;
}
|
||||
|
||||
/* Clear the page's dirty flag in the radix tree. */
/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
 * the dirty bit from the radix tree for filesystems that don't have to wait
 * for page writeback to finish (i.e. any compressing filesystem).
 */
static void clear_radix_tree_dirty(struct page *page)
{
	/* The page must carry no private writeout state at this point. */
	BUG_ON(PagePrivate(page) || page->private);
	/* Entering and immediately leaving writeback clears the radix tree
	 * dirty tag without logfs having to wait for anything. */
	set_page_writeback(page);
	end_page_writeback(page);
}
|
||||
|
||||
static int __logfs_writepage(struct page *page)
|
||||
{
|
||||
struct inode *inode = page->mapping->host;
|
||||
int err;
|
||||
|
||||
err = logfs_write_buf(inode, page, WF_LOCK);
|
||||
if (err)
|
||||
set_page_dirty(page);
|
||||
else
|
||||
clear_radix_tree_dirty(page);
|
||||
unlock_page(page);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * ->writepage: write a single (locked) page.  Indirect blocks are written
 * as-is; data pages that straddle i_size have their tail zeroed first, and
 * pages entirely beyond i_size are skipped (truncate in progress).
 */
static int logfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	u64 bix;
	level_t level;

	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
			page);

	logfs_unpack_index(page->index, &bix, &level);

	/* Indirect blocks are never truncated */
	if (level != 0)
		return __logfs_writepage(page);

	/*
	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
	 * The relevant bits should be factored out after logfs is merged.
	 */

	/* Is the page fully inside i_size? */
	if (bix < end_index)
		return __logfs_writepage(page);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (bix > end_index || offset == 0) {
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the page size. For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __logfs_writepage(page);
}
|
||||
|
||||
/* ->invalidatepage: the page is about to be removed from the page cache.
 * NOTE(review): move_page_to_btree() presumably migrates the page's private
 * block state into a btree so it survives the page — confirm against
 * readwrite.c.  Afterwards the page must carry no private state. */
static void logfs_invalidatepage(struct page *page, unsigned long offset)
{
	move_page_to_btree(page);
	BUG_ON(PagePrivate(page) || page->private);
}
|
||||
|
||||
/* ->releasepage: returning 0 tells the VM the page's private state cannot be
 * freed, so the page stays put. */
static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
{
	return 0; /* None of these are easy to release */
}
|
||||
|
||||
|
||||
/*
 * ioctl handler: supports the generic flag interface only.
 * FS_IOC_GETFLAGS copies the user-visible inode flags to userspace;
 * FS_IOC_SETFLAGS replaces the user-modifiable flags (owner or capable
 * callers only, read-write mounts only).  Anything else is -ENOTTY.
 */
int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct logfs_inode *li = logfs_inode(inode);
	unsigned int oldflags, flags;
	int err;

	switch (cmd) {
	case FS_IOC_GETFLAGS:
		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
		return put_user(flags, (int __user *)arg);
	case FS_IOC_SETFLAGS:
		if (IS_RDONLY(inode))
			return -EROFS;

		if (!is_owner_or_cap(inode))
			return -EACCES;

		err = get_user(flags, (int __user *)arg);
		if (err)
			return err;

		/* Keep kernel-internal flag bits; replace only the
		 * user-modifiable ones. */
		mutex_lock(&inode->i_mutex);
		oldflags = li->li_flags;
		flags &= LOGFS_FL_USER_MODIFIABLE;
		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
		li->li_flags = flags;
		mutex_unlock(&inode->i_mutex);

		inode->i_ctime = CURRENT_TIME;
		mark_inode_dirty_sync(inode);
		return 0;

	default:
		return -ENOTTY;
	}
}
|
||||
|
||||
int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
|
||||
{
|
||||
struct super_block *sb = dentry->d_inode->i_sb;
|
||||
struct logfs_super *super = logfs_super(sb);
|
||||
|
||||
/* FIXME: write anchor */
|
||||
super->s_devops->sync(sb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * ->setattr: size changes are routed through logfs_truncate(); the remaining
 * attributes go through the generic inode_change_ok()/inode_setattr() pair.
 *
 * NOTE(review): the truncate is performed *before* inode_change_ok()
 * validates the request — confirm this ordering is intentional, since an
 * otherwise-invalid setattr can still shrink the file.
 */
static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;

	if (attr->ia_valid & ATTR_SIZE)
		err = logfs_truncate(inode, attr->ia_size);
	/* Size is handled above; don't let inode_setattr() repeat it. */
	attr->ia_valid &= ~ATTR_SIZE;

	if (!err)
		err = inode_change_ok(inode, attr);
	if (!err)
		err = inode_setattr(inode, attr);
	return err;
}
|
||||
|
||||
/* Inode operations for regular files. */
const struct inode_operations logfs_reg_iops = {
	.setattr	= logfs_setattr,
};
|
||||
|
||||
/* File operations for regular files; read/write paths use the generic
 * page-cache based helpers. */
const struct file_operations logfs_reg_fops = {
	.aio_read	= generic_file_aio_read,
	.aio_write	= generic_file_aio_write,
	.fsync		= logfs_fsync,
	.ioctl		= logfs_ioctl,
	.llseek		= generic_file_llseek,
	.mmap		= generic_file_readonly_mmap,
	.open		= generic_file_open,
	.read		= do_sync_read,
	.write		= do_sync_write,
};
|
||||
|
||||
/* Address-space operations shared by regular files, directories and
 * symlinks (see logfs_inode_setops()). */
const struct address_space_operations logfs_reg_aops = {
	.invalidatepage	= logfs_invalidatepage,
	.readpage	= logfs_readpage,
	.releasepage	= logfs_releasepage,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.writepage	= logfs_writepage,
	.writepages	= generic_writepages,
	.write_begin	= logfs_write_begin,
	.write_end	= logfs_write_end,
};
|
||||
+730
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,417 @@
|
||||
/*
|
||||
* fs/logfs/inode.c - inode handling code
|
||||
*
|
||||
* As should be obvious for Linux kernel code, license is GPLv2
|
||||
*
|
||||
* Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
|
||||
*/
|
||||
#include "logfs.h"
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/backing-dev.h>
|
||||
|
||||
/*
|
||||
* How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
|
||||
* on the medium. It therefore also lacks a method to store the previous
|
||||
* generation number for deleted inodes. Instead a single generation number
|
||||
* is stored which will be used for new inodes. Being just a 32bit counter,
|
||||
 * this can obviously wrap relatively quickly. So we only reuse inodes if we
|
||||
* know that a fair number of inodes can be created before we have to increment
|
||||
* the generation again - effectively adding some bits to the counter.
|
||||
* But being too aggressive here means we keep a very large and very sparse
|
||||
* inode file, wasting space on indirect blocks.
|
||||
* So what is a good value? Beats me. 64k seems moderately bad on both
|
||||
* fronts, so let's use that for now...
|
||||
*
|
||||
* NFS sucks, as everyone already knows.
|
||||
*/
|
||||
#define INOS_PER_WRAP (0x10000)
|
||||
|
||||
/*
|
||||
* Logfs' requirement to read inodes for garbage collection makes life a bit
|
||||
* harder. GC may have to read inodes that are in I_FREEING state, when they
|
||||
* are being written out - and waiting for GC to make progress, naturally.
|
||||
*
|
||||
* So we cannot just call iget() or some variant of it, but first have to check
|
||||
 * whether the inode in question might be in I_FREEING state. Therefore we
|
||||
* maintain our own per-sb list of "almost deleted" inodes and check against
|
||||
* that list first. Normally this should be at most 1-2 entries long.
|
||||
*
|
||||
* Also, inodes have logfs-specific reference counting on top of what the vfs
|
||||
* does. When .destroy_inode is called, normally the reference count will drop
|
||||
* to zero and the inode gets deleted. But if GC accessed the inode, its
|
||||
* refcount will remain nonzero and final deletion will have to wait.
|
||||
*
|
||||
* As a result we have two sets of functions to get/put inodes:
|
||||
* logfs_safe_iget/logfs_safe_iput - safe to call from GC context
|
||||
* logfs_iget/iput - normal version
|
||||
*/
|
||||
static struct kmem_cache *logfs_inode_cache;
|
||||
|
||||
static DEFINE_SPINLOCK(logfs_inode_lock);
|
||||
|
||||
static void logfs_inode_setops(struct inode *inode)
|
||||
{
|
||||
switch (inode->i_mode & S_IFMT) {
|
||||
case S_IFDIR:
|
||||
inode->i_op = &logfs_dir_iops;
|
||||
inode->i_fop = &logfs_dir_fops;
|
||||
inode->i_mapping->a_ops = &logfs_reg_aops;
|
||||
break;
|
||||
case S_IFREG:
|
||||
inode->i_op = &logfs_reg_iops;
|
||||
inode->i_fop = &logfs_reg_fops;
|
||||
inode->i_mapping->a_ops = &logfs_reg_aops;
|
||||
break;
|
||||
case S_IFLNK:
|
||||
inode->i_op = &logfs_symlink_iops;
|
||||
inode->i_mapping->a_ops = &logfs_reg_aops;
|
||||
break;
|
||||
case S_IFSOCK: /* fall through */
|
||||
case S_IFBLK: /* fall through */
|
||||
case S_IFCHR: /* fall through */
|
||||
case S_IFIFO:
|
||||
init_special_inode(inode, inode->i_mode, inode->i_rdev);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Look up or read an inode by number.  Returns the inode or an ERR_PTR.
 * A successfully read inode with i_nlink == 0 (possible when called from
 * the block validator) is treated like a read error: it is marked zombie,
 * removed from the cache via iget_failed() and -ENOENT is returned.
 */
static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
{
	struct inode *inode = iget_locked(sb, ino);
	int err;

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;	/* cache hit, already initialized */

	err = logfs_read_inode(inode);
	if (err || inode->i_nlink == 0) {
		/* inode->i_nlink == 0 can be true when called from
		 * block validator */
		/* set i_nlink to 0 to prevent caching */
		inode->i_nlink = 0;
		logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
		iget_failed(inode);
		if (!err)
			err = -ENOENT;
		return ERR_PTR(err);
	}

	logfs_inode_setops(inode);
	unlock_new_inode(inode);
	return inode;
}
|
||||
|
||||
/* Regular inode lookup.  The two reserved metadata inodes must never be
 * fetched through this path; use logfs_safe_iget() for those. */
struct inode *logfs_iget(struct super_block *sb, ino_t ino)
{
	BUG_ON(ino == LOGFS_INO_MASTER || ino == LOGFS_INO_SEGFILE);
	return __logfs_iget(sb, ino);
}
|
||||
|
||||
/*
 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
 * this allows logfs_iput to do the right thing later
 */
struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_inode *li;

	/* The two metadata inodes live outside the inode cache entirely. */
	if (ino == LOGFS_INO_MASTER)
		return super->s_master_inode;
	if (ino == LOGFS_INO_SEGFILE)
		return super->s_segfile_inode;

	/* Check the "almost deleted" list first: such inodes are in
	 * I_FREEING state and must not go through iget() again.  Bump the
	 * logfs-private refcount under logfs_inode_lock to keep the inode
	 * alive for the caller (GC). */
	spin_lock(&logfs_inode_lock);
	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
		if (li->vfs_inode.i_ino == ino) {
			li->li_refcount++;
			spin_unlock(&logfs_inode_lock);
			*is_cached = 1;
			return &li->vfs_inode;
		}
	spin_unlock(&logfs_inode_lock);

	*is_cached = 0;
	return __logfs_iget(sb, ino);
}
|
||||
|
||||
/* Final teardown of a logfs inode.  Called with logfs_inode_lock held (see
 * logfs_destroy_inode()) once the logfs-private refcount has dropped to
 * zero. */
static void __logfs_destroy_inode(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);

	BUG_ON(li->li_block);
	list_del(&li->li_freeing_list);
	kmem_cache_free(logfs_inode_cache, li);
}
|
||||
|
||||
/* ->destroy_inode: drop one logfs-private reference; the inode is only
 * really freed when GC holds no extra reference (see the comment block at
 * the top of this file). */
static void logfs_destroy_inode(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);

	/* logfs_drop_inode() must have moved us onto s_freeing_list. */
	BUG_ON(list_empty(&li->li_freeing_list));
	spin_lock(&logfs_inode_lock);
	li->li_refcount--;
	if (li->li_refcount == 0)
		__logfs_destroy_inode(inode);
	spin_unlock(&logfs_inode_lock);
}
|
||||
|
||||
void logfs_safe_iput(struct inode *inode, int is_cached)
|
||||
{
|
||||
if (inode->i_ino == LOGFS_INO_MASTER)
|
||||
return;
|
||||
if (inode->i_ino == LOGFS_INO_SEGFILE)
|
||||
return;
|
||||
|
||||
if (is_cached) {
|
||||
logfs_destroy_inode(inode);
|
||||
return;
|
||||
}
|
||||
|
||||
iput(inode);
|
||||
}
|
||||
|
||||
static void logfs_init_inode(struct super_block *sb, struct inode *inode)
|
||||
{
|
||||
struct logfs_inode *li = logfs_inode(inode);
|
||||
int i;
|
||||
|
||||
li->li_flags = 0;
|
||||
li->li_height = 0;
|
||||
li->li_used_bytes = 0;
|
||||
li->li_block = NULL;
|
||||
inode->i_uid = 0;
|
||||
inode->i_gid = 0;
|
||||
inode->i_size = 0;
|
||||
inode->i_blocks = 0;
|
||||
inode->i_ctime = CURRENT_TIME;
|
||||
inode->i_mtime = CURRENT_TIME;
|
||||
inode->i_nlink = 1;
|
||||
INIT_LIST_HEAD(&li->li_freeing_list);
|
||||
|
||||
for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
|
||||
li->li_data[i] = 0;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
static struct inode *logfs_alloc_inode(struct super_block *sb)
|
||||
{
|
||||
struct logfs_inode *li;
|
||||
|
||||
li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
|
||||
if (!li)
|
||||
return NULL;
|
||||
logfs_init_inode(sb, &li->vfs_inode);
|
||||
return &li->vfs_inode;
|
||||
}
|
||||
|
||||
/*
|
||||
* In logfs inodes are written to an inode file. The inode file, like any
|
||||
* other file, is managed with a inode. The inode file's inode, aka master
|
||||
* inode, requires special handling in several respects. First, it cannot be
|
||||
* written to the inode file, so it is stored in the journal instead.
|
||||
*
|
||||
* Secondly, this inode cannot be written back and destroyed before all other
|
||||
* inodes have been written. The ordering is important. Linux' VFS is happily
|
||||
* unaware of the ordering constraint and would ordinarily destroy the master
|
||||
* inode at umount time while other inodes are still in use and dirty. Not
|
||||
* good.
|
||||
*
|
||||
* So logfs makes sure the master inode is not written until all other inodes
|
||||
* have been destroyed. Sadly, this method has another side-effect. The VFS
|
||||
* will notice one remaining inode and print a frightening warning message.
|
||||
* Worse, it is impossible to judge whether such a warning was caused by the
|
||||
* master inode or any other inodes have leaked as well.
|
||||
*
|
||||
* Our attempt of solving this is with logfs_new_meta_inode() below. Its
|
||||
* purpose is to create a new inode that will not trigger the warning if such
|
||||
 * an inode is still in use. An ugly hack, no doubt. Suggestions for
|
||||
* improvement are welcome.
|
||||
*/
|
||||
/*
 * Create a meta inode (master inode, segfile inode) that is never inserted
 * into the VFS inode hash, so it does not trigger the "busy inodes" warning
 * at umount (see the comment block above).
 * Returns the inode or ERR_PTR(-ENOMEM).
 */
struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;

	inode = logfs_alloc_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_mode = S_IFREG;
	inode->i_ino = ino;
	inode->i_sb = sb;

	/* This is a blatant copy of alloc_inode code. We'd need alloc_inode
	 * to be nonstatic, alas. */
	{
		struct address_space * const mapping = &inode->i_data;

		mapping->a_ops = &logfs_reg_aops;
		mapping->host = inode;
		mapping->flags = 0;
		/* GFP_NOFS: allocations for this mapping may happen during
		 * writeout and must not recurse into the filesystem. */
		mapping_set_gfp_mask(mapping, GFP_NOFS);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;
		inode->i_mapping = mapping;
		inode->i_nlink = 1;
	}

	return inode;
}
|
||||
|
||||
/* Create a meta inode and populate it from the medium.  Returns the inode
 * or an ERR_PTR; on a read failure the half-built inode is torn down. */
struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode = logfs_new_meta_inode(sb, ino);
	int err;

	if (IS_ERR(inode))
		return inode;

	err = logfs_read_inode(inode);
	if (!err) {
		logfs_inode_setops(inode);
		return inode;
	}
	destroy_meta_inode(inode);
	return ERR_PTR(err);
}
|
||||
|
||||
/* ->write_inode: push the inode to the inode file.
 * NOTE(review): @do_sync is ignored; writes always go through the same
 * locked path — confirm that is intentional. */
static int logfs_write_inode(struct inode *inode, int do_sync)
{
	int ret;
	long flags = WF_LOCK;

	/* Can only happen if creat() failed. Safe to skip. */
	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
		return 0;

	ret = __logfs_write_inode(inode, flags);
	/* A failure here would leave the medium inconsistent. */
	LOGFS_BUG_ON(ret, inode->i_sb);
	return ret;
}
|
||||
|
||||
void destroy_meta_inode(struct inode *inode)
|
||||
{
|
||||
if (inode) {
|
||||
if (inode->i_data.nrpages)
|
||||
truncate_inode_pages(&inode->i_data, 0);
|
||||
logfs_clear_inode(inode);
|
||||
kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
|
||||
}
|
||||
}
|
||||
|
||||
/* called with inode_lock held */
static void logfs_drop_inode(struct inode *inode)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	struct logfs_inode *li = logfs_inode(inode);

	/* Track the inode on the per-sb freeing list so logfs_safe_iget()
	 * can still find it while it is in I_FREEING state. */
	spin_lock(&logfs_inode_lock);
	list_move(&li->li_freeing_list, &super->s_freeing_list);
	spin_unlock(&logfs_inode_lock);
	generic_drop_inode(inode);
}
|
||||
|
||||
/*
 * Pick an inode number and generation for a new inode.  Inode numbers come
 * from holes in the inode file; after INOS_PER_WRAP allocations the search
 * restarts from LOGFS_RESERVED_INOS and the generation counter is bumped,
 * so reused numbers get a new generation (see the INOS_PER_WRAP comment at
 * the top of this file).  Serialized by s_journal_mutex.
 */
static void logfs_set_ino_generation(struct super_block *sb,
		struct inode *inode)
{
	struct logfs_super *super = logfs_super(sb);
	u64 ino;

	mutex_lock(&super->s_journal_mutex);
	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
	super->s_last_ino = ino;
	super->s_inos_till_wrap--;
	if (super->s_inos_till_wrap < 0) {
		/* Wrap around: restart the search and invalidate old
		 * (ino, generation) pairs by bumping the generation. */
		super->s_last_ino = LOGFS_RESERVED_INOS;
		super->s_generation++;
		super->s_inos_till_wrap = INOS_PER_WRAP;
	}
	inode->i_ino = ino;
	inode->i_generation = super->s_generation;
	mutex_unlock(&super->s_journal_mutex);
}
|
||||
|
||||
/*
 * Allocate and initialize a brand-new inode in directory @dir with the
 * given @mode.  Inherits the inheritable logfs flags from the parent and
 * honours the parent's setgid bit.  Returns the hashed inode or
 * ERR_PTR(-ENOMEM).
 */
struct inode *logfs_new_inode(struct inode *dir, int mode)
{
	struct super_block *sb = dir->i_sb;
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	logfs_init_inode(sb, inode);

	/* inherit parent flags */
	logfs_inode(inode)->li_flags |=
		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;

	inode->i_mode = mode;
	logfs_set_ino_generation(sb, inode);

	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	if (dir->i_mode & S_ISGID) {
		/* setgid directory: new files get the directory's group,
		 * subdirectories also inherit the setgid bit. */
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			inode->i_mode |= S_ISGID;
	}

	logfs_inode_setops(inode);
	insert_inode_hash(inode);

	return inode;
}
|
||||
|
||||
static void logfs_init_once(void *_li)
|
||||
{
|
||||
struct logfs_inode *li = _li;
|
||||
int i;
|
||||
|
||||
li->li_flags = 0;
|
||||
li->li_used_bytes = 0;
|
||||
li->li_refcount = 1;
|
||||
for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
|
||||
li->li_data[i] = 0;
|
||||
inode_init_once(&li->vfs_inode);
|
||||
}
|
||||
|
||||
static int logfs_sync_fs(struct super_block *sb, int wait)
|
||||
{
|
||||
/* FIXME: write anchor */
|
||||
logfs_super(sb)->s_devops->sync(sb);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Super-block operations; note the custom drop_inode/destroy_inode pair
 * that implements the GC-safe inode lifetime described at the top of this
 * file. */
const struct super_operations logfs_super_operations = {
	.alloc_inode	= logfs_alloc_inode,
	.clear_inode	= logfs_clear_inode,
	.delete_inode	= logfs_delete_inode,
	.destroy_inode	= logfs_destroy_inode,
	.drop_inode	= logfs_drop_inode,
	.write_inode	= logfs_write_inode,
	.statfs		= logfs_statfs,
	.sync_fs	= logfs_sync_fs,
};
|
||||
|
||||
int logfs_init_inode_cache(void)
|
||||
{
|
||||
logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
|
||||
sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
|
||||
logfs_init_once);
|
||||
if (!logfs_inode_cache)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Destroy the inode slab cache at module unload; all inodes must have been
 * freed by then. */
void logfs_destroy_inode_cache(void)
{
	kmem_cache_destroy(logfs_inode_cache);
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,109 @@
|
||||
/* 128-bit-key wrappers around the generic btree code.  A key is a pair of
 * u64s packed into a u64[2] array and reinterpreted as unsigned longs for
 * the generic layer (btree_geo128 describes the layout). */
extern struct btree_geo btree_geo128;

struct btree_head128 { struct btree_head h; };

/* Initialize a tree whose nodes come from @mempool. */
static inline void btree_init_mempool128(struct btree_head128 *head,
		mempool_t *mempool)
{
	btree_init_mempool(&head->h, mempool);
}

/* Initialize a tree with default node allocation; returns 0 or -errno. */
static inline int btree_init128(struct btree_head128 *head)
{
	return btree_init(&head->h);
}

static inline void btree_destroy128(struct btree_head128 *head)
{
	btree_destroy(&head->h);
}

/* Look up the value stored under (k1, k2), or NULL. */
static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2)
{
	u64 key[2] = {k1, k2};
	return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key);
}
|
||||
|
||||
/* Find the entry preceding (*k1, *k2); on success *k1/*k2 are updated to
 * the found key.  Returns the value or NULL when no predecessor exists. */
static inline void *btree_get_prev128(struct btree_head128 *head,
		u64 *k1, u64 *k2)
{
	u64 key[2] = {*k1, *k2};
	void *val;

	val = btree_get_prev(&head->h, &btree_geo128,
			(unsigned long *)&key);
	*k1 = key[0];
	*k2 = key[1];
	return val;
}

/* Insert @val under (k1, k2); returns 0 or -errno. */
static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2,
		void *val, gfp_t gfp)
{
	u64 key[2] = {k1, k2};
	return btree_insert(&head->h, &btree_geo128,
			(unsigned long *)&key, val, gfp);
}

/* Replace the value stored under an existing key (k1, k2). */
static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2,
		void *val)
{
	u64 key[2] = {k1, k2};
	return btree_update(&head->h, &btree_geo128,
			(unsigned long *)&key, val);
}

/* Remove (k1, k2) and return the value that was stored there, or NULL. */
static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2)
{
	u64 key[2] = {k1, k2};
	return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key);
}
|
||||
|
||||
/* Return the value with the largest key, storing the key in *k1/*k2, or
 * NULL for an empty tree (in which case *k1/*k2 are left untouched). */
static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2)
{
	u64 key[2];
	void *val;

	val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]);
	if (val) {
		*k1 = key[0];
		*k2 = key[1];
	}

	return val;
}

/* Move all entries from @victim into @target; returns 0 or -errno. */
static inline int btree_merge128(struct btree_head128 *target,
		struct btree_head128 *victim,
		gfp_t gfp)
{
	return btree_merge(&target->h, &victim->h, &btree_geo128, gfp);
}

/* Adapter that unpacks the raw key into two u64s before calling the typed
 * visitor (visitor128_t) passed via @__func. */
void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
		size_t index, void *__func);

typedef void (*visitor128_t)(void *elem, unsigned long opaque,
		u64 key1, u64 key2, size_t index);

/* Visit every entry; returns the number of entries visited. */
static inline size_t btree_visitor128(struct btree_head128 *head,
		unsigned long opaque,
		visitor128_t func2)
{
	return btree_visitor(&head->h, &btree_geo128, opaque,
			visitor128, func2);
}

/* Like btree_visitor128(), but empties the tree while visiting. */
static inline size_t btree_grim_visitor128(struct btree_head128 *head,
		unsigned long opaque,
		visitor128_t func2)
{
	return btree_grim_visitor(&head->h, &btree_geo128, opaque,
			visitor128, func2);
}

/* Iterate from the largest key downwards; safe against removal of the
 * current entry. */
#define btree_for_each_safe128(head, k1, k2, val)	\
	for (val = btree_last128(head, &k1, &k2);	\
	     val;					\
	     val = btree_get_prev128(head, &k1, &k2))
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user