Merge pull request #22998 from DaanDeMeyer/journal-compact-split

journal: Add compact mode
This commit is contained in:
Luca Boccassi
2022-10-07 16:19:03 +02:00
committed by GitHub
20 changed files with 701 additions and 430 deletions

View File

@@ -468,3 +468,10 @@ SYSTEMD_HOME_DEBUG_SUFFIX=foo \
when kernel-install is invoked. This can be useful if kernel-install is invoked
unconditionally as a child process by another tool, such as package managers
running kernel-install in a postinstall script.
`systemd-journald`:
* `$SYSTEMD_JOURNAL_COMPACT` - Takes a boolean. If enabled, journal files are written
in a more compact format that reduces the amount of disk space required by the
journal. Note that journal files in compact mode are limited to 4G to allow use of
32-bit offsets. Enabled by default.

View File

@@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054
## Basics
* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format.
* All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format.
* Offsets are always relative to the beginning of the file.
* The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one uses as lower 32bit part.
* All structures are aligned to 64bit boundaries and padded to multiples of 64bit
@@ -177,6 +177,9 @@ _packed_ struct Header {
/* Added in 246 */
le64_t data_hash_chain_depth;
le64_t field_hash_chain_depth;
/* Added in 252 */
le32_t tail_entry_array_offset;
le32_t tail_entry_array_n_entries;
};
```
@@ -231,6 +234,8 @@ became too frequent.
Similarly, **field_hash_chain_depth** is a counter of the deepest chain in the
field hash table, minus one.
**tail_entry_array_offset** and **tail_entry_array_n_entries** allow immediate
access to the last entry array in the global entry array chain.
## Extensibility
@@ -259,6 +264,7 @@ enum {
HEADER_INCOMPATIBLE_COMPRESSED_LZ4 = 1 << 1,
HEADER_INCOMPATIBLE_KEYED_HASH = 1 << 2,
HEADER_INCOMPATIBLE_COMPRESSED_ZSTD = 1 << 3,
HEADER_INCOMPATIBLE_COMPACT = 1 << 4,
};
enum {
@@ -276,6 +282,9 @@ HEADER_INCOMPATIBLE_KEYED_HASH indicates that instead of the unkeyed Jenkins
hash function the keyed siphash24 hash function is used for the two hash
tables, see below.
HEADER_INCOMPATIBLE_COMPACT indicates that the journal file uses the new binary
format that uses less space on disk compared to the original format.
HEADER_COMPATIBLE_SEALED indicates that the file includes TAG objects required
for Forward Secure Sealing.
@@ -393,7 +402,16 @@ _packed_ struct DataObject {
le64_t entry_offset; /* the first array entry we store inline */
le64_t entry_array_offset;
le64_t n_entries;
uint8_t payload[];
union {
struct {
uint8_t payload[];
} regular;
struct {
le32_t tail_entry_array_offset;
le32_t tail_entry_array_n_entries;
uint8_t payload[];
} compact;
};
};
```
@@ -426,6 +444,9 @@ OBJECT_COMPRESSED_XZ/OBJECT_COMPRESSED_LZ4/OBJECT_COMPRESSED_ZSTD is set in the
`ObjectHeader`, in which case the payload is compressed with the indicated
compression algorithm.
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, two extra fields are stored to
allow immediate access to the tail entry array in the DATA object's entry array
chain.
## Field Objects
@@ -457,11 +478,6 @@ field name. It is the head of a singly linked list using DATA's
## Entry Objects
```
_packed_ struct EntryItem {
le64_t object_offset;
le64_t hash;
};
_packed_ struct EntryObject {
ObjectHeader object;
le64_t seqnum;
@@ -469,7 +485,15 @@ _packed_ struct EntryObject {
le64_t monotonic;
sd_id128_t boot_id;
le64_t xor_hash;
EntryItem items[];
union {
struct {
le64_t object_offset;
le64_t hash;
} regular[];
struct {
le32_t object_offset;
} compact[];
} items;
};
```
@@ -495,6 +519,10 @@ The **items[]** array contains references to all DATA objects of this entry,
plus their respective hashes (which are calculated the same way as in the DATA
objects, i.e. keyed by the file ID).
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, DATA object offsets are stored
as 32-bit integers instead of 64-bit, and the unused per-data-object hash field is
not stored anymore.
In the file ENTRY objects are written ordered monotonically by sequence
number. For continuous parts of the file written during the same boot
(i.e. with the same boot_id) the monotonic timestamp is monotonic too. Modulo
@@ -548,7 +576,10 @@ creativity rather than runtime parameters.
_packed_ struct EntryArrayObject {
ObjectHeader object;
le64_t next_entry_array_offset;
le64_t items[];
union {
le64_t regular[];
le32_t compact[];
} items;
};
```
@@ -556,6 +587,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry
arrays are strictly sorted by offsets on disk, and hence by their timestamps
and sequence numbers (with some restrictions, see above).
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, offsets are stored as 32-bit
integers instead of 64-bit.
Entry Arrays are chained up. If one entry array is full another one is
allocated and the **next_entry_array_offset** field of the old one pointed to
it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the

View File

@@ -854,6 +854,17 @@
cryptographic theory it is based on.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--convert=</option></term>
<listitem><para>Converts the specified journal files to the latest supported journal format. Takes
the path to store the converted journal files. The path should include the filename to be used for
the converted files, with the <literal>.journal</literal> extension (e.g.
<filename>/a/b/c/converted.journal</filename> will store the journal files in the
<filename>/a/b/c</filename> directory using <filename>converted.journal</filename> as the filename).
</para></listitem>
</varlistentry>
<xi:include href="standard-options.xml" xpointer="help" />
<xi:include href="standard-options.xml" xpointer="version" />
</variablelist>

View File

@@ -2293,11 +2293,13 @@ public_programs += executable(
install : true)
if get_option('link-journalctl-shared')
journalctl_link_with = [libshared]
journalctl_link_with = [libshared,
libjournal_core]
else
journalctl_link_with = [libsystemd_static,
libshared_static,
libbasic_gcrypt]
libbasic_gcrypt,
libjournal_core]
endif
public_programs += executable(

View File

@@ -44,6 +44,7 @@
#include "locale-util.h"
#include "log.h"
#include "logs-show.h"
#include "managed-journal-file.h"
#include "memory-util.h"
#include "mkdir.h"
#include "mount-util.h"
@@ -128,6 +129,7 @@ static uint64_t arg_vacuum_size = 0;
static uint64_t arg_vacuum_n_files = 0;
static usec_t arg_vacuum_time = 0;
static char **arg_output_fields = NULL;
static const char *arg_convert = NULL;
static const char *arg_pattern = NULL;
static pcre2_code *arg_compiled_pattern = NULL;
static PatternCompileCase arg_case = PATTERN_COMPILE_CASE_AUTO;
@@ -162,6 +164,7 @@ static enum {
ACTION_ROTATE_AND_VACUUM,
ACTION_LIST_FIELDS,
ACTION_LIST_FIELD_NAMES,
ACTION_CONVERT,
} arg_action = ACTION_SHOW;
typedef struct BootId {
@@ -387,6 +390,7 @@ static int help(void) {
" --dump-catalog Show entries in the message catalog\n"
" --update-catalog Update the message catalog database\n"
" --setup-keys Generate a new FSS key pair\n"
" --convert=PATH Convert the journal to the latest journal format\n"
"\nSee the %2$s for details.\n",
program_invocation_short_name,
link,
@@ -441,6 +445,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_NO_HOSTNAME,
ARG_OUTPUT_FIELDS,
ARG_NAMESPACE,
ARG_CONVERT,
};
static const struct option options[] = {
@@ -508,6 +513,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "no-hostname", no_argument, NULL, ARG_NO_HOSTNAME },
{ "output-fields", required_argument, NULL, ARG_OUTPUT_FIELDS },
{ "namespace", required_argument, NULL, ARG_NAMESPACE },
{ "convert", required_argument, NULL, ARG_CONVERT },
{}
};
@@ -1034,6 +1040,11 @@ static int parse_argv(int argc, char *argv[]) {
break;
}
case ARG_CONVERT:
arg_action = ACTION_CONVERT;
arg_convert = optarg;
break;
case '?':
return -EINVAL;
@@ -2093,6 +2104,52 @@ static int wait_for_change(sd_journal *j, int poll_fd) {
return 0;
}
/* Re-encode every entry reachable through the open journal context 'j' into a
 * freshly created journal file at arg_convert, so the output is written in the
 * latest supported on-disk format (including compact mode, when enabled).
 * Returns 0 on success, a negative errno-style value on failure. */
static int journal_convert(sd_journal *j) {
_cleanup_(managed_journal_file_closep) ManagedJournalFile *to = NULL;
_cleanup_(mmap_cache_unrefp) MMapCache *mmap = NULL;
int r;
assert(arg_convert);
mmap = mmap_cache_new();
if (!mmap)
return -ENOMEM;
/* -1 metrics: use the library defaults for all size/usage limits. */
r = managed_journal_file_open(-1, arg_convert, O_RDWR | O_CREAT, JOURNAL_COMPRESS, 0640, UINT64_MAX,
&(JournalMetrics) { -1, -1, -1, -1, -1, -1 }, mmap, NULL, NULL, &to);
if (r < 0)
return log_error_errno(r, "Failed to open journal: %m");
SD_JOURNAL_FOREACH(j) {
Object *o;
JournalFile *from;
/* The iterator positions current_file/current_offset on the entry to copy. */
from = j->current_file;
assert(from && from->current_offset > 0);
r = journal_file_move_to_object(from, OBJECT_ENTRY, from->current_offset, &o);
if (r < 0)
return log_error_errno(r, "Can't read entry: %m");
r = journal_file_copy_entry(from, to->file, o, from->current_offset);
if (r >= 0)
continue;
/* Copy failed: rotate the destination and retry once if the error is
 * one that rotation can plausibly cure (full/corrupted/archived file). */
if (!journal_shall_try_append_again(to->file, r))
return log_error_errno(r, "Can't write entry: %m");
r = managed_journal_file_rotate(&to, mmap, JOURNAL_COMPRESS, UINT64_MAX, NULL);
if (r < 0)
return r;
r = journal_file_copy_entry(from, to->file, o, from->current_offset);
if (r < 0)
return log_error_errno(r, "Can't write entry: %m");
}
return 0;
}
int main(int argc, char *argv[]) {
_cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
_cleanup_(umount_and_rmdir_and_freep) char *unlink_dir = NULL;
@@ -2203,6 +2260,7 @@ int main(int argc, char *argv[]) {
case ACTION_ROTATE_AND_VACUUM:
case ACTION_LIST_FIELDS:
case ACTION_LIST_FIELD_NAMES:
case ACTION_CONVERT:
/* These ones require access to the journal files, continue below. */
break;
@@ -2357,6 +2415,10 @@ int main(int argc, char *argv[]) {
case ACTION_LIST_FIELDS:
break;
case ACTION_CONVERT:
r = journal_convert(j);
goto finish;
default:
assert_not_reached();
}

View File

@@ -30,6 +30,7 @@
#include "io-util.h"
#include "journal-authenticate.h"
#include "journal-internal.h"
#include "journal-util.h"
#include "journal-vacuum.h"
#include "journald-audit.h"
#include "journald-context.h"
@@ -769,55 +770,6 @@ static void server_cache_hostname(Server *s) {
free_and_replace(s->hostname_field, x);
}
/* Decide whether a failed journal append with error 'r' is worth retrying
 * after rotating to a fresh file. Returns true for errors that a new file can
 * cure (limits hit, corruption, archived/foreign file), false for errors that
 * would simply recur (e.g. missing file system features). Logs the reason.
 * NOTE(review): in this PR this copy is removed from journald-server.c and
 * replaced by journal_shall_try_append_again() in journal-util.c. */
static bool shall_try_append_again(JournalFile *f, int r) {
switch (r) {
case -E2BIG: /* Hit configured limit */
case -EFBIG: /* Hit fs limit */
case -EDQUOT: /* Quota limit hit */
case -ENOSPC: /* Disk full */
log_debug("%s: Allocation limit reached, rotating.", f->path);
return true;
case -EIO: /* I/O error of some kind (mmap) */
log_warning("%s: IO error, rotating.", f->path);
return true;
case -EHOSTDOWN: /* Other machine */
log_info("%s: Journal file from other machine, rotating.", f->path);
return true;
case -EBUSY: /* Unclean shutdown */
log_info("%s: Unclean shutdown, rotating.", f->path);
return true;
case -EPROTONOSUPPORT: /* Unsupported feature */
log_info("%s: Unsupported feature, rotating.", f->path);
return true;
case -EBADMSG: /* Corrupted */
case -ENODATA: /* Truncated */
case -ESHUTDOWN: /* Already archived */
log_warning("%s: Journal file corrupted, rotating.", f->path);
return true;
case -EIDRM: /* Journal file has been deleted */
log_warning("%s: Journal file has been deleted, rotating.", f->path);
return true;
case -ETXTBSY: /* Journal file is from the future */
log_warning("%s: Journal file is from the future, rotating.", f->path);
return true;
case -EAFNOSUPPORT:
log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path);
return false;
default:
return false;
}
}
static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) {
bool vacuumed = false, rotate = false;
struct dual_timestamp ts;
@@ -872,7 +824,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n
return;
}
if (vacuumed || !shall_try_append_again(f->file, r)) {
if (vacuumed || !journal_shall_try_append_again(f->file, r)) {
log_ratelimit_full_errno(LOG_ERR, r, "Failed to write entry (%zu items, %zu bytes), ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
return;
}
@@ -1202,7 +1154,7 @@ int server_flush_to_var(Server *s, bool require_flag_file) {
if (r >= 0)
continue;
if (!shall_try_append_again(s->system_journal->file, r)) {
if (!journal_shall_try_append_again(s->system_journal->file, r)) {
log_error_errno(r, "Can't write entry: %m");
goto finish;
}

View File

@@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
if (r < 0)
return r;
n_items += journal_file_entry_array_n_items(&o);
n_items += journal_file_entry_array_n_items(f, &o);
p = q;
}
@@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
return 0;
offset = p + offsetof(Object, entry_array.items) +
(journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t);
(journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
sz = p + le64toh(o.object.size) - offset;
if (sz < MINIMUM_HOLE_SIZE)

View File

@@ -13,7 +13,7 @@
#include "path-util.h"
#include "string-util.h"
int main(int argc, char *argv[]) {
static void test_journal_flush(int argc, char *argv[]) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
_cleanup_free_ char *fn = NULL;
char dn[] = "/var/tmp/test-journal-flush.XXXXXX";
@@ -70,6 +70,14 @@ int main(int argc, char *argv[]) {
unlink(fn);
assert_se(rmdir(dn) == 0);
}
/* Run the flush test twice: once with the regular on-disk format and once
 * with compact mode, selected via the $SYSTEMD_JOURNAL_COMPACT env var. */
int main(int argc, char *argv[]) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
test_journal_flush(argc, argv);
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
test_journal_flush(argc, argv);
return 0;
}

View File

@@ -210,7 +210,7 @@ TEST(skip) {
test_skip_one(setup_interleaved);
}
TEST(sequence_numbers) {
static void test_sequence_numbers_one(void) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
char t[] = "/var/tmp/journal-seq-XXXXXX";
ManagedJournalFile *one, *two;
@@ -295,6 +295,14 @@ TEST(sequence_numbers) {
}
}
/* Exercise the sequence-number test with both the regular and the compact
 * journal file format, toggled via $SYSTEMD_JOURNAL_COMPACT. */
TEST(sequence_numbers) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
test_sequence_numbers_one();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
test_sequence_numbers_one();
}
static int intro(void) {
/* managed_journal_file_open requires a valid machine id */
if (access("/etc/machine-id", F_OK) != 0)

View File

@@ -184,12 +184,19 @@ int main(int argc, char *argv[]) {
test_setup_logging(LOG_DEBUG);
/* Run this test twice. Once with old hashing and once with new hashing */
assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0);
run_test();
/* Run this test multiple times with different configurations of features. */
assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "0", 1) >= 0);
run_test();
assert_se(setenv("SYSTEMD_JOURNAL_KEYED_HASH", "1", 1) >= 0);
run_test();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
run_test();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
run_test();
return 0;
}

View File

@@ -56,7 +56,7 @@ static int raw_verify(const char *fn, const char *verification_key) {
return r;
}
int main(int argc, char *argv[]) {
static int run_test(int argc, char *argv[]) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
char t[] = "/var/tmp/journal-XXXXXX";
unsigned n;
@@ -141,3 +141,13 @@ int main(int argc, char *argv[]) {
return 0;
}
/* Run the verification test with both the regular and the compact journal
 * format. NOTE(review): run_test()'s return value is discarded here, so a
 * non-assert failure inside it would not fail the test binary — confirm
 * whether that is intentional. */
int main(int argc, char *argv[]) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
run_test(argc, argv);
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
run_test(argc, argv);
return 0;
}

View File

@@ -23,7 +23,7 @@ static void mkdtemp_chdir_chattr(char *path) {
(void) chattr_path(path, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
}
TEST(non_empty) {
static void test_non_empty_one(void) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
dual_timestamp ts;
ManagedJournalFile *f;
@@ -118,7 +118,15 @@ TEST(non_empty) {
puts("------------------------------------------------------------");
}
TEST(empty) {
/* Run the non-empty-file test under both journal formats (regular/compact). */
TEST(non_empty) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
test_non_empty_one();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
test_non_empty_one();
}
static void test_empty_one(void) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
ManagedJournalFile *f1, *f2, *f3, *f4;
char t[] = "/var/tmp/journal-XXXXXX";
@@ -158,6 +166,14 @@ TEST(empty) {
(void) managed_journal_file_close(f4);
}
/* Run the empty-file test under both journal formats (regular/compact). */
TEST(empty) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
test_empty_one();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
test_empty_one();
}
#if HAVE_COMPRESSION
static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) {
_cleanup_(mmap_cache_unrefp) MMapCache *m = NULL;
@@ -222,7 +238,7 @@ static bool check_compressed(uint64_t compress_threshold, uint64_t data_size) {
return is_compressed;
}
TEST(min_compress_size) {
static void test_min_compress_size_one(void) {
/* Note that XZ will actually fail to compress anything under 80 bytes, so you have to choose the limits
* carefully */
@@ -241,6 +257,14 @@ TEST(min_compress_size) {
assert_se(check_compressed(256, 256));
assert_se(!check_compressed(256, 255));
}
/* Run the compression-threshold test under both journal formats. */
TEST(min_compress_size) {
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "0", 1) >= 0);
test_min_compress_size_one();
assert_se(setenv("SYSTEMD_JOURNAL_COMPACT", "1", 1) >= 0);
test_min_compress_size_one();
}
#endif
static int intro(void) {

View File

@@ -248,7 +248,7 @@ int journal_file_hmac_put_object(JournalFile *f, ObjectType type, Object *o, uin
case OBJECT_DATA:
/* All but hash and payload are mutable */
gcry_md_write(f->hmac, &o->data.hash, sizeof(o->data.hash));
gcry_md_write(f->hmac, o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
gcry_md_write(f->hmac, journal_file_data_payload_field(f, o), le64toh(o->object.size) - journal_file_data_payload_offset(f));
break;
case OBJECT_FIELD:

View File

@@ -24,7 +24,6 @@ typedef struct HashTableObject HashTableObject;
typedef struct EntryArrayObject EntryArrayObject;
typedef struct TagObject TagObject;
typedef struct EntryItem EntryItem;
typedef struct HashItem HashItem;
typedef struct FSSHeader FSSHeader;
@@ -66,8 +65,17 @@ struct ObjectHeader {
le64_t entry_offset; /* the first array entry we store inline */ \
le64_t entry_array_offset; \
le64_t n_entries; \
uint8_t payload[]; \
}
union { \
struct { \
uint8_t payload[0]; \
} regular; \
struct { \
le32_t tail_entry_array_offset; \
le32_t tail_entry_array_n_entries; \
uint8_t payload[0]; \
} compact; \
}; \
}
struct DataObject DataObject__contents;
struct DataObject__packed DataObject__contents _packed_;
@@ -85,20 +93,23 @@ struct FieldObject FieldObject__contents;
struct FieldObject__packed FieldObject__contents _packed_;
assert_cc(sizeof(struct FieldObject) == sizeof(struct FieldObject__packed));
struct EntryItem {
le64_t object_offset;
le64_t hash;
} _packed_;
#define EntryObject__contents { \
ObjectHeader object; \
le64_t seqnum; \
le64_t realtime; \
le64_t monotonic; \
sd_id128_t boot_id; \
le64_t xor_hash; \
EntryItem items[]; \
}
#define EntryObject__contents { \
ObjectHeader object; \
le64_t seqnum; \
le64_t realtime; \
le64_t monotonic; \
sd_id128_t boot_id; \
le64_t xor_hash; \
union { \
struct { \
le64_t object_offset; \
le64_t hash; \
} regular[0]; \
struct { \
le32_t object_offset; \
} compact[0]; \
} items; \
}
struct EntryObject EntryObject__contents;
struct EntryObject__packed EntryObject__contents _packed_;
@@ -117,7 +128,10 @@ struct HashTableObject {
struct EntryArrayObject {
ObjectHeader object;
le64_t next_entry_array_offset;
le64_t items[];
union {
le64_t regular[0];
le32_t compact[0];
} items;
} _packed_;
#define TAG_LENGTH (256/8)
@@ -152,19 +166,22 @@ enum {
HEADER_INCOMPATIBLE_COMPRESSED_LZ4 = 1 << 1,
HEADER_INCOMPATIBLE_KEYED_HASH = 1 << 2,
HEADER_INCOMPATIBLE_COMPRESSED_ZSTD = 1 << 3,
HEADER_INCOMPATIBLE_COMPACT = 1 << 4,
};
#define HEADER_INCOMPATIBLE_ANY \
(HEADER_INCOMPATIBLE_COMPRESSED_XZ | \
HEADER_INCOMPATIBLE_COMPRESSED_LZ4 | \
HEADER_INCOMPATIBLE_KEYED_HASH | \
HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
HEADER_INCOMPATIBLE_COMPRESSED_ZSTD | \
HEADER_INCOMPATIBLE_COMPACT)
#define HEADER_INCOMPATIBLE_SUPPORTED \
((HAVE_XZ ? HEADER_INCOMPATIBLE_COMPRESSED_XZ : 0) | \
(HAVE_LZ4 ? HEADER_INCOMPATIBLE_COMPRESSED_LZ4 : 0) | \
(HAVE_ZSTD ? HEADER_INCOMPATIBLE_COMPRESSED_ZSTD : 0) | \
HEADER_INCOMPATIBLE_KEYED_HASH)
HEADER_INCOMPATIBLE_KEYED_HASH | \
HEADER_INCOMPATIBLE_COMPACT)
enum {
HEADER_COMPATIBLE_SEALED = 1 << 0,
@@ -214,12 +231,15 @@ enum {
/* Added in 246 */ \
le64_t data_hash_chain_depth; \
le64_t field_hash_chain_depth; \
/* Added in 252 */ \
le32_t tail_entry_array_offset; \
le32_t tail_entry_array_n_entries; \
}
struct Header struct_Header__contents;
struct Header__packed struct_Header__contents _packed_;
assert_cc(sizeof(struct Header) == sizeof(struct Header__packed));
assert_cc(sizeof(struct Header) == 256);
assert_cc(sizeof(struct Header) == 264);
#define FSS_HEADER_SIGNATURE \
((const char[]) { 'K', 'S', 'H', 'H', 'R', 'H', 'L', 'P' })

File diff suppressed because it is too large Load Diff

View File

@@ -127,6 +127,11 @@ typedef enum JournalFileFlags {
JOURNAL_SEAL = 1 << 1,
} JournalFileFlags;
typedef struct {
uint64_t object_offset;
uint64_t hash;
} EntryItem;
int journal_file_open(
int fd,
const char *fname,
@@ -184,14 +189,64 @@ static inline bool VALID_EPOCH(uint64_t u) {
#define JOURNAL_HEADER_KEYED_HASH(h) \
FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_KEYED_HASH)
#define JOURNAL_HEADER_COMPACT(h) \
FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPACT)
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret);
int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);
uint64_t journal_file_entry_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
/* Return the object offset of the i-th item of ENTRY object 'o', reading the
 * 32-bit compact or 64-bit regular layout depending on the file's
 * HEADER_INCOMPATIBLE_COMPACT flag. 'i' must be a valid item index. */
static inline uint64_t journal_file_entry_item_object_offset(JournalFile *f, Object *o, size_t i) {
assert(f);
assert(o);
return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry.items.compact[i].object_offset) :
le64toh(o->entry.items.regular[i].object_offset);
}
/* Size in bytes of one ENTRY item in file 'f': the compact layout (32-bit
 * offset, no hash) or the regular layout (64-bit offset + 64-bit hash). */
static inline size_t journal_file_entry_item_size(JournalFile *f) {
assert(f);
return JOURNAL_HEADER_COMPACT(f->header) ? sizeof_field(Object, entry.items.compact[0]) :
sizeof_field(Object, entry.items.regular[0]);
}
uint64_t journal_file_entry_n_items(JournalFile *f, Object *o) _pure_;
int journal_file_data_payload(
JournalFile *f,
Object *o,
uint64_t offset,
const char *field,
size_t field_length,
size_t data_threshold,
void **ret_data,
size_t *ret_size);
/* Offset of the DATA object payload within Object, which differs between the
 * compact layout (two extra tail-entry-array fields precede it) and the
 * regular layout. */
static inline size_t journal_file_data_payload_offset(JournalFile *f) {
return JOURNAL_HEADER_COMPACT(f->header)
? offsetof(Object, data.compact.payload)
: offsetof(Object, data.regular.payload);
}
/* Pointer to the payload bytes of DATA object 'o', honoring the file's
 * compact vs. regular layout. */
static inline uint8_t* journal_file_data_payload_field(JournalFile *f, Object *o) {
return JOURNAL_HEADER_COMPACT(f->header) ? o->data.compact.payload : o->data.regular.payload;
}
uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;
/* Return the i-th offset stored in ENTRY_ARRAY object 'o', decoded from the
 * 32-bit compact or 64-bit regular representation as appropriate. */
static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {
assert(f);
assert(o);
return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry_array.items.compact[i]) :
le64toh(o->entry_array.items.regular[i]);
}
/* Size in bytes of one ENTRY_ARRAY item in file 'f' (4 in compact mode,
 * 8 otherwise). */
static inline size_t journal_file_entry_array_item_size(JournalFile *f) {
assert(f);
return JOURNAL_HEADER_COMPACT(f->header) ? sizeof(le32_t) : sizeof(le64_t);
}
uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);

View File

@@ -170,16 +170,16 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG;
}
if (le64toh(o->object.size) - offsetof(Object, data.payload) <= 0) {
if (le64toh(o->object.size) - journal_file_data_payload_offset(f) <= 0) {
error(offset, "Bad object size (<= %zu): %"PRIu64,
offsetof(Object, data.payload),
journal_file_data_payload_offset(f),
le64toh(o->object.size));
return -EBADMSG;
}
h1 = le64toh(o->data.hash);
r = hash_payload(f, o, offset, o->data.payload,
le64toh(o->object.size) - offsetof(Object, data.payload),
r = hash_payload(f, o, offset, journal_file_data_payload_field(f, o),
le64toh(o->object.size) - journal_file_data_payload_offset(f),
&h2);
if (r < 0)
return r;
@@ -240,7 +240,7 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
}
case OBJECT_ENTRY:
if ((le64toh(o->object.size) - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0) {
if ((le64toh(o->object.size) - offsetof(Object, entry.items)) % journal_file_entry_item_size(f) != 0) {
error(offset,
"Bad entry size (<= %zu): %"PRIu64,
offsetof(Object, entry.items),
@@ -248,10 +248,10 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG;
}
if ((le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0) {
if ((le64toh(o->object.size) - offsetof(Object, entry.items)) / journal_file_entry_item_size(f) <= 0) {
error(offset,
"Invalid number items in entry: %"PRIu64,
(le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem));
(le64toh(o->object.size) - offsetof(Object, entry.items)) / journal_file_entry_item_size(f));
return -EBADMSG;
}
@@ -276,13 +276,13 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG;
}
for (uint64_t i = 0; i < journal_file_entry_n_items(o); i++) {
if (le64toh(o->entry.items[i].object_offset) == 0 ||
!VALID64(le64toh(o->entry.items[i].object_offset))) {
for (uint64_t i = 0; i < journal_file_entry_n_items(f, o); i++) {
if (journal_file_entry_item_object_offset(f, o, i) == 0 ||
!VALID64(journal_file_entry_item_object_offset(f, o, i))) {
error(offset,
"Invalid entry item (%"PRIu64"/%"PRIu64") offset: "OFSfmt,
i, journal_file_entry_n_items(o),
le64toh(o->entry.items[i].object_offset));
i, journal_file_entry_n_items(f, o),
journal_file_entry_item_object_offset(f, o, i));
return -EBADMSG;
}
}
@@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
break;
case OBJECT_ENTRY_ARRAY:
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) {
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) {
error(offset,
"Invalid object entry array size: %"PRIu64,
le64toh(o->object.size));
@@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG;
}
for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++)
if (le64toh(o->entry_array.items[i]) != 0 &&
!VALID64(le64toh(o->entry_array.items[i]))) {
for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {
uint64_t q = journal_file_entry_array_item(f, o, i);
if (q != 0 && !VALID64(q)) {
error(offset,
"Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,
i, journal_file_entry_array_n_items(o),
le64toh(o->entry_array.items[i]));
i, journal_file_entry_array_n_items(f, o), q);
return -EBADMSG;
}
}
break;
@@ -490,10 +490,10 @@ static int verify_data(
return -EBADMSG;
}
m = journal_file_entry_array_n_items(o);
m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) {
q = le64toh(o->entry_array.items[j]);
q = journal_file_entry_array_item(f, o, j);
if (q <= last) {
error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);
return -EBADMSG;
@@ -646,12 +646,12 @@ static int verify_entry(
assert(o);
assert(cache_data_fd);
n = journal_file_entry_n_items(o);
n = journal_file_entry_n_items(f, o);
for (i = 0; i < n; i++) {
uint64_t q;
Object *u;
q = le64toh(o->entry.items[i].object_offset);
q = journal_file_entry_item_object_offset(f, o, i);
if (!contains_uint64(cache_data_fd, n_data, q)) {
error(p, "Invalid data object of entry");
@@ -737,11 +737,11 @@ static int verify_entry_array(
return -EBADMSG;
}
m = journal_file_entry_array_n_items(o);
m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) {
uint64_t p;
p = le64toh(o->entry_array.items[j]);
p = journal_file_entry_array_item(f, o, j);
if (p <= last) {
error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);
return -EBADMSG;

View File

@@ -2287,15 +2287,16 @@ _public_ int sd_journal_get_data(sd_journal *j, const char *field, const void **
field_length = strlen(field);
uint64_t n = journal_file_entry_n_items(o);
uint64_t n = journal_file_entry_n_items(f, o);
for (uint64_t i = 0; i < n; i++) {
Object *d;
uint64_t p, l;
size_t t;
Compression c;
uint64_t p;
void *d;
size_t l;
p = le64toh(o->entry.items[i].object_offset);
r = journal_file_move_to_object(f, OBJECT_DATA, p, &d);
p = journal_file_entry_item_object_offset(f, o, i);
r = journal_file_data_payload(f, NULL, p, field, field_length, j->data_threshold, &d, &l);
if (r == 0)
continue;
if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {
log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", i);
continue;
@@ -2303,117 +2304,15 @@ _public_ int sd_journal_get_data(sd_journal *j, const char *field, const void **
if (r < 0)
return r;
l = le64toh(d->object.size) - offsetof(Object, data.payload);
*data = d;
*size = l;
c = COMPRESSION_FROM_OBJECT(d);
if (c < 0)
return -EPROTONOSUPPORT;
if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
r = decompress_startswith(
c,
d->data.payload, l,
&f->compress_buffer,
field, field_length, '=');
if (r < 0)
log_debug_errno(r, "Cannot decompress %s object of length %"PRIu64" at offset "OFSfmt": %m",
compression_to_string(c), l, p);
else if (r > 0) {
size_t rsize;
r = decompress_blob(
c,
d->data.payload, l,
&f->compress_buffer, &rsize,
j->data_threshold);
if (r < 0)
return r;
*data = f->compress_buffer;
*size = (size_t) rsize;
return 0;
}
#else
return -EPROTONOSUPPORT;
#endif
} else if (l >= field_length+1 &&
memcmp(d->data.payload, field, field_length) == 0 &&
d->data.payload[field_length] == '=') {
t = (size_t) l;
if ((uint64_t) t != l)
return -E2BIG;
*data = d->data.payload;
*size = t;
return 0;
}
return 0;
}
return -ENOENT;
}
/* Hand the caller the (possibly decompressed) payload of DATA object 'o'.
 * On success stores a pointer (into the mmap'd file or into f->compress_buffer)
 * in *ret_data and its length in *ret_size; either output may be NULL.
 * Returns 0 on success, -EBADMSG for a truncated object, -E2BIG when the
 * payload does not fit size_t, -EPROTONOSUPPORT for unknown/unbuilt
 * compression, or a decompression error.
 * NOTE(review): this PR removes this helper in favor of
 * journal_file_data_payload(), which also understands the compact layout. */
static int return_data(
sd_journal *j,
JournalFile *f,
Object *o,
const void **ret_data,
size_t *ret_size) {
Compression c;
uint64_t l;
size_t t;
assert(j);
assert(f);
l = le64toh(READ_NOW(o->object.size));
if (l < offsetof(Object, data.payload))
return -EBADMSG;
l -= offsetof(Object, data.payload);
/* We can't read objects larger than 4G on a 32bit machine */
t = (size_t) l;
if ((uint64_t) t != l)
return -E2BIG;
c = COMPRESSION_FROM_OBJECT(o);
if (c < 0)
return -EPROTONOSUPPORT;
if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
size_t rsize;
int r;
/* Decompress into the per-file scratch buffer, capped at the
 * caller's data threshold. */
r = decompress_blob(
c,
o->data.payload, l,
&f->compress_buffer, &rsize,
j->data_threshold);
if (r < 0)
return r;
if (ret_data)
*ret_data = f->compress_buffer;
if (ret_size)
*ret_size = (size_t) rsize;
#else
return -EPROTONOSUPPORT;
#endif
} else {
/* Uncompressed: point straight at the mapped payload. */
if (ret_data)
*ret_data = o->data.payload;
if (ret_size)
*ret_size = t;
}
return 0;
}
_public_ int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t *size) {
JournalFile *f;
Object *o;
@@ -2435,25 +2334,23 @@ _public_ int sd_journal_enumerate_data(sd_journal *j, const void **data, size_t
if (r < 0)
return r;
for (uint64_t n = journal_file_entry_n_items(o); j->current_field < n; j->current_field++) {
for (uint64_t n = journal_file_entry_n_items(f, o); j->current_field < n; j->current_field++) {
uint64_t p;
void *d;
size_t l;
p = le64toh(o->entry.items[j->current_field].object_offset);
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
p = journal_file_entry_item_object_offset(f, o, j->current_field);
r = journal_file_data_payload(f, NULL, p, NULL, 0, j->data_threshold, &d, &l);
if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {
log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", j->current_field);
continue;
}
if (r < 0)
return r;
assert(r > 0);
r = return_data(j, f, o, data, size);
if (r == -EBADMSG) {
log_debug("Entry item %"PRIu64" data payload is bad, skipping over it.", j->current_field);
continue;
}
if (r < 0)
return r;
*data = d;
*size = l;
j->current_field++;
@@ -2925,7 +2822,7 @@ _public_ int sd_journal_enumerate_unique(
for (;;) {
JournalFile *of;
Object *o;
const void *odata;
void *odata;
size_t ol;
bool found;
int r;
@@ -2969,7 +2866,8 @@ _public_ int sd_journal_enumerate_unique(
j->unique_offset,
o->object.type, OBJECT_DATA);
r = return_data(j, j->unique_file, o, &odata, &ol);
r = journal_file_data_payload(j->unique_file, o, j->unique_offset, NULL, 0,
j->data_threshold, &odata, &ol);
if (r < 0)
return r;
@@ -3016,9 +2914,8 @@ _public_ int sd_journal_enumerate_unique(
if (found)
continue;
r = return_data(j, j->unique_file, o, ret_data, ret_size);
if (r < 0)
return r;
*ret_data = odata;
*ret_size = ol;
return 1;
}

View File

@@ -136,3 +136,52 @@ int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_use
return r;
}
/* Shared helper (moved here from journald-server.c so journalctl can use it
 * too): decide whether a failed append with error 'r' is worth retrying after
 * rotating to a fresh journal file. Returns true for errors rotation can cure
 * (limits hit, corruption, archived/foreign/deleted file), false otherwise.
 * Logs the reason at an appropriate level. */
bool journal_shall_try_append_again(JournalFile *f, int r) {
switch (r) {
case -E2BIG: /* Hit configured limit */
case -EFBIG: /* Hit fs limit */
case -EDQUOT: /* Quota limit hit */
case -ENOSPC: /* Disk full */
log_debug("%s: Allocation limit reached, rotating.", f->path);
return true;
case -EIO: /* I/O error of some kind (mmap) */
log_warning("%s: IO error, rotating.", f->path);
return true;
case -EHOSTDOWN: /* Other machine */
log_info("%s: Journal file from other machine, rotating.", f->path);
return true;
case -EBUSY: /* Unclean shutdown */
log_info("%s: Unclean shutdown, rotating.", f->path);
return true;
case -EPROTONOSUPPORT: /* Unsupported feature */
log_info("%s: Unsupported feature, rotating.", f->path);
return true;
case -EBADMSG: /* Corrupted */
case -ENODATA: /* Truncated */
case -ESHUTDOWN: /* Already archived */
log_warning("%s: Journal file corrupted, rotating.", f->path);
return true;
case -EIDRM: /* Journal file has been deleted */
log_warning("%s: Journal file has been deleted, rotating.", f->path);
return true;
case -ETXTBSY: /* Journal file is from the future */
log_warning("%s: Journal file is from the future, rotating.", f->path);
return true;
case -EAFNOSUPPORT:
log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path);
return false;
default:
return false;
}
}

View File

@@ -6,5 +6,9 @@
#include "sd-journal.h"
#include "journal-internal.h"
int journal_access_blocked(sd_journal *j);
int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users);
bool journal_shall_try_append_again(JournalFile *f, int r);