journalctl: find boot ID more gracefully in corrupted journal

In discover_next_boot(), first we find a new boot ID based on the value
stored in the entry object. Then, find the tail (or head when we are going
upwards) entry of the boot based on the _BOOT_ID= field data.

If boot IDs of an entry in the entry object and _BOOT_ID field data
are inconsistent, which may happen on corrupted journal, then previously
discover_next_boot() failed with -ENODATA.

This makes the function check if the two boot IDs in each entry are
consistent, and skip the entry if not.

Fixes the failure of `journalctl -b -1` for 'truncated' journal:
https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951
This commit is contained in:
Yu Watanabe
2023-09-28 06:04:53 +09:00
parent b503c76689
commit 185e201690
2 changed files with 100 additions and 29 deletions

View File

@@ -1836,7 +1836,7 @@ static int discover_next_boot(
bool advance_older,
BootId *ret) {
BootId boot;
_cleanup_set_free_ Set *broken_ids = NULL;
int r;
assert(j);
@@ -1853,7 +1853,10 @@ static int discover_next_boot(
* we can actually advance to a *different* boot. */
sd_journal_flush_matches(j);
do {
for (;;) {
sd_id128_t *id_dup;
BootId boot;
r = sd_journal_step_one(j, !advance_older);
if (r < 0)
return r;
@@ -1874,38 +1877,106 @@ static int discover_next_boot(
* speed things up, but let's not trust that it is complete, and hence, manually advance as
* necessary. */
} while (sd_id128_equal(boot.id, previous_boot_id));
if (!sd_id128_is_null(previous_boot_id) && sd_id128_equal(boot.id, previous_boot_id))
continue;
r = sd_journal_get_realtime_usec(j, &boot.first_usec);
if (r < 0)
return r;
if (set_contains(broken_ids, &boot.id))
continue;
/* Now seek to the last occurrence of this boot ID. */
r = add_match_boot_id(j, boot.id);
if (r < 0)
return r;
/* Yay, we found a new boot ID from the entry object. Let's check there exist corresponding
* entries matching with the _BOOT_ID= data. */
if (advance_older)
r = sd_journal_seek_head(j);
else
r = sd_journal_seek_tail(j);
if (r < 0)
return r;
r = add_match_boot_id(j, boot.id);
if (r < 0)
return r;
r = sd_journal_step_one(j, advance_older);
if (r < 0)
return r;
if (r == 0)
return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
"Whoopsie! We found a boot ID but can't read its last entry."); /* This shouldn't happen. We just came from this very boot ID. */
/* First, seek to the first (or the last when we are going upwards) occurrence of this boot ID.
* You may think this is redundant. Yes, that's redundant unless the journal is corrupted.
* But when the journal is corrupted, especially, badly 'truncated', then the below may fail.
* See https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951. */
if (advance_older)
r = sd_journal_seek_tail(j);
else
r = sd_journal_seek_head(j);
if (r < 0)
return r;
r = sd_journal_get_realtime_usec(j, &boot.last_usec);
if (r < 0)
return r;
r = sd_journal_step_one(j, 0);
if (r < 0)
return r;
if (r == 0) {
log_debug("Whoopsie! We found a boot ID %s but can't read its first entry. "
"The journal seems to be corrupted. Ignoring the boot ID.",
SD_ID128_TO_STRING(boot.id));
goto try_again;
}
sd_journal_flush_matches(j);
*ret = boot;
return 1;
r = sd_journal_get_realtime_usec(j, &boot.first_usec);
if (r < 0)
return r;
/* Next, seek to the last occurrence of this boot ID. */
if (advance_older)
r = sd_journal_seek_head(j);
else
r = sd_journal_seek_tail(j);
if (r < 0)
return r;
r = sd_journal_step_one(j, 0);
if (r < 0)
return r;
if (r == 0) {
log_debug("Whoopsie! We found a boot ID %s but can't read its last entry. "
"The journal seems to be corrupted. Ignoring the boot ID.",
SD_ID128_TO_STRING(boot.id));
goto try_again;
}
r = sd_journal_get_realtime_usec(j, &boot.last_usec);
if (r < 0)
return r;
sd_journal_flush_matches(j);
*ret = boot;
return 1;
try_again:
/* Save the bad boot ID. */
id_dup = newdup(sd_id128_t, &boot.id, 1);
if (!id_dup)
return -ENOMEM;
r = set_ensure_consume(&broken_ids, &id128_hash_ops_free, id_dup);
if (r < 0)
return r;
/* Move to the previous position again. */
sd_journal_flush_matches(j);
if (!sd_id128_is_null(previous_boot_id)) {
r = add_match_boot_id(j, previous_boot_id);
if (r < 0)
return r;
}
if (advance_older)
r = sd_journal_seek_head(j);
else
r = sd_journal_seek_tail(j);
if (r < 0)
return r;
r = sd_journal_step_one(j, 0);
if (r < 0)
return r;
if (r == 0)
return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
"Whoopsie! Cannot seek to the last entry of boot %s.",
SD_ID128_TO_STRING(previous_boot_id));
sd_journal_flush_matches(j);
}
}
int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id) {

View File

@@ -233,6 +233,6 @@ done < <(find /test-journals/no-rtc -name "*.zst")
journalctl --directory="$JOURNAL_DIR" --list-boots --output=json >/tmp/lb1
diff -u /tmp/lb1 - <<'EOF'
[{"index":-3,"boot_id":"5ea5fc4f82a14186b5332a788ef9435e","first_entry":1666569600994371,"last_entry":1666584266223608},{"index":-2,"boot_id":"bea6864f21ad4c9594c04a99d89948b0","first_entry":1666584266731785,"last_entry":1666584347230411},{"index":-1,"boot_id":"4c708e1fd0744336be16f3931aa861fb","first_entry":1666584348378271,"last_entry":1666584354649355},{"index":0,"boot_id":"35e8501129134edd9df5267c49f744a4","first_entry":1666584356661527,"last_entry":1666584438086856}]
[{"index":-3,"boot_id":"5ea5fc4f82a14186b5332a788ef9435e","first_entry":1666569600994371,"last_entry":1666584266223608},{"index":-2,"boot_id":"bea6864f21ad4c9594c04a99d89948b0","first_entry":1666569601005945,"last_entry":1666584347230411},{"index":-1,"boot_id":"4c708e1fd0744336be16f3931aa861fb","first_entry":1666569601017222,"last_entry":1666584354649355},{"index":0,"boot_id":"35e8501129134edd9df5267c49f744a4","first_entry":1666569601009823,"last_entry":1666584438086856}]
EOF
rm -rf "$JOURNAL_DIR" /tmp/lb1