
compute current segment running checksum

This commit is contained in:
ad hoc
2024-07-30 11:29:06 +02:00
parent d6b9e496b2
commit 47ed09a2da
7 changed files with 177 additions and 33 deletions
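
The running checksum introduced here is a crc32 chain: a fresh segment seeds the chain with a random per-segment salt, and every frame written afterwards hashes its bytes with the previous frame's checksum as the initial crc32 state. A minimal standalone sketch of that chain, using crc32fast directly (the `running_checksum` helper and the plain byte-slice frames are illustrative, not the crate's types):

use crc32fast::Hasher;

// Chain a crc32 across frames, seeded with the segment salt, the way the
// current segment maintains its running checksum for each CheckedFrame.
fn running_checksum(salt: u32, frames: &[&[u8]]) -> u32 {
    let mut current = salt;
    for frame_bytes in frames {
        // each frame is hashed with the previous checksum as the initial state
        let mut digest = Hasher::new_with_initial(current);
        digest.update(frame_bytes); // frame header + page content
        current = digest.finalize();
    }
    current
}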

@ -8,6 +8,7 @@ use libsql_sys::ffi::Sqlite3DbHeader;
use parking_lot::{Condvar, Mutex};
use tokio::sync::{mpsc, Notify, Semaphore};
use tokio::task::JoinSet;
use rand::Rng;
use zerocopy::{AsBytes, FromZeroes};
use crate::checkpointer::CheckpointMessage;
@ -32,7 +33,7 @@ enum Slot<IO: Io> {
/// Wal Registry maintains a set of shared Wal, and their respective set of files.
pub struct WalRegistry<IO: Io, S> {
fs: IO,
io: IO,
path: PathBuf,
shutdown: AtomicBool,
opened: DashMap<NamespaceName, Slot<IO>>,
@ -59,7 +60,7 @@ impl<IO: Io, S> WalRegistry<IO, S> {
) -> Result<Self> {
io.create_dir_all(&path)?;
let registry = Self {
fs: io,
io,
path,
opened: Default::default(),
shutdown: Default::default(),
@ -105,14 +106,15 @@ where
.join(shared.namespace().as_str())
.join(format!("{}:{start_frame_no:020}.seg", shared.namespace()));
let segment_file = self.fs.open(true, true, true, &path)?;
let segment_file = self.io.open(true, true, true, &path)?;
let salt = self.io.with_rng(|rng| rng.gen());
let new = CurrentSegment::create(
segment_file,
path,
start_frame_no,
current.db_size(),
current.tail().clone(),
salt,
)?;
// sealing must be the last fallible operation, because we don't want to end up in a situation
// where the current log is sealed and it wasn't swapped.
@ -226,7 +228,7 @@ where
db_path: &Path,
) -> Result<Arc<SharedWal<IO>>> {
let path = self.path.join(namespace.as_str());
self.fs.create_dir_all(&path)?;
self.io.create_dir_all(&path)?;
// TODO: handle that with abstract io
let dir = walkdir::WalkDir::new(&path).sort_by_file_name().into_iter();
@ -246,7 +248,7 @@ where
continue;
}
let file = self.fs.open(false, true, true, entry.path())?;
let file = self.io.open(false, true, true, entry.path())?;
if let Some(sealed) =
SealedSegment::open(file.into(), entry.path().to_path_buf(), Default::default())?
@ -265,7 +267,7 @@ where
}
}
let db_file = self.fs.open(false, true, true, db_path)?;
let db_file = self.io.open(false, true, true, db_path)?;
let mut header: Sqlite3DbHeader = Sqlite3DbHeader::new_zeroed();
db_file.read_exact_at(header.as_bytes_mut(), 0)?;
@ -283,7 +285,8 @@ where
let current_path = path.join(format!("{namespace}:{next_frame_no:020}.seg"));
let segment_file = self.fs.open(true, true, true, &current_path)?;
let segment_file = self.io.open(true, true, true, &current_path)?;
let salt = self.io.with_rng(|rng| rng.gen());
let current = arc_swap::ArcSwap::new(Arc::new(CurrentSegment::create(
segment_file,
@ -291,6 +294,7 @@ where
next_frame_no,
db_size,
tail.into(),
salt,
)?));
let (new_frame_notifier, _) = tokio::sync::watch::channel(next_frame_no.get() - 1);
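
The registry now draws the per-segment salt through its Io abstraction (`self.io.with_rng(|rng| rng.gen())`) and threads it into `CurrentSegment::create`, so every new segment starts its checksum chain from that seed. Outside the abstraction, drawing the salt amounts to the sketch below (using `rand::thread_rng` directly, which the real code avoids, presumably so the rng can be substituted through `Io`):

use rand::Rng;

// Illustrative only: a per-segment salt is just a random u32 that seeds the
// segment's running checksum; libsql routes this through Io::with_rng.
fn new_segment_salt() -> u32 {
    rand::thread_rng().gen()
}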

@ -54,7 +54,7 @@ impl<'a, IO: Io> Injector<'a, IO> {
self.max_tx_frame_no = 0;
}
let buffer = current
.insert_frames(buffer, commit_data, &mut self.tx)
.insert_frames_inject(buffer, commit_data, &mut self.tx)
.await?;
self.buffer = buffer;

@ -1,7 +1,9 @@
use std::hash::Hasher;
use std::io::{BufWriter, IoSlice, Write};
use std::num::NonZeroU64;
use std::ops::DerefMut;
use std::path::PathBuf;
use std::sync::atomic::AtomicU32;
use std::sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
Arc,
@ -16,9 +18,11 @@ use zerocopy::{AsBytes, FromZeroes};
use crate::io::buf::{IoBufMut, ZeroCopyBoxIoBuf, ZeroCopyBuf};
use crate::io::file::FileExt;
use crate::segment::SegmentFlags;
use crate::io::Inspect;
use crate::segment::{checked_frame_offset, SegmentFlags};
use crate::segment::{frame_offset, page_offset, sealed::SealedSegment};
use crate::transaction::{Transaction, TxGuard};
use crate::LIBSQL_MAGIC;
use super::list::SegmentList;
use super::{Frame, FrameHeader, SegmentHeader};
@ -34,6 +38,8 @@ pub struct CurrentSegment<F> {
/// lock
read_locks: Arc<AtomicU64>,
sealed: AtomicBool,
/// current running checksum
current_checksum: AtomicU32,
tail: Arc<SegmentList<SealedSegment<F>>>,
}
@ -46,6 +52,7 @@ impl<F> CurrentSegment<F> {
start_frame_no: NonZeroU64,
db_size: u32,
tail: Arc<SegmentList<SealedSegment<F>>>,
salt: u32,
) -> Result<Self>
where
F: FileExt,
@ -60,6 +67,7 @@ impl<F> CurrentSegment<F> {
flags: 0.into(),
magic: LIBSQL_MAGIC.into(),
version: 1.into(),
salt: salt.into(),
};
header.recompute_checksum();
@ -74,6 +82,7 @@ impl<F> CurrentSegment<F> {
read_locks: Arc::new(AtomicU64::new(0)),
sealed: AtomicBool::default(),
tail,
current_checksum: salt.into(),
})
}
@ -102,10 +111,14 @@ impl<F> CurrentSegment<F> {
self.header.lock().size_after.get()
}
pub fn current_checksum(&self) -> u32 {
self.current_checksum.load(Ordering::Relaxed)
}
/// Insert a bunch of frames into the Wal. The frames needn't be ordered; therefore, on commit,
/// the last frame_no needs to be passed alongside the new size_after.
#[tracing::instrument(skip_all)]
pub async fn insert_frames(
pub async fn insert_frames_inject(
&self,
frames: Vec<Box<Frame>>,
// (size_after, last_frame_no)
@ -116,19 +129,40 @@ impl<F> CurrentSegment<F> {
F: FileExt,
{
assert!(!self.sealed.load(Ordering::SeqCst));
assert_eq!(
tx.savepoints.len(),
1,
"injecting wal should not use savepoints"
);
{
let tx = tx.deref_mut();
// let mut commit_frame_written = false;
let current_savepoint = tx.savepoints.last_mut().expect("no savepoints initialized");
let mut frames = frame_list_to_option(frames);
// For each frame, we compute and write the frame checksum, followed by the frame
// itself, so that the segment body forms an array of CheckedFrame
for i in 0..frames.len() {
let offset = tx.next_offset;
let current_checksum = current_savepoint.current_checksum;
let mut digest = crc32fast::Hasher::new_with_initial(current_checksum);
digest.write(frames[i].as_ref().unwrap().as_bytes());
let new_checksum = digest.finalize();
let (_buf, ret) = self
.file
.write_all_at_async(
ZeroCopyBuf::new_init(zerocopy::byteorder::little_endian::U32::new(
new_checksum,
)),
checked_frame_offset(offset),
)
.await;
ret?;
let buf = ZeroCopyBoxIoBuf::new(frames[i].take().unwrap());
let (buf, ret) = self
.file
.write_all_at_async(buf, frame_offset(offset))
.await;
ret?;
let frame = buf.into_inner();
@ -136,6 +170,7 @@ impl<F> CurrentSegment<F> {
current_savepoint
.index
.insert(frame.header().page_no(), offset);
current_savepoint.current_checksum = new_checksum;
tx.next_offset += 1;
frames[i] = Some(frame);
}
@ -161,6 +196,8 @@ impl<F> CurrentSegment<F> {
tx.merge_savepoints(&self.index);
// set the header last, so that a transaction does not witness a write before
// it's actually committed.
self.current_checksum
.store(tx.current_checksum(), Ordering::Relaxed);
*self.header.lock() = header.into_inner();
tx.is_commited = true;
@ -194,6 +231,11 @@ impl<F> CurrentSegment<F> {
if let Some(offset) = current_savepoint.index.get(&page_no) {
tracing::trace!(page_no, "recycling frame");
self.file.write_all_at(page, page_offset(*offset))?;
// we overwrote a frame; record the lowest overwritten offset so the running checksums can be recomputed later
tx.recompute_checksum = Some(tx
.recompute_checksum
.map(|old| old.min(*offset))
.unwrap_or(*offset));
continue;
}
@ -210,17 +252,31 @@ impl<F> CurrentSegment<F> {
size_after: size_after.into(),
frame_no: frame_no.into(),
};
let slices = &[IoSlice::new(header.as_bytes()), IoSlice::new(&page)];
let mut digest =
crc32fast::Hasher::new_with_initial(current_savepoint.current_checksum);
digest.write(header.as_bytes());
digest.write(page);
let checksum = digest.finalize();
let checksum_bytes = checksum.to_le_bytes();
// We write an instance of a CheckedFrame
let slices = &[
IoSlice::new(&checksum_bytes),
IoSlice::new(header.as_bytes()),
IoSlice::new(&page),
];
let offset = tx.next_offset;
debug_assert_eq!(
self.header.lock().start_frame_no.get() + offset as u64,
frame_no
);
self.file.write_at_vectored(slices, frame_offset(offset))?;
self.file.write_at_vectored(slices, checked_frame_offset(offset))?;
assert!(
current_savepoint.index.insert(page_no, offset).is_none(),
"existing frames should be recycled"
);
current_savepoint.current_checksum = checksum;
tx.next_frame_no += 1;
tx.next_offset += 1;
}
@ -228,6 +284,9 @@ impl<F> CurrentSegment<F> {
if let Some(size_after) = size_after {
if tx.not_empty() {
if let Some(_offset) = tx.recompute_checksum {
todo!("recompute checksum");
}
let last_frame_no = tx.next_frame_no - 1;
let mut header = { *self.header.lock() };
header.last_commited_frame_no = last_frame_no.into();
@ -240,6 +299,8 @@ impl<F> CurrentSegment<F> {
// set the header last, so that a transaction does not witness a write before
// it's actually committed.
*self.header.lock() = header;
self.current_checksum
.store(tx.current_checksum(), Ordering::Relaxed);
tx.is_commited = true;
@ -281,7 +342,7 @@ impl<F> CurrentSegment<F> {
F: FileExt,
B: IoBufMut + Send + 'static,
{
let byte_offset = frame_offset(offset);
let byte_offset = dbg!(frame_offset(dbg!(offset)));
self.file.read_exact_at_async(buf, byte_offset).await
}
@ -304,10 +365,21 @@ impl<F> CurrentSegment<F> {
{
let mut header = self.header.lock();
let index_offset = header.count_committed() as u32;
let index_byte_offset = frame_offset(index_offset);
let index_byte_offset = checked_frame_offset(index_offset);
let mut cursor = self.file.cursor(index_byte_offset);
let mut writer = BufWriter::new(&mut cursor);
let writer = BufWriter::new(&mut cursor);
let current = self.current_checksum();
let mut digest = crc32fast::Hasher::new_with_initial(current);
let mut writer = Inspect::new(writer, |data: &[u8]| {
digest.write(data);
});
self.index.merge_all(&mut writer)?;
let mut writer = writer.into_inner();
let index_checksum = digest.finalize();
let index_size = writer.get_ref().count();
writer.write_all(&index_checksum.to_le_bytes())?;
writer.into_inner().map_err(|e| e.into_parts().0)?;
// we perform a first sync to ensure that all of the segment has been flushed to disk. We then
// write the header and flush again. We want to guarantee that if we find a segment marked
@ -318,10 +390,10 @@ impl<F> CurrentSegment<F> {
self.file.sync_all()?;
header.index_offset = index_byte_offset.into();
header.index_size = cursor.count().into();
header.recompute_checksum();
header.index_size = index_size.into();
let flags = header.flags();
header.set_flags(flags | SegmentFlags::SEALED);
header.recompute_checksum();
self.file.write_all_at(header.as_bytes(), 0)?;
// flush the header.
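
The seal path above now computes the index checksum as a side effect of writing the index: the BufWriter is wrapped in the crate's `Inspect` helper, which feeds every written byte into a crc32 digest seeded with the segment's running checksum, and the finalized value is appended right after the index. A self-contained sketch of that hash-while-writing pattern (`InspectWriter` is a stand-in for the crate's `Inspect`, not its actual API):

use std::io::{self, Write};

// Forward every write to the inner writer while feeding the same bytes to a
// crc32 digest, so the checksum of the serialized index is available as soon
// as the index has been written.
struct InspectWriter<W> {
    inner: W,
    digest: crc32fast::Hasher,
}

impl<W: Write> InspectWriter<W> {
    fn new(inner: W, seed: u32) -> Self {
        Self { inner, digest: crc32fast::Hasher::new_with_initial(seed) }
    }

    fn finalize(self) -> (W, u32) {
        (self.inner, self.digest.finalize())
    }
}

impl<W: Write> Write for InspectWriter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.inner.write(buf)?;
        self.digest.update(&buf[..n]);
        Ok(n)
    }

    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}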

@ -8,18 +8,20 @@
//! head segment at the moment it was created.
#![allow(dead_code)]
use std::future::Future;
use std::hash::Hasher as _;
use std::io;
use std::mem::offset_of;
use std::mem::size_of;
use std::num::NonZeroU64;
use std::sync::Arc;
use zerocopy::byteorder::little_endian::{U32, U64};
use zerocopy::byteorder::little_endian::{U16, U32, U64};
use zerocopy::AsBytes;
use crate::error::{Error, Result};
use crate::io::buf::IoBufMut;
use crate::io::FileExt;
use crate::LIBSQL_MAGIC;
pub(crate) mod compacted;
pub mod current;
@ -32,6 +34,7 @@ bitflags::bitflags! {
/// This is true for a segment created by a primary, but a replica may insert frames in any
/// order, as long as commit boundaries are preserved.
const FRAME_UNORDERED = 1 << 0;
/// The segment is sealed. If this flag is set, then the segment must not be written to anymore.
const SEALED = 1 << 1;
}
}
@ -49,21 +52,21 @@ pub struct SegmentHeader {
pub size_after: U32,
/// byte offset of the index. If 0, then the index wasn't written, and must be recovered.
/// If non-0, the segment is sealed, and must not be written to anymore
/// the index is followed by its checksum
pub index_offset: U64,
pub index_size: U64,
/// checksum of the header fields, excluding the checksum itself. This field must be the last
pub header_cheksum: U64,
pub flags: U32,
/// salt for the segment checksum
pub salt: U32,
/// checksum of the header fields, excluding the checksum itself. This field must be the last
pub header_cheksum: U32,
}
impl SegmentHeader {
fn checksum(&self) -> u64 {
fn checksum(&self) -> u32 {
let field_bytes: &[u8] = &self.as_bytes()[..offset_of!(SegmentHeader, header_cheksum)];
let checksum = field_bytes
.iter()
.map(|x| *x as u64)
.reduce(|a, b| a ^ b)
.unwrap_or(0);
let checksum = crc32fast::hash(field_bytes);
checksum
}
@ -231,6 +234,28 @@ impl FrameHeader {
pub fn set_size_after(&mut self, size_after: u32) {
self.size_after = size_after.into();
}
pub fn is_commit(&self) -> bool {
self.size_after() != 0
}
}
/// A frame with a running checksum prepended.
/// `checksum` is computed by seeding a crc32 with the previous frame's checksum and hashing the frame
/// data (header and page content). The first frame is seeded with the segment header salt.
#[repr(C)]
#[derive(Debug, zerocopy::AsBytes, zerocopy::FromBytes, zerocopy::FromZeroes)]
pub struct CheckedFrame {
checksum: U32,
// frame should always be the last field
frame: Frame,
}
impl CheckedFrame {
pub(crate) const fn offset_of_frame() -> usize {
offset_of!(Self, frame)
}
}
#[repr(C)]
@ -241,6 +266,12 @@ pub struct Frame {
}
impl Frame {
pub(crate) fn checksum(&self, previous_checksum: u32) -> u32 {
let mut digest = crc32fast::Hasher::new_with_initial(previous_checksum);
digest.write(self.as_bytes());
digest.finalize()
}
pub fn data(&self) -> &[u8] {
&self.data
}
@ -259,10 +290,37 @@ impl Frame {
}
}
/// offset of a CheckedFrame in a current or sealed segment
#[inline]
fn checked_frame_offset(offset: u32) -> u64 {
(size_of::<SegmentHeader>() + (offset as usize) * size_of::<CheckedFrame>()) as u64
}
/// offset of a Frame in a current or sealed segment.
#[inline]
fn frame_offset(offset: u32) -> u64 {
(size_of::<SegmentHeader>() + (offset as usize) * size_of::<Frame>()) as u64
checked_frame_offset(offset) + CheckedFrame::offset_of_frame() as u64
}
/// offset of a frame's page in a current or sealed segment.
#[inline]
fn page_offset(offset: u32) -> u64 {
frame_offset(offset) + size_of::<FrameHeader>() as u64
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn offsets() {
assert_eq!(checked_frame_offset(0) as usize, size_of::<SegmentHeader>());
assert_eq!(
frame_offset(0) as usize,
size_of::<SegmentHeader>() + CheckedFrame::offset_of_frame()
);
assert_eq!(
page_offset(0) as usize,
size_of::<SegmentHeader>() + CheckedFrame::offset_of_frame() + size_of::<FrameHeader>()
);
}
}
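
With the CheckedFrame layout above (a little-endian crc32 followed by the frame), a reader that knows the segment salt can re-derive the whole chain and compare it against the stored values, which is presumably what a recovery pass over an unsealed segment would rely on. A standalone sketch of that check, over raw frame bytes rather than the crate's Frame type (the `verify_chain` helper is illustrative only):

use crc32fast::Hasher;

// Walk the CheckedFrames in order: each stored checksum must equal the crc32
// of the frame bytes seeded with the previous frame's checksum (or the segment
// salt for the first frame).
fn verify_chain(salt: u32, checked_frames: &[(u32, Vec<u8>)]) -> bool {
    let mut current = salt;
    for (stored, frame_bytes) in checked_frames {
        let mut digest = Hasher::new_with_initial(current);
        digest.update(frame_bytes);
        current = digest.finalize();
        if current != *stored {
            return false;
        }
    }
    true
}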

@ -191,10 +191,6 @@ impl<F: FileExt> SealedSegment<F> {
let index_offset = header.index_offset.get();
let index_len = header.index_size.get();
if !header.flags().contains(SegmentFlags::SEALED) {
todo!("recover");
}
if header.is_empty() {
std::fs::remove_file(path)?;
return Ok(None);

@ -164,18 +164,22 @@ impl<IO: Io> SharedWal<IO> {
let next_offset = current.count_committed() as u32;
let next_frame_no = current.next_frame_no().get();
*tx_id_lock = Some(read_tx.id);
let current_checksum = current.current_checksum();
Ok(WriteTransaction {
wal_lock: self.wal_lock.clone(),
savepoints: vec![Savepoint {
current_checksum,
next_offset,
next_frame_no,
index: BTreeMap::new(),
}],
next_frame_no,
next_offset,
current_checksum,
is_commited: false,
read_tx: read_tx.clone(),
recompute_checksum: None,
})
}

@ -104,6 +104,7 @@ impl<F> Drop for ReadTransaction<F> {
pub struct Savepoint {
pub next_offset: u32,
pub next_frame_no: u64,
pub current_checksum: u32,
pub index: BTreeMap<u32, u32>,
}
@ -125,8 +126,12 @@ pub struct WriteTransaction<F> {
pub savepoints: Vec<Savepoint>,
pub next_frame_no: u64,
pub next_offset: u32,
pub current_checksum: u32,
pub is_commited: bool,
pub read_tx: ReadTransaction<F>,
/// If the transaction overwrote frames, then the running checksum needs to be recomputed.
/// We store here the lowest segment offset at which a frame was overwritten.
pub recompute_checksum: Option<u32>,
}
pub struct TxGuard<'a, F> {
@ -160,6 +165,7 @@ impl<F> WriteTransaction<F> {
next_offset: self.next_offset,
next_frame_no: self.next_frame_no,
index: BTreeMap::new(),
current_checksum: self.current_checksum,
});
savepoint_id
}
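
Each savepoint snapshots `current_checksum` alongside `next_offset` and `next_frame_no`; the point, presumably, is that rolling back to a savepoint must also roll the running checksum back to what it was when the savepoint was taken. A simplified illustration with stand-in types (not the crate's Savepoint/WriteTransaction API):

// Stand-ins for illustration: the last savepoint always holds the
// transaction's current view of the running checksum, so discarding later
// savepoints restores the earlier checksum along with the earlier offsets.
struct DemoSavepoint {
    next_offset: u32,
    next_frame_no: u64,
    current_checksum: u32,
}

struct DemoWriteTx {
    savepoints: Vec<DemoSavepoint>,
}

impl DemoWriteTx {
    fn current_checksum(&self) -> u32 {
        self.savepoints.last().expect("at least one savepoint").current_checksum
    }

    fn rollback_to(&mut self, savepoint_index: usize) {
        // dropping the later savepoints drops their checksum state too
        self.savepoints.truncate(savepoint_index + 1);
    }
}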
@ -264,6 +270,10 @@ impl<F> WriteTransaction<F> {
pub(crate) fn commit(&mut self) {
self.is_commited = true;
}
pub(crate) fn current_checksum(&self) -> u32 {
self.savepoints.last().unwrap().current_checksum
}
}
impl<F> Deref for WriteTransaction<F> {
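
One piece is deliberately left open: when a transaction overwrites a frame, `recompute_checksum` records the lowest overwritten offset, but the commit path still hits `todo!("recompute checksum")`. What such a pass has to do follows from the layout: re-derive the crc32 chain from that offset to the end of the segment and rewrite each stored per-frame checksum. A purely illustrative sketch under explicit assumptions (std Unix file I/O instead of the crate's Io abstraction, caller-supplied layout constants instead of the real SegmentHeader/CheckedFrame sizes, and the frame assumed to start 4 bytes into each CheckedFrame, per `CheckedFrame::offset_of_frame`):

use std::fs::File;
use std::os::unix::fs::FileExt as _; // read_exact_at / write_all_at (Unix only)

// Recompute the running checksum from `first_offset` onwards and rewrite the
// stored per-frame checksums; returns the new running checksum of the segment.
fn recompute_from(
    file: &File,
    segment_header_size: u64, // assumed layout constant
    checked_frame_size: u64,  // assumed layout constant (4-byte checksum + frame)
    frame_size: usize,        // assumed layout constant
    mut current: u32,         // running checksum up to, but not including, first_offset
    first_offset: u32,
    frame_count: u32,
) -> std::io::Result<u32> {
    let mut frame = vec![0u8; frame_size];
    for offset in first_offset..frame_count {
        let checked_off = segment_header_size + offset as u64 * checked_frame_size;
        // the frame bytes sit right after the 4-byte little-endian checksum
        file.read_exact_at(&mut frame, checked_off + 4)?;
        let mut digest = crc32fast::Hasher::new_with_initial(current);
        digest.update(&frame);
        current = digest.finalize();
        file.write_all_at(&current.to_le_bytes(), checked_off)?;
    }
    Ok(current)
}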