diff --git a/.gitignore b/.gitignore index ca739ab20..ae9a3bdcb 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ target/ .fleet/ perf.data* -.scratch \ No newline at end of file +.scratch + +.DS_Store \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index c12e87bba..d610a90b2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,9 +95,9 @@ checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arc-swap" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b3d0060af21e8d11a926981cc00c6c1541aa91dd64b9f881985c3da1094425f" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" [[package]] name = "argh" @@ -118,7 +118,7 @@ dependencies = [ "argh_shared", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -171,9 +171,9 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "backtrace" @@ -196,6 +196,12 @@ version = "0.21.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" +[[package]] +name = "base64" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9475866fec1451be56a3c2400fd081ff546538961565ccb5b7142cbd22bc7a51" + [[package]] name = "base64ct" version = "1.6.0" @@ -229,7 +235,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -274,6 +280,15 @@ dependencies = [ "serde", ] +[[package]] +name = "bytesize" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a3e368af43e418a04d52505cf3dbc23dda4e3407ae2fa99fd0e4f308ce546acc" +dependencies = [ + "serde", +] + [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" @@ -362,9 +377,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.3" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", "clap_derive", @@ -384,14 +399,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.3" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -412,6 +427,12 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + [[package]] name = "cpufeatures" version = "0.2.12" @@ -454,6 +475,16 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + [[package]] name = "crossbeam-epoch" version = "0.9.18" @@ -503,7 +534,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -576,7 +607,7 @@ checksum = 
"487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -589,6 +620,18 @@ dependencies = [ "signature", ] +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.8" @@ -629,7 +672,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff3c058b07bdb5414da10bc8a2489715e31b0c3f4274a213c1a23831e9d94e91" dependencies = [ "ahash", - "base64", + "base64 0.21.7", "bitflags 2.5.0", "crc32c", "everscale-crypto", @@ -651,7 +694,7 @@ checksum = "323d8b61c76be2c16eb2d72d007f1542fdeb3760fdf2e2cae219fc0da3db0c09" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -665,9 +708,19 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" + +[[package]] +name = "fdlimit" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e182f7dbc2ef73d9ef67351c5fbbea084729c48362d3ce9dd44c28e32e277fe5" +dependencies = [ + "libc", + "thiserror", +] [[package]] name = "fiat-crypto" @@ -689,7 +742,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -784,9 +837,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 
[[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -905,9 +958,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "minimal-lexical" @@ -966,6 +1019,15 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "ntapi" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" +dependencies = [ + "winapi", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -1086,7 +1148,7 @@ version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" dependencies = [ - "base64", + "base64 0.21.7", "serde", ] @@ -1121,7 +1183,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1165,9 +1227,9 @@ checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "platforms" -version = "3.3.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626dec3cac7cc0e1577a2ec3fc496277ec2baa084bebad95bb6fdbfae235f84c" +checksum = "db23d408679286588f4d4644f965003d056e3dd5abcaaa938116871d7ce2fee7" [[package]] name = "powerfmt" @@ -1183,12 +1245,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.16" +version = "0.2.17" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1226,6 +1288,18 @@ dependencies = [ "winapi", ] +[[package]] +name = "quick_cache" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1380629287ed1247c1e0fcc6d43efdcec508b65382c9ab775cc8f3df7ca07b0" +dependencies = [ + "ahash", + "equivalent", + "hashbrown", + "parking_lot", +] + [[package]] name = "quinn" version = "0.10.2" @@ -1321,6 +1395,26 @@ dependencies = [ "bitflags 2.5.0", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + [[package]] name = "rcgen" version = "0.11.3" @@ -1351,7 +1445,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata 0.4.6", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -1371,7 +1465,7 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -1382,9 +1476,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = 
"adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "ring" @@ -1416,6 +1510,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rlimit" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8" +dependencies = [ + "libc", +] + [[package]] name = "rocksdb" version = "0.21.0" @@ -1554,14 +1657,14 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", @@ -1594,6 +1697,15 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + [[package]] name = "signature" version = "2.2.0" @@ -1665,6 +1777,12 @@ dependencies = [ "der", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "strsim" version = "0.11.0" @@ -1690,9 +1808,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.53" +version = "2.0.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032" +checksum = 
"002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" dependencies = [ "proc-macro2", "quote", @@ -1711,6 +1829,21 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "sysinfo" +version = "0.30.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c385888ef380a852a16209afc8cfad22795dd8873d69c9a14d2e2088f118d18" +dependencies = [ + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -1746,7 +1879,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1817,9 +1950,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tl-proto" -version = "0.4.4" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3418163db528cc2324ed7bc9d52aa3ca7a8f73d685f8b21b319d2a08ee4b36d3" +checksum = "d4da430e55186abb18b4d1457a23eb0765af0dee66a9f741d652d6eaa476a8d7" dependencies = [ "bytes", "digest", @@ -1831,14 +1964,14 @@ dependencies = [ [[package]] name = "tl-proto-proc" -version = "0.4.3" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3f51de4865e0618b187c2b790c137de938d01fe5510116b959387b6133c20ac" +checksum = "5a3eaf6822a3ce34a40564dd3078a915d35c3c5fd1f6b3d81eab991e6d00a0fb" dependencies = [ "proc-macro2", "quote", "rustc-hash", - "syn 2.0.53", + "syn 2.0.55", "tl-scheme", ] @@ -1857,16 +1990,18 @@ dependencies = [ [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", "libc", "mio", "num_cpus", + "parking_lot", 
"pin-project-lite", + "signal-hook-registry", "socket2", "tokio-macros", "windows-sys 0.48.0", @@ -1880,7 +2015,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1928,7 +2063,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1998,6 +2133,10 @@ name = "triomphe" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" +dependencies = [ + "serde", + "stable_deref_trait", +] [[package]] name = "tycho-block-util" @@ -2060,7 +2199,7 @@ dependencies = [ "anyhow", "arc-swap", "argh", - "base64", + "base64 0.21.7", "bytes", "castaway", "dashmap", @@ -2110,7 +2249,40 @@ dependencies = [ name = "tycho-storage" version = "0.0.1" dependencies = [ + "anyhow", + "arc-swap", + "base64 0.22.0", + "bumpalo", + "bytes", + "bytesize", + "crc", + "dashmap", + "everscale-types", + "fdlimit", + "hex", + "humantime", + "libc", + "num-traits", + "parking_lot", + "parking_lot_core", + "quick_cache", + "rlimit", + "serde", + "serde_json", + "sha2", + "smallvec", + "sysinfo", + "tempfile", + "thiserror", + "tokio", + "tracing", + "tracing-appender", + "tracing-subscriber", + "tracing-test", + "triomphe", + "tycho-block-util", "tycho-util", + "weedb", ] [[package]] @@ -2123,10 +2295,12 @@ dependencies = [ "futures-util", "hex", "humantime", + "libc", "rand", "serde", "thiserror", "tokio", + "tracing", ] [[package]] @@ -2244,7 +2418,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", "wasm-bindgen-shared", ] @@ -2266,7 +2440,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] @@ -2330,6 +2504,25 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.4", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.4", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -2505,7 +2698,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -2516,9 +2709,9 @@ checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" [[package]] name = "zstd-sys" -version = "2.0.9+zstd.1.5.5" +version = "2.0.10+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e16efa8a874a0481a574084d34cc26fdb3b99627480f785888deb6386506656" +checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" dependencies = [ "cc", "pkg-config", diff --git a/core/Cargo.toml b/core/Cargo.toml index d9fab0eff..6c6638331 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -13,4 +13,4 @@ tycho-storage = { path = "../storage", version = "=0.0.1" } tycho-util = { path = "../util", version = "=0.0.1" } [lints] -workspace= true \ No newline at end of file +workspace= true diff --git a/storage/Cargo.toml b/storage/Cargo.toml index 6eaad11a3..d865f5b6f 100644 --- a/storage/Cargo.toml +++ b/storage/Cargo.toml @@ -2,13 +2,47 @@ name = "tycho-storage" version = "0.0.1" edition = "2021" -description = "A unified storage 
interface." + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -# crates.io deps +anyhow = "1.0.79" +arc-swap = "1.6.0" +bumpalo = "3.14.0" +bytes = "1.5.0" +bytesize = { version = "1.3.0", features = ["serde"] } +crc = "3.0.1" +dashmap = "5.5.3" +everscale-types = "0.1.0-rc.6" +fdlimit = "0.3.0" +hex = "0.4.3" +humantime = "2.1.0" +libc = "0.2.153" +num-traits = "0.2.18" +parking_lot = "0.12.1" +parking_lot_core = "0.9.9" +quick_cache = "0.4.1" +rlimit = "0.10.1" +serde = { version = "1.0.196", features = ["derive"] } +sha2 = "0.10.8" +smallvec = "1.13.1" +sysinfo = "0.30.5" +thiserror = "1.0.57" +tokio = { version = "1.36.0", features = ["full"] } +tracing = "0.1" +triomphe = "0.1.11" +weedb = "0.1.1" + +tycho-block-util = { path = "../block-util" } +tycho-util = { path = "../util" } -# local deps -tycho-util = { path = "../util", version = "=0.0.1" } +[dev-dependencies] +base64 = "0.22.0" +serde_json = "1.0.114" +tracing-appender = "0.2.3" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +tracing-test = "0.2" +tempfile = "3.10" [lints] workspace = true diff --git a/storage/src/db/file_db/mapped_file.rs b/storage/src/db/file_db/mapped_file.rs new file mode 100644 index 000000000..4594397aa --- /dev/null +++ b/storage/src/db/file_db/mapped_file.rs @@ -0,0 +1,98 @@ +use std::fs::File; +use std::os::fd::AsRawFd; +use std::path::Path; + +/// Memory buffer that is mapped to a file +pub struct MappedFile { + file: File, + length: usize, + ptr: *mut libc::c_void, +} + +impl MappedFile { + /// Opens a file and maps it to memory. Resizes the file to `length` bytes. 
+ pub fn new>(path: P, length: usize) -> std::io::Result { + let file = std::fs::OpenOptions::new() + .write(true) + .read(true) + .truncate(true) + .create(true) + .open(path)?; + + file.set_len(length as u64)?; + + Self::from_existing_file(file) + } + + /// Opens an existing file and maps it to memory + pub fn from_existing_file(file: File) -> std::io::Result { + let length = file.metadata()?.len() as usize; + + // SAFETY: File was opened successfully, file mode is RW, offset is aligned + let ptr = unsafe { + libc::mmap( + std::ptr::null_mut(), + length, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_SHARED, + file.as_raw_fd(), + 0, + ) + }; + + if ptr == libc::MAP_FAILED { + return Err(std::io::Error::last_os_error()); + } + + if unsafe { libc::madvise(ptr, length, libc::MADV_RANDOM) } != 0 { + return Err(std::io::Error::last_os_error()); + } + + Ok(Self { file, length, ptr }) + } + + /// Mapped buffer length in bytes + pub fn length(&self) -> usize { + self.length + } + + /// Copies chunk of bytes to the specified buffer + /// + /// # Safety + /// The caller must take care that the buffer is not out of the mapped memory! + pub unsafe fn read_exact_at(&self, offset: usize, buffer: &mut [u8]) { + std::ptr::copy_nonoverlapping( + (self.ptr as *const u8).add(offset), + buffer.as_mut_ptr(), + buffer.len(), + ); + } + + /// Copies buffer to the mapped memory + /// + /// # Safety + /// The caller must take care that the buffer is not out of the mapped memory! + pub unsafe fn write_all_at(&self, offset: usize, buffer: &[u8]) { + std::ptr::copy_nonoverlapping( + buffer.as_ptr(), + (self.ptr.cast::()).add(offset), + buffer.len(), + ); + } +} + +impl Drop for MappedFile { + fn drop(&mut self) { + // SAFETY: File still exists, ptr and length were initialized once on creation + if unsafe { libc::munmap(self.ptr, self.length) } != 0 { + // TODO: how to handle this? 
+ panic!("failed to unmap file: {}", std::io::Error::last_os_error()); + } + + let _ = self.file.set_len(0); + let _ = self.file.sync_all(); + } +} + +unsafe impl Send for MappedFile {} +unsafe impl Sync for MappedFile {} diff --git a/storage/src/db/file_db/mod.rs b/storage/src/db/file_db/mod.rs new file mode 100644 index 000000000..be5c69323 --- /dev/null +++ b/storage/src/db/file_db/mod.rs @@ -0,0 +1,149 @@ +use std::fs::{File, OpenOptions}; +use std::os::fd::AsRawFd; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +pub use self::mapped_file::MappedFile; +pub use self::temp_file::TempFile; + +mod mapped_file; +mod temp_file; + +#[derive(Clone)] +pub struct FileDb(Arc); + +impl FileDb { + pub fn new

(root: P) -> Self + where + P: AsRef, + { + Self(Arc::new(FileDbInner { + base_dir: root.as_ref().to_path_buf(), + })) + } + + pub fn path(&self) -> &Path { + &self.0.base_dir + } + + pub fn ensure_exists(&self) -> std::io::Result<()> { + std::fs::create_dir_all(&self.0.base_dir) + } + + pub fn create_dir_all>(&self, rel_path: P) -> std::io::Result<()> { + std::fs::create_dir_all(self.0.base_dir.join(rel_path)) + } + + pub fn remove_file>(&self, rel_path: P) -> std::io::Result<()> { + std::fs::remove_file(self.0.base_dir.join(rel_path)) + } + + pub fn file>(&self, rel_path: P) -> FileBuilder { + FileBuilder { + path: self.0.base_dir.join(rel_path.as_ref()), + options: std::fs::OpenOptions::new(), + prealloc: None, + } + } + + pub fn subdir>(&self, rel_path: P) -> Self { + Self(Arc::new(FileDbInner { + base_dir: self.0.base_dir.join(rel_path), + })) + } + + pub fn file_exists>(&self, rel_path: P) -> bool { + self.path().join(rel_path).is_file() + } + + pub fn entries(&self) -> std::io::Result { + std::fs::read_dir(&self.0.base_dir) + } +} + +struct FileDbInner { + base_dir: PathBuf, +} + +pub struct FileBuilder { + path: PathBuf, + options: OpenOptions, + prealloc: Option, +} + +impl FileBuilder { + pub fn open(&self) -> std::io::Result { + let file = self.options.open(&self.path)?; + if let Some(prealloc) = self.prealloc { + alloc_file(&file, prealloc)?; + } + Ok(file) + } + + pub fn open_as_temp(&self) -> std::io::Result { + let file = self.open()?; + Ok(TempFile::new(self.path.clone(), file)) + } + + pub fn open_as_mapped(&self) -> std::io::Result { + match self.prealloc { + Some(length) => MappedFile::new(&self.path, length), + None => MappedFile::from_existing_file(self.open()?), + } + } + + pub fn append(&mut self, append: bool) -> &mut Self { + self.options.append(append); + self + } + + pub fn create(&mut self, create: bool) -> &mut Self { + self.options.create(create); + self + } + + pub fn create_new(&mut self, create_new: bool) -> &mut Self { + 
self.options.create_new(create_new); + self + } + + pub fn read(&mut self, read: bool) -> &mut Self { + self.options.read(read); + self + } + + pub fn truncate(&mut self, truncate: bool) -> &mut Self { + self.options.truncate(truncate); + self + } + + pub fn write(&mut self, write: bool) -> &mut Self { + self.options.write(write); + self + } + + pub fn prealloc(&mut self, prealloc: usize) -> &mut Self { + self.prealloc = Some(prealloc); + self + } +} + +#[cfg(not(target_os = "macos"))] +fn alloc_file(file: &File, len: usize) -> std::io::Result<()> { + let res = unsafe { libc::posix_fallocate(file.as_raw_fd(), 0, len as i64) }; + if res == 0 { + Ok(()) + } else { + Err(std::io::Error::last_os_error()) + } +} + +#[cfg(target_os = "macos")] +pub fn alloc_file(file: &File, len: usize) -> std::io::Result<()> { + let res = unsafe { libc::ftruncate(file.as_raw_fd(), len as i64) }; + if res < 0 { + Err(std::io::Error::last_os_error()) + } else { + Ok(()) + } +} diff --git a/storage/src/db/file_db/temp_file.rs b/storage/src/db/file_db/temp_file.rs new file mode 100644 index 000000000..afa233b73 --- /dev/null +++ b/storage/src/db/file_db/temp_file.rs @@ -0,0 +1,67 @@ +use std::fs::File; +use std::mem::ManuallyDrop; +use std::path::PathBuf; + +pub struct TempFile { + file: ManuallyDrop, + file_path: Option, +} + +impl TempFile { + pub fn new(path: PathBuf, file: File) -> Self { + Self { + file: ManuallyDrop::new(file), + file_path: Some(path), + } + } + + pub fn disarm(mut self) -> File { + self.file_path = None; + + // SAFETY: File will not be dropped as `file_path` is `None`. 
+ unsafe { ManuallyDrop::take(&mut self.file) } + } +} + +impl AsRef for TempFile { + #[inline] + fn as_ref(&self) -> &File { + &self.file + } +} + +impl AsMut for TempFile { + #[inline] + fn as_mut(&mut self) -> &mut File { + &mut self.file + } +} + +impl std::ops::Deref for TempFile { + type Target = File; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.file + } +} + +impl std::ops::DerefMut for TempFile { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.file + } +} + +impl Drop for TempFile { + fn drop(&mut self) { + if let Some(file_path) = self.file_path.take() { + // SAFETY: File will only be dropped once. + unsafe { ManuallyDrop::drop(&mut self.file) }; + + if let Err(e) = std::fs::remove_file(&file_path) { + tracing::error!(path = %file_path.display(), "failed to remove file: {e:?}"); + } + } + } +} diff --git a/storage/src/db/kv_db/config.rs b/storage/src/db/kv_db/config.rs new file mode 100644 index 000000000..bd2d53b2d --- /dev/null +++ b/storage/src/db/kv_db/config.rs @@ -0,0 +1,55 @@ +use bytesize::ByteSize; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields, default)] +pub struct DbOptions { + pub rocksdb_lru_capacity: ByteSize, + pub cells_cache_size: ByteSize, +} + +impl Default for DbOptions { + fn default() -> Self { + // Fetch the currently available memory in bytes + let available = { + let mut sys = sysinfo::System::new(); + sys.refresh_memory(); + sys.available_memory() + }; + + // Estimated memory usage of components other than cache: + // - 2 GiBs for write buffers(4 if we are out of luck and all memtables are being flushed at the same time) + // - 2 GiBs for indexer logic + // - 10 bits per cell for bloom filter. 
Realistic case is 100M cells, so 0.25 GiBs + // - 1/3 of all available memory is reserved for kernel buffers + const WRITE_BUFFERS: ByteSize = ByteSize::gib(2); + const INDEXER_LOGIC: ByteSize = ByteSize::gib(2); + const BLOOM_FILTER: ByteSize = ByteSize::mib(256); + let estimated_memory_usage = WRITE_BUFFERS + INDEXER_LOGIC + BLOOM_FILTER + available / 3; + + // Reduce the available memory by the fixed offset + let available = available + .checked_sub(estimated_memory_usage.as_u64()) + .unwrap_or_else(|| { + tracing::error!( + "Not enough memory for cache, using 1/4 of all available memory. \ + Tweak `db_options` in config to improve performance." + ); + available / 4 + }); + + // We will use 3/4 of available memory for the cells cache (at most 4 GB). + let cells_cache_size = std::cmp::min(ByteSize(available * 4 / 3), ByteSize::gib(4)); + + // The reset of the memory is used for LRU cache (at least 128 MB) + let rocksdb_lru_capacity = std::cmp::max( + ByteSize(available.saturating_sub(cells_cache_size.as_u64())), + ByteSize::mib(128), + ); + + Self { + rocksdb_lru_capacity, + cells_cache_size, + } + } +} diff --git a/storage/src/db/kv_db/migrations/mod.rs b/storage/src/db/kv_db/migrations/mod.rs new file mode 100644 index 000000000..17a01170f --- /dev/null +++ b/storage/src/db/kv_db/migrations/mod.rs @@ -0,0 +1,50 @@ +use anyhow::Result; +use weedb::{Migrations, Semver, VersionProvider, WeeDb}; + +use super::tables; + +// declare migrations here as `mod v2_1_x` + +const CURRENT_VERSION: Semver = [2, 1, 0]; + +pub fn apply(db: &WeeDb) -> Result<()> { + let migrations = + Migrations::with_target_version_and_provider(CURRENT_VERSION, NodeStateVersionProvider); + + // === register migrations here === + // v2_1_1::register(&mut migrations).context("Failed to register migrations")?; + + db.apply(migrations)?; + Ok(()) +} + +struct NodeStateVersionProvider; + +impl NodeStateVersionProvider { + const DB_VERSION_KEY: &'static str = "db_version"; +} + +impl VersionProvider 
for NodeStateVersionProvider { + fn get_version(&self, db: &weedb::WeeDb) -> Result, weedb::Error> { + let state = db.instantiate_table::(); + + let value = state.get(Self::DB_VERSION_KEY)?; + match value { + Some(version) => { + let slice = version.as_ref(); + slice + .try_into() + .map_err(|_e| weedb::Error::InvalidDbVersion) + .map(Some) + } + None => Ok(None), + } + } + + fn set_version(&self, db: &weedb::WeeDb, version: Semver) -> Result<(), weedb::Error> { + let state = db.instantiate_table::(); + + state.insert(Self::DB_VERSION_KEY, version)?; + Ok(()) + } +} diff --git a/storage/src/db/kv_db/mod.rs b/storage/src/db/kv_db/mod.rs new file mode 100644 index 000000000..aeed753fd --- /dev/null +++ b/storage/src/db/kv_db/mod.rs @@ -0,0 +1,250 @@ +use std::path::PathBuf; +use std::sync::Arc; +use std::thread::available_parallelism; + +use anyhow::{Context, Result}; +use bytesize::ByteSize; +use weedb::{Caches, WeeDb}; + +pub use weedb::Stats as RocksdbStats; +pub use weedb::{rocksdb, BoundedCfHandle, ColumnFamily, Table}; + +pub use self::config::DbOptions; + +pub mod refcount; +pub mod tables; + +mod config; +mod migrations; + +pub struct Db { + pub archives: Table, + pub block_handles: Table, + pub key_blocks: Table, + pub package_entries: Table, + pub shard_states: Table, + pub cells: Table, + pub node_states: Table, + pub prev1: Table, + pub prev2: Table, + pub next1: Table, + pub next2: Table, + + compaction_lock: tokio::sync::RwLock<()>, + inner: WeeDb, +} + +impl Db { + pub fn open(path: PathBuf, options: DbOptions) -> Result> { + tracing::info!( + rocksdb_lru_capacity = %options.rocksdb_lru_capacity, + cells_cache_size = %options.cells_cache_size, + "opening DB" + ); + + let limit = match fdlimit::raise_fd_limit() { + // New fd limit + Ok(fdlimit::Outcome::LimitRaised { to, .. 
}) => to, + // Current soft limit + _ => { + rlimit::getrlimit(rlimit::Resource::NOFILE) + .unwrap_or((256, 0)) + .0 + } + }; + + let caches_capacity = + std::cmp::max(options.rocksdb_lru_capacity, ByteSize::mib(256)).as_u64() as usize; + + let caches = Caches::with_capacity(caches_capacity); + let threads = available_parallelism()?.get(); + + let inner = WeeDb::builder(path, caches) + .options(|opts, _| { + opts.set_paranoid_checks(false); + + // bigger base level size - less compactions + // parallel compactions finishes faster - less write stalls + + opts.set_max_subcompactions(threads as u32 / 2); + + // io + opts.set_max_open_files(limit as i32); + + // logging + opts.set_log_level(rocksdb::LogLevel::Info); + opts.set_keep_log_file_num(2); + opts.set_recycle_log_file_num(2); + + // cf + opts.create_if_missing(true); + opts.create_missing_column_families(true); + + // cpu + opts.set_max_background_jobs(std::cmp::max((threads as i32) / 2, 2)); + opts.increase_parallelism(threads as i32); + + opts.set_allow_concurrent_memtable_write(false); + opts.set_enable_write_thread_adaptive_yield(true); + + // debug + // NOTE: could slower everything a bit in some cloud environments. 
+ // See: https://github.com/facebook/rocksdb/issues/3889 + // + // opts.enable_statistics(); + // opts.set_stats_dump_period_sec(600); + }) + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .with_table::() + .build() + .context("Failed building db")?; + + migrations::apply(&inner).context("Failed to apply migrations")?; + + Ok(Arc::new(Self { + archives: inner.instantiate_table(), + block_handles: inner.instantiate_table(), + key_blocks: inner.instantiate_table(), + package_entries: inner.instantiate_table(), + shard_states: inner.instantiate_table(), + cells: inner.instantiate_table(), + node_states: inner.instantiate_table(), + prev1: inner.instantiate_table(), + prev2: inner.instantiate_table(), + next1: inner.instantiate_table(), + next2: inner.instantiate_table(), + compaction_lock: tokio::sync::RwLock::default(), + inner, + })) + } + + #[inline] + pub fn raw(&self) -> &Arc { + self.inner.raw() + } + + pub fn get_memory_usage_stats(&self) -> Result { + self.inner.get_memory_usage_stats().map_err(From::from) + } + + pub async fn delay_compaction(&self) -> tokio::sync::RwLockReadGuard<'_, ()> { + self.compaction_lock.read().await + } + + pub async fn trigger_compaction(&self) { + use std::time::Instant; + + let _compaction_guard = self.compaction_lock.write().await; + + let tables = [ + (self.block_handles.cf(), "block handles"), + (self.package_entries.cf(), "package entries"), + (self.archives.cf(), "archives"), + (self.shard_states.cf(), "shard states"), + (self.cells.cf(), "cells"), + ]; + + for (cf, title) in tables { + tracing::info!("{title} compaction started"); + + let instant = Instant::now(); + + let bound = Option::<[u8; 0]>::None; + self.raw().compact_range_cf(&cf, bound, bound); + + tracing::info!( + elapsed = %humantime::format_duration(instant.elapsed()), + "{title} compaction finished" + ); + } + } + + pub fn 
get_disk_usage(&self) -> Result> { + use std::thread; + + fn get_table_stats(db: &WeeDb) -> (ByteSize, ByteSize) { + let cf = db.instantiate_table::(); + let res: (usize, usize) = cf + .iterator(rocksdb::IteratorMode::Start) + .filter_map(|x| { + let x = match x { + Ok(x) => x, + Err(e) => { + tracing::error!("Error while iterating: {}", e); + return None; + } + }; + Some((x.0.len(), x.1.len())) + }) + .fold((0, 0), |acc, x| (acc.0 + x.0, acc.1 + x.1)); + + (ByteSize(res.0 as u64), ByteSize(res.1 as u64)) + } + + macro_rules! stats { + ($spawner:expr, $( $x:ident => $table:ty ),* ) => {{ + $( + let $x = $spawner.spawn(|| get_table_stats::<$table>(&self.inner)); + )* + stats!($($x),*) + } + }; + ( $( $x:ident),* ) => { + { + let mut temp_vec = Vec::new(); + $( + temp_vec.push({ + let $x = $x.join().map_err(|_|anyhow::anyhow!("Join error"))?; + DiskUsageInfo { + cf_name: stringify!($x).to_string(), + keys_total: $x.0, + values_total: $x.1, + } + }); + )* + return Ok(temp_vec) + } + }; + } + + let stats = thread::scope(|s| -> Result> { + stats!(s, + archives => tables::Archives, + block_handles => tables::BlockHandles, + key_blocks => tables::KeyBlocks, + package_entries => tables::PackageEntries, + shard_states => tables::ShardStates, + cells => tables::Cells, + node_states => tables::NodeStates, + prev1 => tables::Prev1, + prev2 => tables::Prev2, + next1 => tables::Next1, + next2 => tables::Next2 + ) + })?; + + Ok(stats) + } +} + +#[derive(Debug, Clone)] +pub struct DiskUsageInfo { + pub cf_name: String, + pub keys_total: ByteSize, + pub values_total: ByteSize, +} + +impl Drop for Db { + fn drop(&mut self) { + self.raw().cancel_all_background_work(true); + } +} diff --git a/storage/src/db/kv_db/refcount.rs b/storage/src/db/kv_db/refcount.rs new file mode 100644 index 000000000..a9fbb800d --- /dev/null +++ b/storage/src/db/kv_db/refcount.rs @@ -0,0 +1,85 @@ +use std::cmp::Ordering; +use std::convert::TryInto; + +use weedb::rocksdb; +use 
weedb::rocksdb::compaction_filter::Decision;

/// Associative RocksDB merge operator for reference-counted values.
///
/// Each operand (and the existing value) is decoded by
/// [`decode_value_with_rc`]: a little-endian `RcType` refcount delta,
/// optionally followed by a payload. Deltas are summed; the payload of the
/// first operand with a positive delta is kept.
pub fn merge_operator(
    _key: &[u8],
    existing: Option<&[u8]>,
    operands: &rocksdb::MergeOperands,
) -> Option<Vec<u8>> {
    let (mut rc, mut payload) = existing.map_or((0, None), decode_value_with_rc);
    for (delta, new_payload) in operands.into_iter().map(decode_value_with_rc) {
        // Keep the payload from the first operand that actually adds a reference.
        if payload.is_none() && delta > 0 {
            payload = new_payload;
        }
        rc += delta;
    }

    Some(match rc.cmp(&0) {
        // Negative refcount: store only the (negative) counter bytes.
        Ordering::Less => rc.to_le_bytes().to_vec(),
        // Zero refcount: empty value, physically removed later by `compaction_filter`.
        Ordering::Equal => Vec::new(),
        // Positive refcount: counter prefix followed by the payload.
        Ordering::Greater => {
            let payload = payload.unwrap_or(&[]);
            let mut result = Vec::with_capacity(RC_BYTES + payload.len());
            result.extend_from_slice(&rc.to_le_bytes());
            result.extend_from_slice(payload);
            result
        }
    })
}

/// Compaction-filter companion to [`merge_operator`]: removes entries whose
/// merged value became empty (i.e. whose refcount reached zero).
pub fn compaction_filter(_level: u32, _key: &[u8], value: &[u8]) -> Decision {
    if value.is_empty() {
        Decision::Remove
    } else {
        Decision::Keep
    }
}

/// Splits an encoded value into `(refcount, payload)`.
///
/// Returns `(0, None)` for values shorter than the refcount prefix, and
/// `None` for the payload when the refcount is non-positive or nothing
/// follows the prefix.
pub fn decode_value_with_rc(bytes: &[u8]) -> (RcType, Option<&[u8]>) {
    let without_payload = match bytes.len().cmp(&RC_BYTES) {
        std::cmp::Ordering::Greater => false,
        std::cmp::Ordering::Equal => true,
        std::cmp::Ordering::Less => return (0, None),
    };

    let rc = RcType::from_le_bytes(bytes[..RC_BYTES].try_into().unwrap());
    if rc <= 0 || without_payload {
        (rc, None)
    } else {
        (rc, Some(&bytes[RC_BYTES..]))
    }
}

/// Returns the payload without its refcount prefix, or `None` when the value
/// is too short or the refcount is not positive.
// NOTE: will be used by the persistent storage writer.
pub fn strip_refcount(bytes: &[u8]) -> Option<&[u8]> {
    if bytes.len() < RC_BYTES {
        return None;
    }
    if RcType::from_le_bytes(bytes[..RC_BYTES].try_into().unwrap()) > 0 {
        Some(&bytes[RC_BYTES..])
    } else {
        None
    }
}

/// Appends a positive refcount prefix (and an optional payload) to `target`.
// NOTE(review): the name is misspelled ("refount"), but it is kept as-is
// because callers elsewhere in the crate depend on it.
pub fn add_positive_refount(rc: u32, data: Option<&[u8]>, target: &mut Vec<u8>) {
    target.extend_from_slice(&RcType::from(rc).to_le_bytes());
    if let Some(data) = data {
        target.extend_from_slice(data);
    }
}

/// Encodes a standalone positive refcount delta (no payload).
pub fn encode_positive_refcount(rc: u32) -> [u8; RC_BYTES] {
    RcType::from(rc).to_le_bytes()
}

/// Encodes a standalone negative refcount delta (no payload).
pub fn encode_negative_refcount(rc: u32) -> [u8; RC_BYTES] {
    (-RcType::from(rc)).to_le_bytes()
}

/// Signed refcount representation stored in the value prefix.
type RcType = i64;

/// Byte length of the refcount prefix.
const RC_BYTES: usize = std::mem::size_of::<RcType>();
diff --git a/storage/src/db/kv_db/tables.rs b/storage/src/db/kv_db/tables.rs new file mode 100644 index 000000000..0d588c972 --- /dev/null +++ b/storage/src/db/kv_db/tables.rs @@ -0,0 +1,293 @@
use bytesize::ByteSize;
use weedb::rocksdb::{
    BlockBasedIndexType, BlockBasedOptions, DBCompressionType, DataBlockIndexType, MergeOperands,
    Options, ReadOptions,
};
use weedb::{rocksdb, Caches, ColumnFamily};

use super::refcount;

/// Stores prepared archives
/// - Key: `u32 (BE)` (archive id)
/// - Value: `Vec<u8>` (archive data)
pub struct Archives;
impl ColumnFamily for Archives {
    const NAME: &'static str = "archives";

    fn options(opts: &mut Options, caches: &Caches) {
        default_block_based_table_factory(opts, caches);
        optimize_for_level_compaction(opts, ByteSize::mib(512u64));

        opts.set_merge_operator_associative("archive_data_merge", archive_data_merge);
        opts.set_compression_type(DBCompressionType::Zstd);
    }
}

/// Maps block root hash to block meta
/// - Key: `[u8; 32]`
/// - Value: `BlockMeta`
pub struct BlockHandles;
impl ColumnFamily for BlockHandles {
    const NAME: &'static str = "block_handles";

    fn options(opts: &mut Options, caches: &Caches) {
        optimize_for_level_compaction(opts, ByteSize::mib(512u64));

        let mut block_factory = BlockBasedOptions::default();
        block_factory.set_block_cache(&caches.block_cache);

        block_factory.set_index_type(BlockBasedIndexType::HashSearch);
        block_factory.set_data_block_index_type(DataBlockIndexType::BinaryAndHash);
        block_factory.set_format_version(5);

        opts.set_block_based_table_factory(&block_factory);
        optimize_for_point_lookup(opts, caches);
    }

    fn read_options(opts: &mut ReadOptions) {
        opts.set_verify_checksums(false);
    }
}

/// Maps seqno to key block id
/// - Key: `u32 (BE)`
/// - Value: `ton_block::BlockIdExt`
pub struct KeyBlocks;
impl
ColumnFamily for KeyBlocks { + const NAME: &'static str = "key_blocks"; + + fn read_options(opts: &mut ReadOptions) { + opts.set_verify_checksums(false); + } +} + +/// Maps package entry id to entry data +/// - Key: `BlockIdShort (16 bytes), [u8; 32], package type (1 byte)` +/// - Value: `Vec` +pub struct PackageEntries; +impl ColumnFamily for PackageEntries { + const NAME: &'static str = "package_entries"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + opts.set_compression_type(DBCompressionType::Zstd); + + // This flag specifies that the implementation should optimize the filters + // mainly for cases where keys are found rather than also optimize for keys + // missed. This would be used in cases where the application knows that + // there are very few misses or the performance in the case of misses is not + // important. + // + // For now, this flag allows us to not store filters for the last level i.e + // the largest level which contains data of the LSM store. For keys which + // are hits, the filters in this level are not useful because we will search + // for the data anyway. NOTE: the filters in other levels are still useful + // even for key hit because they tell us whether to look in that level or go + // to the higher level. 
+ // https://github.com/facebook/rocksdb/blob/81aeb15988e43c49952c795e32e5c8b224793589/include/rocksdb/advanced_options.h#L846 + opts.set_optimize_filters_for_hits(true); + } +} + +/// Maps `BlockId` to root cell hash +/// - Key: `BlockId` +/// - Value: `[u8; 32]` +pub struct ShardStates; +impl ColumnFamily for ShardStates { + const NAME: &'static str = "shard_states"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + opts.set_compression_type(DBCompressionType::Zstd); + } +} + +/// Stores cells data +/// - Key: `[u8; 32]` (cell repr hash) +/// - Value: `StorageCell` +pub struct Cells; +impl ColumnFamily for Cells { + const NAME: &'static str = "cells"; + + fn options(opts: &mut Options, caches: &Caches) { + opts.set_level_compaction_dynamic_level_bytes(true); + + opts.set_merge_operator_associative("cell_merge", refcount::merge_operator); + opts.set_compaction_filter("cell_compaction", refcount::compaction_filter); + + optimize_for_level_compaction(opts, ByteSize::gib(1u64)); + + let mut block_factory = BlockBasedOptions::default(); + block_factory.set_block_cache(&caches.block_cache); + block_factory.set_data_block_index_type(DataBlockIndexType::BinaryAndHash); + block_factory.set_whole_key_filtering(true); + block_factory.set_checksum_type(rocksdb::ChecksumType::NoChecksum); + + block_factory.set_bloom_filter(10.0, false); + block_factory.set_block_size(16 * 1024); + block_factory.set_format_version(5); + + opts.set_block_based_table_factory(&block_factory); + opts.set_optimize_filters_for_hits(true); + // option is set for cf + opts.set_compression_type(DBCompressionType::Lz4); + } +} + +/// Stores generic node parameters +/// - Key: `...` +/// - Value: `...` +pub struct NodeStates; +impl ColumnFamily for NodeStates { + const NAME: &'static str = "node_states"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + + 
opts.set_optimize_filters_for_hits(true); + optimize_for_point_lookup(opts, caches); + } +} + +/// Stores connections data +/// - Key: `[u8; 32]` (block root hash) +/// - Value: `BlockId (LE)` +pub struct Prev1; +impl ColumnFamily for Prev1 { + const NAME: &'static str = "prev1"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + + optimize_for_point_lookup(opts, caches); + } + + fn read_options(opts: &mut ReadOptions) { + opts.set_verify_checksums(false); + } +} + +/// Stores connections data +/// - Key: `[u8; 32]` (block root hash) +/// - Value: `BlockId (LE)` +pub struct Prev2; +impl ColumnFamily for Prev2 { + const NAME: &'static str = "prev2"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + + optimize_for_point_lookup(opts, caches); + } + + fn read_options(opts: &mut ReadOptions) { + opts.set_verify_checksums(false); + } +} + +/// Stores connections data +/// - Key: `[u8; 32]` (block root hash) +/// - Value: `BlockId (LE)` +pub struct Next1; +impl ColumnFamily for Next1 { + const NAME: &'static str = "next1"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + + optimize_for_point_lookup(opts, caches); + } + + fn read_options(opts: &mut ReadOptions) { + opts.set_verify_checksums(false); + } +} + +/// Stores connections data +/// - Key: `[u8; 32]` (block root hash) +/// - Value: `BlockId (LE)` +pub struct Next2; +impl ColumnFamily for Next2 { + const NAME: &'static str = "next2"; + + fn options(opts: &mut Options, caches: &Caches) { + default_block_based_table_factory(opts, caches); + + optimize_for_point_lookup(opts, caches); + } + + fn read_options(opts: &mut ReadOptions) { + opts.set_verify_checksums(false); + } +} + +fn archive_data_merge( + _: &[u8], + current_value: Option<&[u8]>, + operands: &MergeOperands, +) -> Option> { + use tycho_block_util::archive::ARCHIVE_PREFIX; + + let 
total_len: usize = operands.iter().map(|data| data.len()).sum(); + let mut result = Vec::with_capacity(ARCHIVE_PREFIX.len() + total_len); + + result.extend_from_slice(current_value.unwrap_or(&ARCHIVE_PREFIX)); + + for data in operands { + let data = data.strip_prefix(&ARCHIVE_PREFIX).unwrap_or(data); + result.extend_from_slice(data); + } + + Some(result) +} + +fn default_block_based_table_factory(opts: &mut Options, caches: &Caches) { + opts.set_level_compaction_dynamic_level_bytes(true); + let mut block_factory = BlockBasedOptions::default(); + block_factory.set_block_cache(&caches.block_cache); + block_factory.set_format_version(5); + opts.set_block_based_table_factory(&block_factory); +} + +// setting our shared cache instead of individual caches for each cf +fn optimize_for_point_lookup(opts: &mut Options, caches: &Caches) { + // https://github.com/facebook/rocksdb/blob/81aeb15988e43c49952c795e32e5c8b224793589/options/options.cc + // BlockBasedTableOptions block_based_options; + // block_based_options.data_block_index_type = + // BlockBasedTableOptions::kDataBlockBinaryAndHash; + // block_based_options.data_block_hash_table_util_ratio = 0.75; + // block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); + // block_based_options.block_cache = + // NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); + // table_factory.reset(new BlockBasedTableFactory(block_based_options)); + // memtable_prefix_bloom_size_ratio = 0.02; + // memtable_whole_key_filtering = true; + // + let mut block_factory = BlockBasedOptions::default(); + block_factory.set_data_block_index_type(DataBlockIndexType::BinaryAndHash); + block_factory.set_data_block_hash_ratio(0.75); + block_factory.set_bloom_filter(10.0, false); + block_factory.set_block_cache(&caches.block_cache); + opts.set_block_based_table_factory(&block_factory); + + opts.set_memtable_prefix_bloom_ratio(0.02); + opts.set_memtable_whole_key_filtering(true); +} + +fn optimize_for_level_compaction(opts: &mut 
Options, budget: ByteSize) { + opts.set_write_buffer_size(budget.as_u64() as usize / 4); + // this means we'll use 50% extra memory in the worst case, but will reduce + // write stalls. + opts.set_min_write_buffer_number_to_merge(2); + // this means we'll use 50% extra memory in the worst case, but will reduce + // write stalls. + opts.set_max_write_buffer_number(6); + // start flushing L0->L1 as soon as possible. each file on level0 is + // (memtable_memory_budget / 2). This will flush level 0 when it's bigger than + // memtable_memory_budget. + opts.set_level_zero_file_num_compaction_trigger(2); + // doesn't really matter much, but we don't want to create too many files + opts.set_target_file_size_base(budget.as_u64() / 8); + // make Level1 size equal to Level0 size, so that L0->L1 compactions are fast + opts.set_max_bytes_for_level_base(budget.as_u64()); +} diff --git a/storage/src/db/mod.rs b/storage/src/db/mod.rs new file mode 100644 index 000000000..1151c28bc --- /dev/null +++ b/storage/src/db/mod.rs @@ -0,0 +1,5 @@ +pub use self::file_db::*; +pub use self::kv_db::*; + +mod file_db; +mod kv_db; diff --git a/storage/src/lib.rs b/storage/src/lib.rs index 8b1378917..e671a026c 100644 --- a/storage/src/lib.rs +++ b/storage/src/lib.rs @@ -1 +1,96 @@ +use std::path::PathBuf; +use std::sync::Arc; +pub use self::db::*; +pub use self::models::*; +pub use self::store::*; + +mod db; +mod models; +mod store; + +mod util { + pub use stored_value::*; + + mod stored_value; +} + +pub struct Storage { + runtime_storage: Arc, + block_handle_storage: Arc, + block_storage: Arc, + shard_state_storage: ShardStateStorage, + block_connection_storage: BlockConnectionStorage, + node_state_storage: NodeStateStorage, + persistent_state_storage: PersistentStateStorage, +} + +impl Storage { + pub fn new( + db: Arc, + file_db_path: PathBuf, + max_cell_cache_size_bytes: u64, + ) -> anyhow::Result> { + let files_dir = FileDb::new(file_db_path); + + let block_handle_storage = 
Arc::new(BlockHandleStorage::new(db.clone())); + let runtime_storage = Arc::new(RuntimeStorage::new(block_handle_storage.clone())); + let block_storage = Arc::new(BlockStorage::new(db.clone(), block_handle_storage.clone())?); + let shard_state_storage = ShardStateStorage::new( + db.clone(), + &files_dir, + block_handle_storage.clone(), + block_storage.clone(), + max_cell_cache_size_bytes, + )?; + let persistent_state_storage = + PersistentStateStorage::new(db.clone(), &files_dir, block_handle_storage.clone())?; + let node_state_storage = NodeStateStorage::new(db.clone()); + let block_connection_storage = BlockConnectionStorage::new(db); + + Ok(Arc::new(Self { + block_handle_storage, + block_storage, + shard_state_storage, + persistent_state_storage, + block_connection_storage, + node_state_storage, + runtime_storage, + })) + } + + #[inline] + pub fn runtime_storage(&self) -> &RuntimeStorage { + &self.runtime_storage + } + + #[inline] + pub fn persistent_state_storage(&self) -> &PersistentStateStorage { + &self.persistent_state_storage + } + + #[inline] + pub fn block_handle_storage(&self) -> &BlockHandleStorage { + &self.block_handle_storage + } + + #[inline] + pub fn block_storage(&self) -> &BlockStorage { + &self.block_storage + } + + #[inline] + pub fn block_connection_storage(&self) -> &BlockConnectionStorage { + &self.block_connection_storage + } + + #[inline] + pub fn shard_state_storage(&self) -> &ShardStateStorage { + &self.shard_state_storage + } + + #[inline] + pub fn node_state(&self) -> &NodeStateStorage { + &self.node_state_storage + } +} diff --git a/storage/src/models/block_handle.rs b/storage/src/models/block_handle.rs new file mode 100644 index 000000000..254584e40 --- /dev/null +++ b/storage/src/models/block_handle.rs @@ -0,0 +1,95 @@ +use std::sync::{Arc, Weak}; + +use anyhow::Result; +use everscale_types::models::*; +use tokio::sync::RwLock; + +use super::BlockMeta; +use tycho_util::FastDashMap; + +pub struct BlockHandle { + id: BlockId, + meta: 
BlockMeta, + block_data_lock: RwLock<()>, + proof_data_block: RwLock<()>, + cache: Arc>>, +} + +impl BlockHandle { + pub fn with_values( + id: BlockId, + meta: BlockMeta, + cache: Arc>>, + ) -> Self { + Self { + id, + meta, + block_data_lock: Default::default(), + proof_data_block: Default::default(), + cache, + } + } + + #[inline] + pub fn id(&self) -> &BlockId { + &self.id + } + + #[inline] + pub fn meta(&self) -> &BlockMeta { + &self.meta + } + + #[inline] + pub fn is_key_block(&self) -> bool { + self.meta.is_key_block() || self.id.seqno == 0 + } + + #[inline] + pub fn block_data_lock(&self) -> &RwLock<()> { + &self.block_data_lock + } + + #[inline] + pub fn proof_data_lock(&self) -> &RwLock<()> { + &self.proof_data_block + } + + pub fn has_proof_or_link(&self, is_link: &mut bool) -> bool { + *is_link = !self.id.shard.is_masterchain(); + if *is_link { + self.meta.has_proof_link() + } else { + self.meta.has_proof() + } + } + + pub fn masterchain_ref_seqno(&self) -> u32 { + if self.id.shard.is_masterchain() { + self.id.seqno + } else { + self.meta.masterchain_ref_seqno() + } + } + + pub fn set_masterchain_ref_seqno(&self, masterchain_ref_seqno: u32) -> Result { + match self.meta.set_masterchain_ref_seqno(masterchain_ref_seqno) { + 0 => Ok(true), + prev_seqno if prev_seqno == masterchain_ref_seqno => Ok(false), + _ => Err(BlockHandleError::RefSeqnoAlreadySet.into()), + } + } +} + +impl Drop for BlockHandle { + fn drop(&mut self) { + self.cache + .remove_if(&self.id, |_, weak| weak.strong_count() == 0); + } +} + +#[derive(thiserror::Error, Debug)] +enum BlockHandleError { + #[error("Different masterchain ref seqno has already been set")] + RefSeqnoAlreadySet, +} diff --git a/storage/src/models/block_meta.rs b/storage/src/models/block_meta.rs new file mode 100644 index 000000000..3581ab629 --- /dev/null +++ b/storage/src/models/block_meta.rs @@ -0,0 +1,277 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +use anyhow::Result; +use bytes::Buf; +use 
everscale_types::models::BlockInfo; + +use crate::util::{StoredValue, StoredValueBuffer}; + +#[derive(Debug, Copy, Clone)] +pub struct BlockMetaData { + pub is_key_block: bool, + pub gen_utime: u32, + pub mc_ref_seqno: Option, +} + +impl BlockMetaData { + pub fn zero_state(gen_utime: u32) -> Self { + Self { + is_key_block: true, + gen_utime, + mc_ref_seqno: Some(0), + } + } +} + +#[derive(Debug, Copy, Clone)] +pub struct BriefBlockInfo { + pub is_key_block: bool, + pub gen_utime: u32, + pub after_split: bool, +} + +impl From<&BlockInfo> for BriefBlockInfo { + fn from(info: &BlockInfo) -> Self { + Self { + is_key_block: info.key_block, + gen_utime: info.gen_utime, + after_split: info.after_split, + } + } +} +#[derive(Debug, Default)] +pub struct BlockMeta { + flags: AtomicU64, + gen_utime: u32, +} + +impl BlockMeta { + pub fn with_data(data: BlockMetaData) -> Self { + Self { + flags: AtomicU64::new( + if data.is_key_block { + BLOCK_META_FLAG_IS_KEY_BLOCK + } else { + 0 + } | data.mc_ref_seqno.unwrap_or_default() as u64, + ), + gen_utime: data.gen_utime, + } + } + + pub fn brief(&self) -> BriefBlockMeta { + BriefBlockMeta { + flags: self.flags.load(Ordering::Acquire), + gen_utime: self.gen_utime, + } + } + + pub fn masterchain_ref_seqno(&self) -> u32 { + self.flags.load(Ordering::Acquire) as u32 + } + + pub fn set_masterchain_ref_seqno(&self, seqno: u32) -> u32 { + self.flags.fetch_or(seqno as u64, Ordering::Release) as u32 + } + + #[inline] + pub fn gen_utime(&self) -> u32 { + self.gen_utime + } + + pub fn clear_data_and_proof(&self) { + self.flags.fetch_and(CLEAR_DATA_MASK, Ordering::Release); + } + + pub fn set_has_data(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_DATA) + } + + pub fn has_data(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_DATA) + } + + pub fn set_has_proof(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_PROOF) + } + + pub fn has_proof(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_PROOF) + } + + pub fn 
set_has_proof_link(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_PROOF_LINK) + } + + pub fn has_proof_link(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_PROOF_LINK) + } + + pub fn set_has_state(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_STATE) + } + + pub fn has_state(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_STATE) + } + + #[allow(unused)] + pub fn set_has_persistent_state(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_PERSISTENT_STATE) + } + + #[allow(unused)] + pub fn has_persistent_state(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_PERSISTENT_STATE) + } + + pub fn set_has_next1(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_NEXT_1) + } + + pub fn has_next1(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_NEXT_1) + } + + pub fn set_has_next2(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_NEXT_2) + } + + pub fn has_next2(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_NEXT_2) + } + + pub fn set_has_prev1(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_PREV_1) + } + + pub fn has_prev1(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_PREV_1) + } + + pub fn set_has_prev2(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_HAS_PREV_2) + } + + pub fn has_prev2(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_HAS_PREV_2) + } + + pub fn set_is_applied(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_IS_APPLIED) + } + + pub fn is_applied(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_IS_APPLIED) + } + + pub fn is_key_block(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_IS_KEY_BLOCK) + } + + pub fn set_is_moving_to_archive(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_MOVING_TO_ARCHIVE) + } + + pub fn set_is_archived(&self) -> bool { + self.set_flag(BLOCK_META_FLAG_MOVED_TO_ARCHIVE) + } + + pub fn is_archived(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_MOVED_TO_ARCHIVE) + } + + fn test_flag(&self, flag: u64) -> bool { + self.flags.load(Ordering::Acquire) & flag == flag + } + 
+ fn set_flag(&self, flag: u64) -> bool { + self.flags.fetch_or(flag, Ordering::Release) & flag != flag + } +} + +impl StoredValue for BlockMeta { + /// 8 bytes flags + /// 4 bytes `gen_utime` + const SIZE_HINT: usize = 8 + 4; + + type OnStackSlice = [u8; Self::SIZE_HINT]; + + fn serialize(&self, buffer: &mut T) { + const FLAGS_MASK: u64 = 0x0000_ffff_ffff_ffff; + let flags = self.flags.load(Ordering::Acquire) & FLAGS_MASK; + + buffer.write_raw_slice(&flags.to_le_bytes()); + buffer.write_raw_slice(&self.gen_utime.to_le_bytes()); + } + + fn deserialize(reader: &mut &[u8]) -> Result + where + Self: Sized, + { + let flags = reader.get_u64_le(); + let gen_utime = reader.get_u32_le(); + + Ok(Self { + flags: AtomicU64::new(flags), + gen_utime, + }) + } +} + +#[derive(Debug, Default, Copy, Clone)] +pub struct BriefBlockMeta { + flags: u64, + gen_utime: u32, +} + +impl BriefBlockMeta { + #[inline] + pub fn gen_utime(&self) -> u32 { + self.gen_utime + } + + #[inline] + pub fn masterchain_ref_seqno(&self) -> u32 { + self.flags as u32 + } + + #[inline] + pub fn is_key_block(&self) -> bool { + self.test_flag(BLOCK_META_FLAG_IS_KEY_BLOCK) + } + + #[inline] + fn test_flag(&self, flag: u64) -> bool { + self.flags & flag == flag + } +} + +const BLOCK_META_FLAG_HAS_DATA: u64 = 1 << 32; +const BLOCK_META_FLAG_HAS_PROOF: u64 = 1 << (32 + 1); +const BLOCK_META_FLAG_HAS_PROOF_LINK: u64 = 1 << (32 + 2); +// skip flag 3 (processed by external listener) +const BLOCK_META_FLAG_HAS_STATE: u64 = 1 << (32 + 4); +const BLOCK_META_FLAG_HAS_PERSISTENT_STATE: u64 = 1 << (32 + 5); +const BLOCK_META_FLAG_HAS_NEXT_1: u64 = 1 << (32 + 6); +const BLOCK_META_FLAG_HAS_NEXT_2: u64 = 1 << (32 + 7); +const BLOCK_META_FLAG_HAS_PREV_1: u64 = 1 << (32 + 8); +const BLOCK_META_FLAG_HAS_PREV_2: u64 = 1 << (32 + 9); +const BLOCK_META_FLAG_IS_APPLIED: u64 = 1 << (32 + 10); +const BLOCK_META_FLAG_IS_KEY_BLOCK: u64 = 1 << (32 + 11); + +const BLOCK_META_FLAG_MOVING_TO_ARCHIVE: u64 = 1 << (32 + 12); +const 
BLOCK_META_FLAG_MOVED_TO_ARCHIVE: u64 = 1 << (32 + 13); + +const CLEAR_DATA_MASK: u64 = + !(BLOCK_META_FLAG_HAS_DATA | BLOCK_META_FLAG_HAS_PROOF | BLOCK_META_FLAG_HAS_PROOF_LINK); + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + pub fn fully_on_stack() { + assert!(!BlockMeta::default().to_vec().spilled()); + } +} diff --git a/storage/src/models/mod.rs b/storage/src/models/mod.rs new file mode 100644 index 000000000..b106684ee --- /dev/null +++ b/storage/src/models/mod.rs @@ -0,0 +1,5 @@ +pub use block_handle::BlockHandle; +pub use block_meta::{BlockMeta, BlockMetaData, BriefBlockMeta}; + +mod block_handle; +mod block_meta; diff --git a/storage/src/store/block/mod.rs b/storage/src/store/block/mod.rs new file mode 100644 index 000000000..1e1b45efd --- /dev/null +++ b/storage/src/store/block/mod.rs @@ -0,0 +1,824 @@ +use std::borrow::Borrow; +use std::collections::BTreeSet; +use std::convert::TryInto; +use std::hash::Hash; +use std::ops::{Bound, RangeBounds}; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use everscale_types::models::*; +use parking_lot::RwLock; +use serde::{Deserialize, Serialize}; +use tycho_block_util::archive::{ + make_archive_entry, ArchiveEntryId, ArchiveReaderError, ArchiveVerifier, GetFileName, +}; +use tycho_block_util::block::{ + BlockProofStuff, BlockProofStuffAug, BlockStuff, BlockStuffAug, TopBlocks, +}; + +use crate::db::*; +use crate::util::*; +use crate::{models::*, BlockHandleStorage, HandleCreationStatus}; + +pub struct BlockStorage { + db: Arc, + block_handle_storage: Arc, + archive_ids: RwLock>, +} + +impl BlockStorage { + pub fn new(db: Arc, block_handle_storage: Arc) -> Result { + let manager = Self { + db, + block_handle_storage, + archive_ids: Default::default(), + }; + + manager.preload()?; + + Ok(manager) + } + + fn preload(&self) -> Result<()> { + fn check_archive(value: &[u8]) -> Result<(), ArchiveReaderError> { + let mut verifier = ArchiveVerifier::default(); + verifier.write_verify(value)?; + 
verifier.final_check() + } + + let mut iter = self.db.archives.raw_iterator(); + iter.seek_to_first(); + + let mut archive_ids = self.archive_ids.write(); + + while let (Some(key), value) = (iter.key(), iter.value()) { + let archive_id = u32::from_be_bytes( + key.try_into() + .with_context(|| format!("Invalid archive key: {}", hex::encode(key)))?, + ); + + if let Some(Err(e)) = value.map(check_archive) { + tracing::error!(archive_id, "failed to read archive: {e:?}"); + } + + archive_ids.insert(archive_id); + iter.next(); + } + + tracing::info!("selfcheck complete"); + Ok(()) + } + + pub async fn store_block_data( + &self, + block: &BlockStuffAug, + meta_data: BlockMetaData, + ) -> Result { + let block_id = block.id(); + let (handle, status) = self + .block_handle_storage + .create_or_load_handle(block_id, meta_data)?; + + let archive_id = ArchiveEntryId::Block(block_id); + let mut updated = false; + if !handle.meta().has_data() { + let data = block.new_archive_data()?; + + let _lock = handle.block_data_lock().write().await; + if !handle.meta().has_data() { + self.add_data(&archive_id, data)?; + if handle.meta().set_has_data() { + self.block_handle_storage.store_handle(&handle)?; + updated = true; + } + } + } + + Ok(StoreBlockResult { + handle, + updated, + new: status == HandleCreationStatus::Created, + }) + } + + pub async fn load_block_data(&self, handle: &BlockHandle) -> Result { + let raw_block = self.load_block_data_raw_ref(handle).await?; + BlockStuff::deserialize(*handle.id(), raw_block.as_ref()) + } + + pub async fn load_block_data_raw(&self, handle: &BlockHandle) -> Result> { + if !handle.meta().has_data() { + return Err(BlockStorageError::BlockDataNotFound.into()); + } + self.get_data(handle, &ArchiveEntryId::Block(handle.id())) + .await + } + + pub async fn load_block_data_raw_ref<'a>( + &'a self, + handle: &'a BlockHandle, + ) -> Result + 'a> { + if !handle.meta().has_data() { + return Err(BlockStorageError::BlockDataNotFound.into()); + } + 
self.get_data_ref(handle, &ArchiveEntryId::Block(handle.id())) + .await + } + + pub async fn store_block_proof( + &self, + proof: &BlockProofStuffAug, + handle: BlockProofHandle, + ) -> Result { + let block_id = proof.id(); + if matches!(&handle, BlockProofHandle::Existing(handle) if handle.id() != block_id) { + return Err(BlockStorageError::BlockHandleIdMismatch.into()); + } + + let (handle, status) = match handle { + BlockProofHandle::Existing(handle) => (handle, HandleCreationStatus::Fetched), + BlockProofHandle::New(meta_data) => self + .block_handle_storage + .create_or_load_handle(block_id, meta_data)?, + }; + + let mut updated = false; + if proof.is_link() { + let archive_id = ArchiveEntryId::ProofLink(block_id); + if !handle.meta().has_proof_link() { + let data = proof.new_archive_data()?; + + let _lock = handle.proof_data_lock().write().await; + if !handle.meta().has_proof_link() { + self.add_data(&archive_id, data)?; + if handle.meta().set_has_proof_link() { + self.block_handle_storage.store_handle(&handle)?; + updated = true; + } + } + } + } else { + let archive_id = ArchiveEntryId::Proof(block_id); + if !handle.meta().has_proof() { + let data = proof.new_archive_data()?; + + let _lock = handle.proof_data_lock().write().await; + if !handle.meta().has_proof() { + self.add_data(&archive_id, data)?; + if handle.meta().set_has_proof() { + self.block_handle_storage.store_handle(&handle)?; + updated = true; + } + } + } + } + + Ok(StoreBlockResult { + handle, + updated, + new: status == HandleCreationStatus::Created, + }) + } + + pub async fn load_block_proof( + &self, + handle: &BlockHandle, + is_link: bool, + ) -> Result { + let raw_proof = self.load_block_proof_raw_ref(handle, is_link).await?; + BlockProofStuff::deserialize(*handle.id(), raw_proof.as_ref(), is_link) + } + + pub async fn load_block_proof_raw( + &self, + handle: &BlockHandle, + is_link: bool, + ) -> Result> { + let (archive_id, exists) = if is_link { + ( + 
ArchiveEntryId::ProofLink(handle.id()), + handle.meta().has_proof_link(), + ) + } else { + ( + ArchiveEntryId::Proof(handle.id()), + handle.meta().has_proof(), + ) + }; + + if !exists { + return Err(BlockStorageError::BlockProofNotFound.into()); + } + + self.get_data(handle, &archive_id).await + } + + pub async fn load_block_proof_raw_ref<'a>( + &'a self, + handle: &'a BlockHandle, + is_link: bool, + ) -> Result + 'a> { + let (archive_id, exists) = if is_link { + ( + ArchiveEntryId::ProofLink(handle.id()), + handle.meta().has_proof_link(), + ) + } else { + ( + ArchiveEntryId::Proof(handle.id()), + handle.meta().has_proof(), + ) + }; + + if !exists { + return Err(BlockStorageError::BlockProofNotFound.into()); + } + + self.get_data_ref(handle, &archive_id).await + } + + pub async fn move_into_archive(&self, handle: &BlockHandle) -> Result<()> { + if handle.meta().is_archived() { + return Ok(()); + } + if !handle.meta().set_is_moving_to_archive() { + return Ok(()); + } + + // Prepare data + let block_id = handle.id(); + + let has_data = handle.meta().has_data(); + let mut is_link = false; + let has_proof = handle.has_proof_or_link(&mut is_link); + + let block_data = if has_data { + let lock = handle.block_data_lock().write().await; + + let entry_id = ArchiveEntryId::Block(block_id); + let data = self.make_archive_segment(&entry_id)?; + + Some((lock, data)) + } else { + None + }; + + let block_proof_data = if has_proof { + let lock = handle.proof_data_lock().write().await; + + let entry_id = if is_link { + ArchiveEntryId::ProofLink(block_id) + } else { + ArchiveEntryId::Proof(block_id) + }; + let data = self.make_archive_segment(&entry_id)?; + + Some((lock, data)) + } else { + None + }; + + // Prepare cf + let storage_cf = self.db.archives.cf(); + let handle_cf = self.db.block_handles.cf(); + + // Prepare archive + let archive_id = self.compute_archive_id(handle); + let archive_id_bytes = archive_id.to_be_bytes(); + + // 0. 
Create transaction + let mut batch = rocksdb::WriteBatch::default(); + // 1. Append archive segment with block data + if let Some((_, data)) = &block_data { + batch.merge_cf(&storage_cf, archive_id_bytes, data); + } + // 2. Append archive segment with block proof data + if let Some((_, data)) = &block_proof_data { + batch.merge_cf(&storage_cf, archive_id_bytes, data); + } + // 3. Update block handle meta + if handle.meta().set_is_archived() { + batch.put_cf( + &handle_cf, + block_id.root_hash.as_slice(), + handle.meta().to_vec(), + ); + } + // 5. Execute transaction + self.db.raw().write(batch)?; + + // Block will be removed after blocks gc + + // Done + Ok(()) + } + + pub fn move_into_archive_with_data( + &self, + handle: &BlockHandle, + is_link: bool, + block_data: &[u8], + block_proof_data: &[u8], + ) -> Result<()> { + if handle.meta().is_archived() { + return Ok(()); + } + if !handle.meta().set_is_moving_to_archive() { + return Ok(()); + } + + let block_id = handle.id(); + + // Prepare cf + let archives_cf = self.db.archives.cf(); + let block_handles_cf = self.db.block_handles.cf(); + + // Prepare archive + let archive_id = self.compute_archive_id(handle); + let archive_id_bytes = archive_id.to_be_bytes(); + + let mut batch = rocksdb::WriteBatch::default(); + + batch.merge_cf( + &archives_cf, + archive_id_bytes, + make_archive_entry(&ArchiveEntryId::Block(handle.id()).filename(), block_data), + ); + + batch.merge_cf( + &archives_cf, + archive_id_bytes, + make_archive_entry( + &if is_link { + ArchiveEntryId::ProofLink(block_id) + } else { + ArchiveEntryId::Proof(block_id) + } + .filename(), + block_proof_data, + ), + ); + + if handle.meta().set_is_archived() { + batch.put_cf( + &block_handles_cf, + block_id.root_hash.as_slice(), + handle.meta().to_vec(), + ); + } + + self.db.raw().write(batch)?; + + Ok(()) + } + + pub fn get_archive_id(&self, mc_seqno: u32) -> Option { + match self.archive_ids.read().range(..=mc_seqno).next_back() { + // NOTE: handles case when 
mc_seqno is far in the future. + // However if there is a key block between `id` and `mc_seqno`, + // this will return an archive without that specified block. + Some(id) if mc_seqno < id + ARCHIVE_PACKAGE_SIZE => Some(*id), + _ => None, + } + } + + #[allow(unused)] + pub fn get_archives( + &self, + range: impl RangeBounds + 'static, + ) -> impl Iterator)> + '_ { + struct ArchivesIterator<'a> { + first: bool, + ids: (Bound, Bound), + iter: rocksdb::DBRawIterator<'a>, + } + + impl<'a> Iterator for ArchivesIterator<'a> { + type Item = (u32, Vec); + + fn next(&mut self) -> Option { + if self.first { + match self.ids.0 { + Bound::Included(id) => { + self.iter.seek(id.to_be_bytes()); + } + Bound::Excluded(id) => { + self.iter.seek((id + 1).to_be_bytes()); + } + Bound::Unbounded => { + self.iter.seek_to_first(); + } + } + self.first = false; + } else { + self.iter.next(); + } + + match (self.iter.key(), self.iter.value()) { + (Some(key), Some(value)) => { + let id = u32::from_be_bytes(key.try_into().unwrap_or_default()); + match self.ids.1 { + Bound::Included(bound_id) if id > bound_id => None, + Bound::Excluded(bound_id) if id >= bound_id => None, + _ => Some((id, value.to_vec())), + } + } + _ => None, + } + } + } + + ArchivesIterator { + first: true, + ids: (range.start_bound().cloned(), range.end_bound().cloned()), + iter: self.db.archives.raw_iterator(), + } + } + + pub fn get_archive_slice( + &self, + id: u32, + offset: usize, + limit: usize, + ) -> Result>> { + match self.db.archives.get(id.to_be_bytes())? 
{ + Some(slice) if offset < slice.len() => { + let end = std::cmp::min(offset.saturating_add(limit), slice.len()); + Ok(Some(slice[offset..end].to_vec())) + } + Some(_) => Err(BlockStorageError::InvalidOffset.into()), + None => Ok(None), + } + } + + pub async fn remove_outdated_blocks( + &self, + key_block_id: &BlockId, + max_blocks_per_batch: Option, + gc_type: BlocksGcKind, + ) -> Result<()> { + let _compaction_guard = self.db.delay_compaction().await; + + // Find target block + let target_block = match gc_type { + BlocksGcKind::BeforePreviousKeyBlock => self + .block_handle_storage + .find_prev_key_block(key_block_id.seqno)?, + BlocksGcKind::BeforePreviousPersistentState => self + .block_handle_storage + .find_prev_persistent_key_block(key_block_id.seqno)?, + }; + + // Load target block data + let top_blocks = match target_block { + Some(handle) if handle.meta().has_data() => { + tracing::info!( + %key_block_id, + target_block_id = %handle.id(), + "starting blocks GC", + ); + self.load_block_data(&handle) + .await + .context("Failed to load target key block data") + .and_then(|block_data| TopBlocks::from_mc_block(&block_data)) + .context("Failed to compute top blocks for target block")? 
+ } + _ => { + tracing::info!(%key_block_id, "blocks GC skipped"); + return Ok(()); + } + }; + + // Remove all expired entries + let total_cached_handles_removed = self.block_handle_storage.gc_handles_cache(&top_blocks); + + let db = self.db.clone(); + let BlockGcStats { + mc_package_entries_removed, + total_package_entries_removed, + total_handles_removed, + } = tokio::task::spawn_blocking(move || { + remove_blocks(db, max_blocks_per_batch, &top_blocks) + }) + .await??; + + tracing::info!( + %key_block_id, + total_cached_handles_removed, + mc_package_entries_removed, + total_package_entries_removed, + total_handles_removed, + "finished blocks GC" + ); + + // Done + Ok(()) + } + + pub async fn remove_outdated_archives(&self, until_id: u32) -> Result<()> { + let _compaction_guard = self.db.delay_compaction().await; + + let mut archive_ids = self.archive_ids.write(); + + let retained_ids = match archive_ids.iter().rev().find(|&id| *id < until_id).cloned() { + // Splits `archive_ids` into two parts - [..until_id] and [until_id..] + // `archive_ids` will now contain [..until_id] + Some(until_id) => archive_ids.split_off(&until_id), + None => { + tracing::info!("archives GC: nothing to remove"); + return Ok(()); + } + }; + // so we must swap maps to retain [until_id..] 
and get ids to remove + let removed_ids = std::mem::replace(&mut *archive_ids, retained_ids); + + // Print removed range bounds and compute real `until_id` + let until_id = match (removed_ids.first(), removed_ids.last()) { + (Some(first), Some(last)) => { + let len = removed_ids.len(); + tracing::info!( + archive_count = len, + first, + last, + "archives GC: removing archives" + ); + + match archive_ids.first() { + Some(until_id) => *until_id, + None => *last + 1, + } + } + _ => { + tracing::info!("archives GC: nothing to remove"); + return Ok(()); + } + }; + + // Remove archives + let archives_cf = self.db.archives.cf(); + let write_options = self.db.archives.write_config(); + + self.db.raw().delete_range_cf_opt( + &archives_cf, + [0; 4], + until_id.to_be_bytes(), + write_options, + )?; + + tracing::info!("archives GC: done"); + Ok(()) + } + + fn add_data(&self, id: &ArchiveEntryId, data: &[u8]) -> Result<(), rocksdb::Error> + where + I: Borrow + Hash, + { + self.db.package_entries.insert(id.to_vec(), data) + } + + #[allow(dead_code)] + fn has_data(&self, id: &ArchiveEntryId) -> Result + where + I: Borrow + Hash, + { + self.db.package_entries.contains_key(id.to_vec()) + } + + async fn get_data(&self, handle: &BlockHandle, id: &ArchiveEntryId) -> Result> + where + I: Borrow + Hash, + { + let _lock = match &id { + ArchiveEntryId::Block(_) => handle.block_data_lock().read().await, + ArchiveEntryId::Proof(_) | ArchiveEntryId::ProofLink(_) => { + handle.proof_data_lock().read().await + } + }; + + match self.db.package_entries.get(id.to_vec())? 
{ + Some(a) => Ok(a.to_vec()), + None => Err(BlockStorageError::InvalidBlockData.into()), + } + } + + async fn get_data_ref<'a, I>( + &'a self, + handle: &'a BlockHandle, + id: &ArchiveEntryId, + ) -> Result + 'a> + where + I: Borrow + Hash, + { + let lock = match id { + ArchiveEntryId::Block(_) => handle.block_data_lock().read().await, + ArchiveEntryId::Proof(_) | ArchiveEntryId::ProofLink(_) => { + handle.proof_data_lock().read().await + } + }; + + match self.db.package_entries.get(id.to_vec())? { + Some(data) => Ok(BlockContentsLock { _lock: lock, data }), + None => Err(BlockStorageError::InvalidBlockData.into()), + } + } + + fn compute_archive_id(&self, handle: &BlockHandle) -> u32 { + let mc_seqno = handle.masterchain_ref_seqno(); + + if handle.meta().is_key_block() { + self.archive_ids.write().insert(mc_seqno); + return mc_seqno; + } + + let mut archive_id = mc_seqno - mc_seqno % ARCHIVE_SLICE_SIZE; + + let prev_id = { + let latest_archives = self.archive_ids.read(); + latest_archives.range(..=mc_seqno).next_back().cloned() + }; + + if let Some(prev_id) = prev_id { + if archive_id < prev_id { + archive_id = prev_id; + } + } + + if mc_seqno.saturating_sub(archive_id) >= ARCHIVE_PACKAGE_SIZE { + self.archive_ids.write().insert(mc_seqno); + archive_id = mc_seqno; + } + + archive_id + } + + fn make_archive_segment(&self, entry_id: &ArchiveEntryId) -> Result> + where + I: Borrow + Hash, + { + match self.db.package_entries.get(entry_id.to_vec())? 
{ + Some(data) => Ok(make_archive_entry(&entry_id.filename(), &data)), + None => Err(BlockStorageError::InvalidBlockData.into()), + } + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BlocksGcKind { + BeforePreviousKeyBlock, + BeforePreviousPersistentState, +} + +#[derive(Clone)] +pub enum BlockProofHandle { + Existing(Arc), + New(BlockMetaData), +} + +impl From> for BlockProofHandle { + fn from(handle: Arc) -> Self { + Self::Existing(handle) + } +} + +impl From for BlockProofHandle { + fn from(meta_data: BlockMetaData) -> Self { + Self::New(meta_data) + } +} + +pub struct StoreBlockResult { + pub handle: Arc, + pub updated: bool, + pub new: bool, +} + +fn remove_blocks( + db: Arc, + max_blocks_per_batch: Option, + top_blocks: &TopBlocks, +) -> Result { + let mut stats = BlockGcStats::default(); + + let raw = db.raw().as_ref(); + let package_entries_cf = db.package_entries.cf(); + let block_handles_cf = db.block_handles.cf(); + let key_blocks_cf = db.key_blocks.cf(); + + // Create batch + let mut batch = rocksdb::WriteBatch::default(); + let mut batch_len = 0; + + let package_entries_readopts = db.package_entries.new_read_config(); + let key_blocks_readopts = db.key_blocks.new_read_config(); + + // Iterate all entries and find expired items + let mut blocks_iter = raw.raw_iterator_cf_opt(&package_entries_cf, package_entries_readopts); + blocks_iter.seek_to_first(); + + loop { + let key = match blocks_iter.key() { + Some(key) => key, + None => break blocks_iter.status()?, + }; + + // Read only prefix with shard ident and seqno + let BlockIdShort { shard, seqno } = + BlockIdShort::deserialize(&mut std::convert::identity(key))?; + + // Don't gc latest blocks + if top_blocks.contains_shard_seqno(&shard, seqno) { + blocks_iter.next(); + continue; + } + + // Additionally check whether this item is a key block + if seqno == 0 + || shard.is_masterchain() + && raw + 
.get_pinned_cf_opt(&key_blocks_cf, seqno.to_be_bytes(), &key_blocks_readopts)? + .is_some() + { + // Don't remove key blocks + blocks_iter.next(); + continue; + } + + // Add item to the batch + batch.delete_cf(&package_entries_cf, key); + stats.total_package_entries_removed += 1; + if shard.is_masterchain() { + stats.mc_package_entries_removed += 1; + } + + // Key structure: + // [workchain id, 4 bytes] + // [shard id, 8 bytes] + // [seqno, 4 bytes] + // [root hash, 32 bytes] <- + // .. + if key.len() >= 48 { + batch.delete_cf(&block_handles_cf, &key[16..48]); + stats.total_handles_removed += 1; + } + + batch_len += 1; + if matches!( + max_blocks_per_batch, + Some(max_blocks_per_batch) if batch_len >= max_blocks_per_batch + ) { + tracing::info!( + total_package_entries_removed = stats.total_package_entries_removed, + "applying intermediate batch", + ); + let batch = std::mem::take(&mut batch); + raw.write(batch)?; + batch_len = 0; + } + + blocks_iter.next(); + } + + if batch_len > 0 { + tracing::info!("applying final batch"); + raw.write(batch)?; + } + + // Done + Ok(stats) +} + +#[derive(Debug, Copy, Clone, Default)] +pub struct BlockGcStats { + pub mc_package_entries_removed: usize, + pub total_package_entries_removed: usize, + pub total_handles_removed: usize, +} + +struct BlockContentsLock<'a> { + _lock: tokio::sync::RwLockReadGuard<'a, ()>, + data: rocksdb::DBPinnableSlice<'a>, +} + +impl<'a> AsRef<[u8]> for BlockContentsLock<'a> { + fn as_ref(&self) -> &[u8] { + self.data.as_ref() + } +} + +pub const ARCHIVE_PACKAGE_SIZE: u32 = 100; +pub const ARCHIVE_SLICE_SIZE: u32 = 20_000; + +#[derive(thiserror::Error, Debug)] +enum BlockStorageError { + #[error("Block data not found")] + BlockDataNotFound, + #[error("Block proof not found")] + BlockProofNotFound, + #[error("Block handle id mismatch")] + BlockHandleIdMismatch, + #[error("Invalid block data")] + InvalidBlockData, + #[error("Offset is outside of the archive slice")] + InvalidOffset, +} diff --git 
a/storage/src/store/block_connection/mod.rs b/storage/src/store/block_connection/mod.rs new file mode 100644 index 000000000..17d0a139c --- /dev/null +++ b/storage/src/store/block_connection/mod.rs @@ -0,0 +1,141 @@ +use std::sync::Arc; + +use anyhow::Result; +use everscale_types::models::*; + +use crate::db::*; +use crate::models::*; +use crate::util::*; + +/// Stores relations between blocks +pub struct BlockConnectionStorage { + db: Arc, +} + +impl BlockConnectionStorage { + pub fn new(db: Arc) -> Self { + Self { db } + } + + pub fn store_connection( + &self, + handle: &BlockHandle, + direction: BlockConnection, + connected_block_id: &BlockId, + ) -> Result<()> { + // Use strange match because all columns have different types + let store = match direction { + BlockConnection::Prev1 => { + if handle.meta().has_prev1() { + return Ok(()); + } + store_block_connection_impl(&self.db.prev1, handle, connected_block_id)?; + handle.meta().set_has_prev1() + } + BlockConnection::Prev2 => { + if handle.meta().has_prev2() { + return Ok(()); + } + store_block_connection_impl(&self.db.prev2, handle, connected_block_id)?; + handle.meta().set_has_prev2() + } + BlockConnection::Next1 => { + if handle.meta().has_next1() { + return Ok(()); + } + store_block_connection_impl(&self.db.next1, handle, connected_block_id)?; + handle.meta().set_has_next1() + } + BlockConnection::Next2 => { + if handle.meta().has_next2() { + return Ok(()); + } + store_block_connection_impl(&self.db.next2, handle, connected_block_id)?; + handle.meta().set_has_next2() + } + }; + + if store { + let id = handle.id(); + + if handle.is_key_block() { + let mut write_batch = weedb::rocksdb::WriteBatch::default(); + + write_batch.put_cf( + &self.db.block_handles.cf(), + id.root_hash.as_slice(), + handle.meta().to_vec(), + ); + write_batch.put_cf( + &self.db.key_blocks.cf(), + id.seqno.to_be_bytes(), + id.to_vec(), + ); + + self.db.raw().write(write_batch)?; + } else { + self.db + .block_handles + 
.insert(id.root_hash.as_slice(), handle.meta().to_vec())?; + } + } + + Ok(()) + } + + pub fn load_connection( + &self, + block_id: &BlockId, + direction: BlockConnection, + ) -> Result { + match direction { + BlockConnection::Prev1 => load_block_connection_impl(&self.db.prev1, block_id), + BlockConnection::Prev2 => load_block_connection_impl(&self.db.prev2, block_id), + BlockConnection::Next1 => load_block_connection_impl(&self.db.next1, block_id), + BlockConnection::Next2 => load_block_connection_impl(&self.db.next2, block_id), + } + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum BlockConnection { + Prev1, + Prev2, + Next1, + Next2, +} + +#[inline] +fn store_block_connection_impl( + db: &Table, + handle: &BlockHandle, + block_id: &BlockId, +) -> Result<(), weedb::rocksdb::Error> +where + T: ColumnFamily, +{ + db.insert( + handle.id().root_hash.as_slice(), + write_block_id_le(block_id), + ) +} + +#[inline] +fn load_block_connection_impl(db: &Table, block_id: &BlockId) -> Result +where + T: ColumnFamily, +{ + match db.get(block_id.root_hash.as_slice())? 
{ + Some(value) => read_block_id_le(value.as_ref()) + .ok_or_else(|| BlockConnectionStorageError::InvalidBlockId.into()), + None => Err(BlockConnectionStorageError::NotFound.into()), + } +} + +#[derive(Debug, thiserror::Error)] +enum BlockConnectionStorageError { + #[error("Invalid connection block id")] + InvalidBlockId, + #[error("Block connection not found")] + NotFound, +} diff --git a/storage/src/store/block_handle/mod.rs b/storage/src/store/block_handle/mod.rs new file mode 100644 index 000000000..15341fed9 --- /dev/null +++ b/storage/src/store/block_handle/mod.rs @@ -0,0 +1,312 @@ +use std::sync::{Arc, Weak}; + +use anyhow::Result; +use everscale_types::models::BlockId; +use tycho_block_util::block::TopBlocks; +use tycho_block_util::state::is_persistent_state; +use tycho_util::FastDashMap; + +use crate::db::*; +use crate::models::*; +use crate::util::*; + +pub struct BlockHandleStorage { + db: Arc, + cache: Arc>>, +} + +impl BlockHandleStorage { + pub fn new(db: Arc) -> Self { + Self { + db, + cache: Arc::new(Default::default()), + } + } + + pub fn store_block_applied(&self, handle: &Arc) -> Result { + if handle.meta().set_is_applied() { + self.store_handle(handle)?; + Ok(true) + } else { + Ok(false) + } + } + + pub fn assign_mc_ref_seqno(&self, handle: &Arc, mc_ref_seqno: u32) -> Result<()> { + if handle.set_masterchain_ref_seqno(mc_ref_seqno)? { + self.store_handle(handle)?; + } + Ok(()) + } + + pub fn create_or_load_handle( + &self, + block_id: &BlockId, + meta_data: BlockMetaData, + ) -> Result<(Arc, HandleCreationStatus)> { + if let Some(handle) = self.load_handle(block_id)? { + return Ok((handle, HandleCreationStatus::Fetched)); + } + + if let Some(handle) = self.create_handle(*block_id, BlockMeta::with_data(meta_data))? { + return Ok((handle, HandleCreationStatus::Created)); + } + + if let Some(handle) = self.load_handle(block_id)? 
{ + return Ok((handle, HandleCreationStatus::Fetched)); + } + + Err(BlockHandleStorageError::FailedToCreateBlockHandle.into()) + } + + pub fn load_handle(&self, block_id: &BlockId) -> Result>> { + Ok(loop { + if let Some(weak) = self.cache.get(block_id) { + if let Some(handle) = weak.upgrade() { + break Some(handle); + } + } + + if let Some(meta) = self.db.block_handles.get(block_id.root_hash.as_slice())? { + let meta = BlockMeta::from_slice(meta.as_ref())?; + if let Some(handle) = self.create_handle(*block_id, meta)? { + break Some(handle); + } + } else { + break None; + } + }) + } + + pub fn store_handle(&self, handle: &BlockHandle) -> Result<()> { + let id = handle.id(); + + self.db + .block_handles + .insert(id.root_hash.as_slice(), handle.meta().to_vec())?; + + if handle.is_key_block() { + self.db + .key_blocks + .insert(id.seqno.to_be_bytes(), id.to_vec())?; + } + + Ok(()) + } + + pub fn load_key_block_handle(&self, seqno: u32) -> Result> { + let key_block_id = self + .db + .key_blocks + .get(seqno.to_be_bytes())? + .map(|value| BlockId::from_slice(value.as_ref())) + .transpose()? + .ok_or(BlockHandleStorageError::KeyBlockNotFound)?; + + self.load_handle(&key_block_id)?.ok_or_else(|| { + BlockHandleStorageError::KeyBlockHandleNotFound(key_block_id.seqno).into() + }) + } + + pub fn find_last_key_block(&self) -> Result> { + let mut iter = self.db.key_blocks.raw_iterator(); + iter.seek_to_last(); + + // Load key block from current iterator value + let key_block_id = iter + .value() + .map(BlockId::from_slice) + .transpose()? 
+ .ok_or(BlockHandleStorageError::KeyBlockNotFound)?; + + self.load_handle(&key_block_id)?.ok_or_else(|| { + BlockHandleStorageError::KeyBlockHandleNotFound(key_block_id.seqno).into() + }) + } + + pub fn find_prev_key_block(&self, seqno: u32) -> Result>> { + if seqno == 0 { + return Ok(None); + } + + // Create iterator and move it to the previous key block before the specified + let mut iter = self.db.key_blocks.raw_iterator(); + iter.seek_for_prev((seqno - 1u32).to_be_bytes()); + + // Load key block from current iterator value + iter.value() + .map(BlockId::from_slice) + .transpose()? + .map(|key_block_id| { + self.load_handle(&key_block_id)?.ok_or_else(|| { + BlockHandleStorageError::KeyBlockHandleNotFound(key_block_id.seqno).into() + }) + }) + .transpose() + } + + pub fn find_prev_persistent_key_block(&self, seqno: u32) -> Result>> { + if seqno == 0 { + return Ok(None); + } + + // Create iterator and move it to the previous key block before the specified + let mut iter = self.db.key_blocks.raw_iterator(); + iter.seek_for_prev((seqno - 1u32).to_be_bytes()); + + // Loads key block from current iterator value and moves it backward + let mut get_key_block = move || -> Result>> { + // Load key block id + let key_block_id = match iter.value().map(BlockId::from_slice).transpose()? { + Some(prev_key_block) => prev_key_block, + None => return Ok(None), + }; + + // Load block handle for this id + let handle = self.load_handle(&key_block_id)?.ok_or( + BlockHandleStorageError::KeyBlockHandleNotFound(key_block_id.seqno), + )?; + + // Move iterator backward + iter.prev(); + + // Done + Ok(Some(handle)) + }; + + // Load previous key block + let mut key_block = match get_key_block()? { + Some(id) => id, + None => return Ok(None), + }; + + // Load previous key blocks and check if the `key_block` is for persistent state + while let Some(prev_key_block) = get_key_block()? 
{ + if is_persistent_state( + key_block.meta().gen_utime(), + prev_key_block.meta().gen_utime(), + ) { + // Found + return Ok(Some(key_block)); + } + key_block = prev_key_block; + } + + // Not found + Ok(None) + } + + pub fn key_blocks_iterator( + &self, + direction: KeyBlocksDirection, + ) -> impl Iterator> + '_ { + let mut raw_iterator = self.db.key_blocks.raw_iterator(); + let reverse = match direction { + KeyBlocksDirection::ForwardFrom(seqno) => { + raw_iterator.seek(seqno.to_be_bytes()); + false + } + KeyBlocksDirection::Backward => { + raw_iterator.seek_to_last(); + true + } + }; + + KeyBlocksIterator { + raw_iterator, + reverse, + } + } + + pub fn gc_handles_cache(&self, top_blocks: &TopBlocks) -> usize { + let mut total_removed = 0; + + self.cache.retain(|block_id, value| { + let value = match value.upgrade() { + Some(value) => value, + None => { + total_removed += 1; + return false; + } + }; + + if block_id.seqno == 0 + || block_id.is_masterchain() && value.is_key_block() + || top_blocks.contains(block_id) + { + // Keep zero state, key blocks and latest blocks + true + } else { + // Remove all outdated + total_removed += 1; + value.meta().clear_data_and_proof(); + false + } + }); + + total_removed + } + + fn create_handle( + &self, + block_id: BlockId, + meta: BlockMeta, + ) -> Result>> { + use dashmap::mapref::entry::Entry; + + let handle = match self.cache.entry(block_id) { + Entry::Vacant(entry) => { + let handle = Arc::new(BlockHandle::with_values(block_id, meta, self.cache.clone())); + entry.insert(Arc::downgrade(&handle)); + handle + } + Entry::Occupied(_) => return Ok(None), + }; + + self.store_handle(&handle)?; + + Ok(Some(handle)) + } +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum HandleCreationStatus { + Created, + Fetched, +} + +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum KeyBlocksDirection { + ForwardFrom(u32), + Backward, +} + +struct KeyBlocksIterator<'a> { + raw_iterator: weedb::rocksdb::DBRawIterator<'a>, + 
reverse: bool, +} + +impl Iterator for KeyBlocksIterator<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + let value = self.raw_iterator.value().map(BlockId::from_slice)?; + if self.reverse { + self.raw_iterator.prev(); + } else { + self.raw_iterator.next(); + } + + Some(value) + } +} + +#[derive(thiserror::Error, Debug)] +enum BlockHandleStorageError { + #[error("Failed to create block handle")] + FailedToCreateBlockHandle, + #[error("Key block not found")] + KeyBlockNotFound, + #[error("Key block handle not found: {}", .0)] + KeyBlockHandleNotFound(u32), +} diff --git a/storage/src/store/mod.rs b/storage/src/store/mod.rs new file mode 100644 index 000000000..fdfacd2c0 --- /dev/null +++ b/storage/src/store/mod.rs @@ -0,0 +1,15 @@ +pub use self::block::*; +pub use self::block_connection::*; +pub use self::block_handle::*; +pub use self::node_state::*; +pub use self::persistent_state::*; +pub use self::runtime::*; +pub use self::shard_state::*; + +mod block; +mod block_connection; +mod block_handle; +mod node_state; +mod persistent_state; +mod runtime; +mod shard_state; diff --git a/storage/src/store/node_state/mod.rs b/storage/src/store/node_state/mod.rs new file mode 100644 index 000000000..d8e3f4fcb --- /dev/null +++ b/storage/src/store/node_state/mod.rs @@ -0,0 +1,137 @@ +use std::sync::Arc; + +use anyhow::Result; +use everscale_types::models::*; +use parking_lot::Mutex; + +use crate::db::*; +use crate::util::*; + +pub struct NodeStateStorage { + db: Arc, + last_mc_block_id: BlockIdCache, + init_mc_block_id: BlockIdCache, + shards_client_mc_block_id: BlockIdCache, +} + +impl NodeStateStorage { + pub fn new(db: Arc) -> Self { + Self { + db, + last_mc_block_id: (Default::default(), LAST_MC_BLOCK_ID), + init_mc_block_id: (Default::default(), INIT_MC_BLOCK_ID), + shards_client_mc_block_id: (Default::default(), SHARDS_CLIENT_MC_BLOCK_ID), + } + } + + pub fn store_historical_sync_start(&self, id: &BlockId) -> Result<()> { + let node_states = 
&self.db.node_states; + node_states.insert(HISTORICAL_SYNC_LOW, id.to_vec())?; + Ok(()) + } + + pub fn load_historical_sync_start(&self) -> Result> { + Ok(match self.db.node_states.get(HISTORICAL_SYNC_LOW)? { + Some(data) => Some(BlockId::from_slice(data.as_ref())?), + None => None, + }) + } + + pub fn store_historical_sync_end(&self, id: &BlockId) -> Result<()> { + let node_states = &self.db.node_states; + node_states.insert(HISTORICAL_SYNC_HIGH, id.to_vec())?; + Ok(()) + } + + pub fn load_historical_sync_end(&self) -> Result { + let node_states = &self.db.node_states; + let data = node_states + .get(HISTORICAL_SYNC_HIGH)? + .ok_or(NodeStateStorageError::HighBlockNotFound)?; + BlockId::from_slice(data.as_ref()) + } + + #[allow(unused)] + pub fn store_last_uploaded_archive(&self, archive_id: u32) -> Result<()> { + let node_states = &self.db.node_states; + node_states.insert(LAST_UPLOADED_ARCHIVE, archive_id.to_le_bytes())?; + Ok(()) + } + + #[allow(unused)] + pub fn load_last_uploaded_archive(&self) -> Result> { + Ok(match self.db.node_states.get(LAST_UPLOADED_ARCHIVE)? 
{ + Some(data) if data.len() >= 4 => { + Some(u32::from_le_bytes(data[..4].try_into().unwrap())) + } + _ => None, + }) + } + + pub fn store_last_mc_block_id(&self, id: &BlockId) -> Result<()> { + self.store_block_id(&self.last_mc_block_id, id) + } + + pub fn load_last_mc_block_id(&self) -> Result { + self.load_block_id(&self.last_mc_block_id) + } + + pub fn store_init_mc_block_id(&self, id: &BlockId) -> Result<()> { + self.store_block_id(&self.init_mc_block_id, id) + } + + pub fn load_init_mc_block_id(&self) -> Result { + self.load_block_id(&self.init_mc_block_id) + } + + pub fn store_shards_client_mc_block_id(&self, id: &BlockId) -> Result<()> { + self.store_block_id(&self.shards_client_mc_block_id, id) + } + + pub fn load_shards_client_mc_block_id(&self) -> Result { + self.load_block_id(&self.shards_client_mc_block_id) + } + + #[inline(always)] + fn store_block_id(&self, (cache, key): &BlockIdCache, block_id: &BlockId) -> Result<()> { + let node_states = &self.db.node_states; + node_states.insert(key, write_block_id_le(block_id))?; + *cache.lock() = Some(*block_id); + Ok(()) + } + + #[inline(always)] + fn load_block_id(&self, (cache, key): &BlockIdCache) -> Result { + if let Some(cached) = &*cache.lock() { + return Ok(*cached); + } + + let value = match self.db.node_states.get(key)? 
{ + Some(data) => read_block_id_le(&data).ok_or(NodeStateStorageError::InvalidBlockId)?, + None => return Err(NodeStateStorageError::ParamNotFound.into()), + }; + *cache.lock() = Some(value); + Ok(value) + } +} + +#[derive(thiserror::Error, Debug)] +pub enum NodeStateStorageError { + #[error("High block not found")] + HighBlockNotFound, + #[error("Not found")] + ParamNotFound, + #[error("Invalid block id")] + InvalidBlockId, +} + +type BlockIdCache = (Mutex>, &'static [u8]); + +const HISTORICAL_SYNC_LOW: &[u8] = b"background_sync_low"; +const HISTORICAL_SYNC_HIGH: &[u8] = b"background_sync_high"; + +const LAST_UPLOADED_ARCHIVE: &[u8] = b"last_uploaded_archive"; + +const LAST_MC_BLOCK_ID: &[u8] = b"LastMcBlockId"; +const INIT_MC_BLOCK_ID: &[u8] = b"InitMcBlockId"; +const SHARDS_CLIENT_MC_BLOCK_ID: &[u8] = b"ShardsClientMcBlockId"; diff --git a/storage/src/store/persistent_state/cell_writer.rs b/storage/src/store/persistent_state/cell_writer.rs new file mode 100644 index 000000000..bd3f6f3dd --- /dev/null +++ b/storage/src/store/persistent_state/cell_writer.rs @@ -0,0 +1,401 @@ +use std::collections::hash_map; +use std::io::{Read, Seek, SeekFrom, Write}; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use everscale_types::cell::{CellDescriptor, HashBytes}; +use smallvec::SmallVec; +use tycho_util::FastHashMap; + +use crate::db::{Db, FileDb, TempFile}; + +pub struct CellWriter<'a> { + db: &'a Db, + states_dir: &'a FileDb, + block_root_hash: &'a HashBytes, +} + +impl<'a> CellWriter<'a> { + #[allow(unused)] + pub fn new(db: &'a Db, states_dir: &'a FileDb, block_root_hash: &'a HashBytes) -> Self { + Self { + db, + states_dir, + block_root_hash, + } + } + + #[allow(unused)] + pub fn write(&self, root_hash: &[u8; 32], is_cancelled: Option>) -> Result<()> { + // Load cells from db in reverse order into the temp file + tracing::info!("started loading cells"); + let mut intermediate = self + 
.write_rev(root_hash, &is_cancelled) + .context("Failed to write reversed cells data")?; + tracing::info!("finished loading cells"); + let cell_count = intermediate.cell_sizes.len() as u32; + + // Compute offset type size (usually 4 bytes) + let offset_size = + std::cmp::min(number_of_bytes_to_fit(intermediate.total_size), 8) as usize; + + // Compute file size + let file_size = + 22 + offset_size * (1 + cell_count as usize) + (intermediate.total_size as usize); + + // Create states file + let mut file = self + .states_dir + .file(self.file_name()) + .create(true) + .write(true) + .truncate(true) + .prealloc(file_size) + .open()?; + + // Write cells data in BOC format + let mut buffer = std::io::BufWriter::with_capacity(FILE_BUFFER_LEN / 2, file); + + // Header | current len: 0 + let flags = 0b1000_0000u8 | (REF_SIZE as u8); + buffer.write_all(&[0xb5, 0xee, 0x9c, 0x72, flags, offset_size as u8])?; + + // Unique cell count | current len: 6 + buffer.write_all(&cell_count.to_be_bytes())?; + + // Root count | current len: 10 + buffer.write_all(&1u32.to_be_bytes())?; + + // Absent cell count | current len: 14 + buffer.write_all(&[0, 0, 0, 0])?; + + // Total cell size | current len: 18 + buffer.write_all(&intermediate.total_size.to_be_bytes()[(8 - offset_size)..8])?; + + // Root index | current len: 18 + offset_size + buffer.write_all(&[0, 0, 0, 0])?; + + // Cells index | current len: 22 + offset_size + tracing::info!("started building index"); + { + let mut next_offset = 0; + for &cell_size in intermediate.cell_sizes.iter().rev() { + next_offset += cell_size as u64; + buffer.write_all(&next_offset.to_be_bytes()[(8 - offset_size)..8])?; + } + } + tracing::info!("finished building index"); + + // Cells | current len: 22 + offset_size * (1 + cell_sizes.len()) + let mut cell_buffer = [0; 2 + 128 + 4 * REF_SIZE]; + for (i, &cell_size) in intermediate.cell_sizes.iter().rev().enumerate() { + if let Some(is_cancelled) = is_cancelled.as_ref() { + if i % 1000 == 0 && 
is_cancelled.load(Ordering::Relaxed) { + anyhow::bail!("Cell writing cancelled.") + } + } + + intermediate.total_size -= cell_size as u64; + intermediate + .file + .seek(SeekFrom::Start(intermediate.total_size))?; + intermediate + .file + .read_exact(&mut cell_buffer[..cell_size as usize])?; + + let descriptor = CellDescriptor { + d1: cell_buffer[0], + d2: cell_buffer[1], + }; + + let ref_offset = 2 + descriptor.byte_len() as usize; + for r in 0..descriptor.reference_count() as usize { + let ref_offset = ref_offset + r * REF_SIZE; + let slice = &mut cell_buffer[ref_offset..ref_offset + REF_SIZE]; + + let index = u32::from_be_bytes(slice.try_into().unwrap()); + slice.copy_from_slice(&(cell_count - index - 1).to_be_bytes()); + } + + buffer.write_all(&cell_buffer[..cell_size as usize])?; + } + + buffer.flush()?; + + Ok(()) + } + + pub fn remove(&self) -> Result<()> { + let file_name = self.file_name(); + self.states_dir.remove_file(&file_name).context(format!( + "Failed to remove persistent state file {}", + self.states_dir.path().join(file_name).display() + )) + } + + fn write_rev( + &self, + root_hash: &[u8; 32], + is_cancelled: &Option>, + ) -> Result { + enum StackItem { + New([u8; 32]), + Loaded(LoadedCell), + } + + struct LoadedCell { + hash: [u8; 32], + descriptor: CellDescriptor, + data: SmallVec<[u8; 128]>, + indices: SmallVec<[u32; 4]>, + } + + let mut file = self + .states_dir + .file(self.file_name().with_extension("temp")) + .create(true) + .write(true) + .read(true) + .truncate(true) + .open_as_temp()?; + + let raw = self.db.raw().as_ref(); + let read_options = self.db.cells.read_config(); + let cf = self.db.cells.cf(); + + let mut references_buffer = SmallVec::<[[u8; 32]; 4]>::with_capacity(4); + + let mut indices = FastHashMap::default(); + let mut remap = FastHashMap::default(); + let mut cell_sizes = Vec::::with_capacity(FILE_BUFFER_LEN); + let mut stack = Vec::with_capacity(32); + + let mut total_size = 0u64; + let mut iteration = 0u32; + let mut 
remap_index = 0u32; + + stack.push((iteration, StackItem::New(*root_hash))); + indices.insert(*root_hash, (iteration, false)); + + let mut temp_file_buffer = std::io::BufWriter::with_capacity(FILE_BUFFER_LEN, &mut *file); + + while let Some((index, data)) = stack.pop() { + if let Some(is_cancelled) = is_cancelled { + if iteration % 1000 == 0 && is_cancelled.load(Ordering::Relaxed) { + anyhow::bail!("Persistent state writing cancelled.") + } + } + + match data { + StackItem::New(hash) => { + let value = raw + .get_pinned_cf_opt(&cf, hash, read_options)? + .ok_or(CellWriterError::CellNotFound)?; + + let value = match crate::refcount::strip_refcount(value.as_ref()) { + Some(bytes) => bytes, + None => { + return Err(CellWriterError::CellNotFound.into()); + } + }; + if value.is_empty() { + return Err(CellWriterError::InvalidCell.into()); + } + + let (descriptor, data) = deserialize_cell(value, &mut references_buffer) + .ok_or(CellWriterError::InvalidCell)?; + + let mut reference_indices = SmallVec::with_capacity(references_buffer.len()); + + let mut indices_buffer = [0; 4]; + let mut keys = [std::ptr::null(); 4]; + let mut preload_count = 0; + + for hash in &references_buffer { + let index = match indices.entry(*hash) { + hash_map::Entry::Vacant(entry) => { + remap_index += 1; + + entry.insert((remap_index, false)); + + indices_buffer[preload_count] = remap_index; + keys[preload_count] = hash.as_ptr(); + preload_count += 1; + + remap_index + } + hash_map::Entry::Occupied(entry) => { + let (remap_index, written) = *entry.get(); + if !written { + indices_buffer[preload_count] = remap_index; + keys[preload_count] = hash.as_ptr(); + preload_count += 1; + } + remap_index + } + }; + + reference_indices.push(index); + } + + stack.push(( + index, + StackItem::Loaded(LoadedCell { + hash, + descriptor, + data: SmallVec::from_slice(data), + indices: reference_indices, + }), + )); + + if preload_count > 0 { + indices_buffer[..preload_count].reverse(); + 
keys[..preload_count].reverse(); + + for i in 0..preload_count { + let index = indices_buffer[i]; + let hash = unsafe { *keys[i].cast::<[u8; 32]>() }; + stack.push((index, StackItem::New(hash))); + } + } + + references_buffer.clear(); + } + StackItem::Loaded(loaded) => { + match remap.entry(index) { + hash_map::Entry::Vacant(entry) => { + entry.insert(iteration.to_be_bytes()); + } + hash_map::Entry::Occupied(_) => continue, + }; + + if let Some((_, written)) = indices.get_mut(&loaded.hash) { + *written = true; + } + + iteration += 1; + if iteration % 100000 == 0 { + tracing::info!(iteration); + } + + let cell_size = 2 + loaded.data.len() + loaded.indices.len() * REF_SIZE; + cell_sizes.push(cell_size as u8); + total_size += cell_size as u64; + + temp_file_buffer.write_all(&[loaded.descriptor.d1, loaded.descriptor.d2])?; + temp_file_buffer.write_all(&loaded.data)?; + for index in loaded.indices { + let index = remap.get(&index).with_context(|| { + format!("Child not found. Iteration {iteration}. 
Child {index}") + })?; + temp_file_buffer.write_all(index)?; + } + } + } + } + + drop(temp_file_buffer); + + file.flush()?; + + Ok(IntermediateState { + file, + cell_sizes, + total_size, + }) + } + + fn file_name(&self) -> PathBuf { + PathBuf::from(self.block_root_hash.to_string()) + } +} + +struct IntermediateState { + file: TempFile, + cell_sizes: Vec, + total_size: u64, +} + +fn deserialize_cell<'a>( + value: &'a [u8], + references_buffer: &mut SmallVec<[[u8; 32]; 4]>, +) -> Option<(CellDescriptor, &'a [u8])> { + let mut index = Index { + value_len: value.len(), + offset: 0, + }; + + index.require(4)?; + let mut descriptor = CellDescriptor::new([value[*index], value[*index + 1]]); + descriptor.d1 &= !CellDescriptor::STORE_HASHES_MASK; + + index.advance(2); + let bit_length = u16::from_le_bytes([value[*index], value[*index + 1]]); + index.advance(2); + + let data_len = descriptor.byte_len() as usize; + index.require(data_len)?; + let data = &value[*index..*index + data_len]; + index.advance(data_len); + + assert_eq!((bit_length as usize + 7) / 8, data_len); + + index.advance((32 + 2) * descriptor.hash_count() as usize); + + for _ in 0..descriptor.reference_count() { + index.require(32)?; + let mut hash = [0; 32]; + hash.copy_from_slice(&value[*index..*index + 32]); + references_buffer.push(hash); + index.advance(32); + } + + Some((descriptor, data)) +} + +fn number_of_bytes_to_fit(l: u64) -> u32 { + 8 - l.leading_zeros() / 8 +} + +struct Index { + value_len: usize, + offset: usize, +} + +impl Index { + #[inline(always)] + fn require(&self, len: usize) -> Option<()> { + if self.offset + len <= self.value_len { + Some(()) + } else { + None + } + } + + #[inline(always)] + fn advance(&mut self, bytes: usize) { + self.offset += bytes; + } +} + +impl std::ops::Deref for Index { + type Target = usize; + + #[inline(always)] + fn deref(&self) -> &Self::Target { + &self.offset + } +} + +const REF_SIZE: usize = std::mem::size_of::(); +const FILE_BUFFER_LEN: usize = 128 * 
1024 * 1024; // 128 MB + +#[derive(thiserror::Error, Debug)] +enum CellWriterError { + #[error("Cell not found in cell db")] + CellNotFound, + #[error("Invalid cell")] + InvalidCell, +} diff --git a/storage/src/store/persistent_state/mod.rs b/storage/src/store/persistent_state/mod.rs new file mode 100644 index 000000000..da6858b2c --- /dev/null +++ b/storage/src/store/persistent_state/mod.rs @@ -0,0 +1,239 @@ +use std::io::{BufReader, Read, Seek, SeekFrom}; +use std::path::PathBuf; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use anyhow::Result; +use bytes::{Bytes, BytesMut}; +use everscale_types::cell::HashBytes; +use everscale_types::models::BlockId; +use tokio::time::Instant; + +use crate::db::Db; +use crate::store::BlockHandleStorage; +use crate::FileDb; + +mod cell_writer; + +const KEY_BLOCK_UTIME_STEP: u32 = 86400; +const BASE_DIR: &str = "states"; + +pub struct PersistentStateStorage { + db: Arc, + storage_dir: FileDb, + block_handle_storage: Arc, + is_cancelled: Arc, +} + +impl PersistentStateStorage { + pub fn new( + db: Arc, + files_dir: &FileDb, + block_handle_storage: Arc, + ) -> Result { + let storage_dir = files_dir.subdir(BASE_DIR); + storage_dir.ensure_exists()?; + + let is_cancelled = Arc::new(AtomicBool::new(false)); + + Ok(Self { + db, + storage_dir, + block_handle_storage, + is_cancelled, + }) + } + + pub async fn save_state( + &self, + mc_block_id: &BlockId, + block_id: &BlockId, + root_hash: &HashBytes, + ) -> Result<()> { + let block_id = *block_id; + let root_hash = *root_hash; + let is_cancelled = Some(self.is_cancelled.clone()); + + let db = self.db.clone(); + let states_dir = self.prepare_persistent_states_dir(mc_block_id)?; + + tokio::task::spawn_blocking(move || { + let cell_writer = cell_writer::CellWriter::new(&db, &states_dir, &block_id.root_hash); + match cell_writer.write(&root_hash.0, is_cancelled) { + Ok(()) => { + tracing::info!( + block_id = %block_id, + "successfully wrote persistent state to a 
file", + ); + } + Err(e) => { + tracing::error!( + block_id = %block_id, + "writing persistent state failed: {e:?}" + ); + + if let Err(e) = cell_writer.remove() { + tracing::error!(%block_id, "{e}"); + } + } + } + }) + .await + .map_err(From::from) + } + + pub async fn read_state_part( + &self, + mc_block_id: &BlockId, + block_id: &BlockId, + offset: u64, + size: u64, + ) -> Option { + let path = self + .mc_states_dir(mc_block_id) + .join(block_id.root_hash.to_string()); + + tokio::task::spawn_blocking(move || { + // TODO: cache file handles + let mut file = std::fs::OpenOptions::new().read(true).open(path).ok()?; + + if let Err(e) = file.seek(SeekFrom::Start(offset)) { + tracing::error!("failed to seek state file offset: {e:?}"); + return None; + } + + let mut buf_reader = BufReader::new(file); + + let mut result = BytesMut::zeroed(size as usize); + let mut result_cursor = 0; + + let now = Instant::now(); + loop { + match buf_reader.read(&mut result[result_cursor..]) { + Ok(bytes_read) => { + tracing::info!("Reading state file. Bytes read: {}", bytes_read); + if bytes_read == 0 || bytes_read == size as usize { + break; + } + result_cursor += bytes_read; + } + Err(e) => { + tracing::error!("Failed to read state file. 
Err: {e:?}"); + return None; + } + } + } + tracing::info!( + "Finished reading buffer after: {} ms", + now.elapsed().as_millis() + ); + + Some(result.freeze()) + }) + .await + .ok() + .flatten() + } + + pub fn state_exists(&self, mc_block_id: &BlockId, block_id: &BlockId) -> bool { + // TODO: cache file handles + self.mc_states_dir(mc_block_id) + .join(block_id.root_hash.to_string()) + .is_file() + } + + pub fn prepare_persistent_states_dir(&self, mc_block: &BlockId) -> Result { + let states_dir = self.storage_dir.subdir(mc_block.seqno.to_string()); + if !states_dir.path().is_dir() { + tracing::info!(mc_block = %mc_block, "creating persistent state directory"); + states_dir.ensure_exists()?; + } + Ok(states_dir) + } + + fn mc_states_dir(&self, mc_block_id: &BlockId) -> PathBuf { + self.storage_dir.path().join(mc_block_id.seqno.to_string()) + } + + pub fn cancel(&self) { + self.is_cancelled.store(true, Ordering::Release); + } + + pub async fn clear_old_persistent_states(&self) -> Result<()> { + tracing::info!("started clearing old persistent state directories"); + let start = Instant::now(); + + // Keep 2 days of states + 1 state before + let block = { + let now = tycho_util::time::now_sec(); + let mut key_block = self.block_handle_storage.find_last_key_block()?; + + loop { + match self + .block_handle_storage + .find_prev_persistent_key_block(key_block.id().seqno)? 
+ { + Some(prev_key_block) => { + if prev_key_block.meta().gen_utime() + 2 * KEY_BLOCK_UTIME_STEP < now { + break prev_key_block; + } else { + key_block = prev_key_block; + } + } + None => return Ok(()), + } + } + }; + + self.clear_outdated_state_entries(block.id())?; + + tracing::info!( + elapsed = %humantime::format_duration(start.elapsed()), + "clearing old persistent state directories completed" + ); + + Ok(()) + } + + fn clear_outdated_state_entries(&self, recent_block_id: &BlockId) -> Result<()> { + let mut directories_to_remove: Vec = Vec::new(); + let mut files_to_remove: Vec = Vec::new(); + + for entry in self.storage_dir.entries()?.flatten() { + let path = entry.path(); + + if path.is_file() { + files_to_remove.push(path); + continue; + } + + let Ok(name) = entry.file_name().into_string() else { + directories_to_remove.push(path); + continue; + }; + + let is_recent = + matches!(name.parse::(), Ok(seqno) if seqno >= recent_block_id.seqno); + + if !is_recent { + directories_to_remove.push(path); + } + } + + for dir in directories_to_remove { + tracing::info!(dir = %dir.display(), "removing an old persistent state directory"); + if let Err(e) = std::fs::remove_dir_all(&dir) { + tracing::error!(dir = %dir.display(), "failed to remove an old persistent state: {e:?}"); + } + } + + for file in files_to_remove { + tracing::info!(file = %file.display(), "removing file"); + if let Err(e) = std::fs::remove_file(&file) { + tracing::error!(file = %file.display(), "failed to remove file: {e:?}"); + } + } + + Ok(()) + } +} diff --git a/storage/src/store/runtime/mod.rs b/storage/src/store/runtime/mod.rs new file mode 100644 index 000000000..54d9b89ad --- /dev/null +++ b/storage/src/store/runtime/mod.rs @@ -0,0 +1,24 @@ +use std::sync::Arc; + +pub use self::persistent_state_keeper::PersistentStateKeeper; + +use super::BlockHandleStorage; + +mod persistent_state_keeper; + +pub struct RuntimeStorage { + persistent_state_keeper: PersistentStateKeeper, +} + +impl 
RuntimeStorage { + pub fn new(block_handle_storage: Arc) -> Self { + Self { + persistent_state_keeper: PersistentStateKeeper::new(block_handle_storage), + } + } + + #[inline(always)] + pub fn persistent_state_keeper(&self) -> &PersistentStateKeeper { + &self.persistent_state_keeper + } +} diff --git a/storage/src/store/runtime/persistent_state_keeper.rs b/storage/src/store/runtime/persistent_state_keeper.rs new file mode 100644 index 000000000..453c8b0ac --- /dev/null +++ b/storage/src/store/runtime/persistent_state_keeper.rs @@ -0,0 +1,91 @@ +use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; +use std::sync::Arc; + +use anyhow::Result; +use arc_swap::ArcSwapOption; +use tokio::sync::Notify; + +use tycho_block_util::state::*; + +use crate::models::{BlockHandle, BriefBlockMeta}; +use crate::BlockHandleStorage; + +pub struct PersistentStateKeeper { + block_handle_storage: Arc, + initialized: AtomicBool, + persistent_state_changed: Notify, + current_persistent_state: ArcSwapOption, + last_utime: AtomicU32, +} + +impl PersistentStateKeeper { + pub fn new(block_handle_storage: Arc) -> Self { + Self { + block_handle_storage, + initialized: Default::default(), + persistent_state_changed: Default::default(), + current_persistent_state: Default::default(), + last_utime: Default::default(), + } + } + + pub fn update(&self, block_handle: &Arc) -> Result<()> { + println!("UPDATE"); + + if !self.initialized.load(Ordering::Acquire) { + let prev_persistent_key_block = self + .block_handle_storage + .find_prev_persistent_key_block(block_handle.id().seqno)?; + + if let Some(handle) = &prev_persistent_key_block { + self.last_utime + .store(handle.meta().gen_utime(), Ordering::Release); + } + self.current_persistent_state + .store(prev_persistent_key_block); + + self.initialized.store(true, Ordering::Release); + + self.persistent_state_changed.notify_waiters(); + } + + if !block_handle.is_key_block() { + return Ok(()); + } + + let block_utime = block_handle.meta().gen_utime(); 
+ let prev_utime = self.last_utime(); + + if prev_utime > block_utime { + return Ok(()); + } + + if is_persistent_state(block_utime, prev_utime) { + self.last_utime.store(block_utime, Ordering::Release); + self.current_persistent_state + .store(Some(block_handle.clone())); + self.persistent_state_changed.notify_waiters(); + } + + Ok(()) + } + + pub fn last_utime(&self) -> u32 { + self.last_utime.load(Ordering::Acquire) + } + + pub fn current(&self) -> Option> { + self.current_persistent_state.load_full() + } + + pub fn current_meta(&self) -> Option<(u32, BriefBlockMeta)> { + self.current_persistent_state + .load() + .as_ref() + .map(|handle| (handle.id().seqno, handle.meta().brief())) + } + + pub fn new_state_found(&self) -> tokio::sync::futures::Notified<'_> { + self.persistent_state_changed.notified() + } +} diff --git a/storage/src/store/shard_state/cell_storage.rs b/storage/src/store/shard_state/cell_storage.rs new file mode 100644 index 000000000..6d56d4bec --- /dev/null +++ b/storage/src/store/shard_state/cell_storage.rs @@ -0,0 +1,806 @@ +use std::cell::UnsafeCell; +use std::collections::hash_map; +use std::mem::{ManuallyDrop, MaybeUninit}; +use std::sync::atomic::{AtomicI64, AtomicU8, Ordering}; +use std::sync::{Arc, Weak}; + +use anyhow::{Context, Result}; +use bumpalo::Bump; +use everscale_types::cell::*; +use quick_cache::sync::{Cache, DefaultLifecycle}; +use triomphe::ThinArc; + +use crate::db::*; +use tycho_util::{FastDashMap, FastHashMap, FastHasherState}; + +pub struct CellStorage { + db: Arc, + cells_cache: Arc>>, + raw_cells_cache: RawCellsCache, +} + +impl CellStorage { + pub fn new(db: Arc, cache_size_bytes: u64) -> Arc { + let cells_cache = Default::default(); + let raw_cells_cache = RawCellsCache::new(cache_size_bytes); + + Arc::new(Self { + db, + cells_cache, + raw_cells_cache, + }) + } + + pub fn store_cell( + &self, + batch: &mut rocksdb::WriteBatch, + root: Cell, + ) -> Result { + struct CellWithRefs<'a> { + rc: u32, + data: Option<&'a 
[u8]>, + } + + struct Context<'a> { + db: &'a Db, + raw_cache: &'a RawCellsCache, + alloc: &'a Bump, + transaction: FastHashMap>, + buffer: Vec, + } + + impl Context<'_> { + fn insert_cell( + &mut self, + key: &HashBytes, + cell: &DynCell, + depth: usize, + ) -> Result { + Ok(match self.transaction.entry(*key) { + hash_map::Entry::Occupied(mut value) => { + value.get_mut().rc += 1; + false + } + hash_map::Entry::Vacant(entry) => { + // A constant which tells since which depth we should start to use cache. + // This method is used mostly for inserting new states, so we can assume + // that first N levels will mostly be new. + // + // This value was chosen empirically. + const NEW_CELLS_DEPTH_THRESHOLD: usize = 4; + + let (old_rc, has_value) = 'value: { + if depth >= NEW_CELLS_DEPTH_THRESHOLD { + // NOTE: `get` here is used to affect a "hotness" of the value, because + // there is a big chance that we will need it soon during state processing + if let Some(entry) = self.raw_cache.0.get(key) { + let rc = entry.header.header.load(Ordering::Acquire); + break 'value (rc, rc > 0); + } + } + + match self + .db + .cells + .get(key.as_slice()) + .map_err(CellStorageError::Internal)? 
+ { + Some(value) => { + let (rc, value) = + refcount::decode_value_with_rc(value.as_ref()); + (rc, value.is_some()) + } + None => (0, false), + } + }; + + // TODO: lower to `debug_assert` when sure + assert!(has_value && old_rc > 0 || !has_value && old_rc == 0); + + let data = if !has_value { + self.buffer.clear(); + if StorageCell::serialize_to(cell, &mut self.buffer).is_err() { + return Err(CellStorageError::InvalidCell); + } + Some(self.alloc.alloc_slice_copy(self.buffer.as_slice()) as &[u8]) + } else { + None + }; + entry.insert(CellWithRefs { rc: 1, data }); + !has_value + } + }) + } + + fn finalize(mut self, batch: &mut rocksdb::WriteBatch) -> usize { + let total = self.transaction.len(); + let cells_cf = &self.db.cells.cf(); + for (key, CellWithRefs { rc, data }) in self.transaction { + self.buffer.clear(); + refcount::add_positive_refount(rc, data, &mut self.buffer); + if let Some(data) = data { + self.raw_cache.insert(&key, rc, data); + } else { + self.raw_cache.add_refs(&key, rc); + } + batch.merge_cf(cells_cf, key.as_slice(), &self.buffer); + } + total + } + } + + // Prepare context and handles + let alloc = Bump::new(); + + let mut ctx = Context { + db: &self.db, + raw_cache: &self.raw_cells_cache, + alloc: &alloc, + transaction: FastHashMap::with_capacity_and_hasher(128, Default::default()), + buffer: Vec::with_capacity(512), + }; + + // Check root cell + { + let key = root.repr_hash(); + + if !ctx.insert_cell(key, root.as_ref(), 0)? { + return Ok(0); + } + } + + let mut stack = Vec::with_capacity(16); + stack.push(root.references()); + + // Check other cells + 'outer: loop { + let depth = stack.len(); + let Some(iter) = stack.last_mut() else { + break; + }; + + for child in &mut *iter { + let key = child.repr_hash(); + + if ctx.insert_cell(key, child, depth)? 
{ + stack.push(child.references()); + continue 'outer; + } + } + + stack.pop(); + } + + // Clear big chunks of data before finalization + drop(stack); + + // Write transaction to the `WriteBatch` + Ok(ctx.finalize(batch)) + } + + pub fn load_cell( + self: &Arc, + hash: HashBytes, + ) -> Result, CellStorageError> { + if let Some(cell) = self.cells_cache.get(&hash) { + if let Some(cell) = cell.upgrade() { + return Ok(cell); + } + } + + let cell = match self.raw_cells_cache.get_raw(self.db.as_ref(), &hash) { + Ok(value) => 'cell: { + if let Some(value) = value { + let rc = &value.header.header; + if rc.load(Ordering::Acquire) > 0 { + match StorageCell::deserialize(self.clone(), &value.slice) { + Some(cell) => break 'cell Arc::new(cell), + None => return Err(CellStorageError::InvalidCell), + } + } + } + return Err(CellStorageError::CellNotFound); + } + Err(e) => return Err(CellStorageError::Internal(e)), + }; + self.cells_cache.insert(hash, Arc::downgrade(&cell)); + + Ok(cell) + } + + pub fn remove_cell( + &self, + batch: &mut weedb::rocksdb::WriteBatch, + alloc: &Bump, + hash: &HashBytes, + ) -> Result { + #[derive(Clone, Copy)] + struct CellState<'a> { + rc: i64, + removes: u32, + refs: &'a [HashBytes], + } + + impl<'a> CellState<'a> { + fn remove(&mut self) -> Result, CellStorageError> { + self.removes += 1; + if self.removes as i64 <= self.rc { + Ok(self.next_refs()) + } else { + Err(CellStorageError::CounterMismatch) + } + } + + fn next_refs(&self) -> Option<&'a [HashBytes]> { + if self.rc > self.removes as i64 { + None + } else { + Some(self.refs) + } + } + } + + let cells = &self.db.cells; + let cells_cf = &cells.cf(); + + let mut transaction: FastHashMap<&HashBytes, CellState<'_>> = + FastHashMap::with_capacity_and_hasher(128, Default::default()); + let mut buffer = Vec::with_capacity(4); + + let mut stack = Vec::with_capacity(16); + stack.push(hash); + + // While some cells left + while let Some(cell_id) = stack.pop() { + let refs = match 
transaction.entry(cell_id) { + hash_map::Entry::Occupied(mut v) => v.get_mut().remove()?, + hash_map::Entry::Vacant(v) => { + let rc = match self.db.cells.get(cell_id.as_array()) { + Ok(value) => 'rc: { + if let Some(value) = value { + buffer.clear(); + if let (rc, Some(value)) = refcount::decode_value_with_rc(&value) { + if StorageCell::deserialize_references(value, &mut buffer) { + break 'rc rc; + } else { + return Err(CellStorageError::InvalidCell); + } + } + } + return Err(CellStorageError::CellNotFound); + } + Err(e) => return Err(CellStorageError::Internal(e)), + }; + + v.insert(CellState { + rc, + removes: 1, + refs: alloc.alloc_slice_copy(buffer.as_slice()), + }) + .next_refs() + } + }; + + if let Some(refs) = refs { + // Add all children + for cell_id in refs { + // Unknown cell, push to the stack to process it + stack.push(cell_id); + } + } + } + + // Clear big chunks of data before finalization + drop(stack); + + // Write transaction to the `WriteBatch` + let total = transaction.len(); + for (key, CellState { removes, .. 
}) in transaction { + batch.merge_cf( + cells_cf, + key.as_slice(), + refcount::encode_negative_refcount(removes), + ); + } + Ok(total) + } + + pub fn drop_cell(&self, hash: &HashBytes) { + self.cells_cache.remove(hash); + } +} + +#[derive(thiserror::Error, Debug)] +pub enum CellStorageError { + #[error("Cell not found in cell db")] + CellNotFound, + #[error("Invalid cell")] + InvalidCell, + #[error("Cell counter mismatch")] + CounterMismatch, + #[error("Internal rocksdb error")] + Internal(#[source] weedb::rocksdb::Error), +} + +pub struct StorageCell { + cell_storage: Arc, + descriptor: CellDescriptor, + bit_len: u16, + data: Vec, + hashes: Vec<(HashBytes, u16)>, + + reference_states: [AtomicU8; 4], + reference_data: [UnsafeCell; 4], +} + +impl StorageCell { + const REF_EMPTY: u8 = 0x0; + const REF_RUNNING: u8 = 0x1; + const REF_STORAGE: u8 = 0x2; + const REF_REPLACED: u8 = 0x3; + + pub fn deserialize(cell_storage: Arc, buffer: &[u8]) -> Option { + if buffer.len() < 4 { + return None; + } + + let descriptor = CellDescriptor::new([buffer[0], buffer[1]]); + let bit_len = u16::from_le_bytes([buffer[2], buffer[3]]); + let byte_len = descriptor.byte_len() as usize; + let hash_count = descriptor.hash_count() as usize; + let ref_count = descriptor.reference_count() as usize; + + let total_len = 4usize + byte_len + (32 + 2) * hash_count + 32 * ref_count; + if buffer.len() < total_len { + return None; + } + + let data = buffer[4..4 + byte_len].to_vec(); + + let mut hashes = Vec::with_capacity(hash_count); + let mut offset = 4 + byte_len; + for _ in 0..hash_count { + hashes.push(( + HashBytes::from_slice(&buffer[offset..offset + 32]), + u16::from_le_bytes([buffer[offset + 32], buffer[offset + 33]]), + )); + offset += 32 + 2; + } + + let reference_states = Default::default(); + let reference_data = unsafe { + MaybeUninit::<[UnsafeCell; 4]>::uninit().assume_init() + }; + + for slot in reference_data.iter().take(ref_count) { + let slot = slot.get().cast::(); + unsafe { 
std::ptr::copy_nonoverlapping(buffer.as_ptr().add(offset), slot, 32) }; + offset += 32; + } + + Some(Self { + cell_storage, + bit_len, + descriptor, + data, + hashes, + reference_states, + reference_data, + }) + } + + pub fn deserialize_references(data: &[u8], target: &mut Vec) -> bool { + if data.len() < 4 { + return false; + } + + let descriptor = CellDescriptor::new([data[0], data[1]]); + let hash_count = descriptor.hash_count(); + let ref_count = descriptor.reference_count() as usize; + + let mut offset = 4usize + descriptor.byte_len() as usize + (32 + 2) * hash_count as usize; + if data.len() < offset + 32 * ref_count { + return false; + } + + target.reserve(ref_count); + for _ in 0..ref_count { + target.push(HashBytes::from_slice(&data[offset..offset + 32])); + offset += 32; + } + + true + } + + pub fn serialize_to(cell: &DynCell, target: &mut Vec) -> Result<()> { + let descriptor = cell.descriptor(); + let hash_count = descriptor.hash_count(); + let ref_count = descriptor.reference_count(); + + target.reserve( + 4usize + + descriptor.byte_len() as usize + + (32 + 2) * hash_count as usize + + 32 * ref_count as usize, + ); + + target.extend_from_slice(&[descriptor.d1, descriptor.d2]); + target.extend_from_slice(&cell.bit_len().to_le_bytes()); + target.extend_from_slice(cell.data()); + assert_eq!(cell.data().len(), descriptor.byte_len() as usize); + + for i in 0..descriptor.hash_count() { + target.extend_from_slice(cell.hash(i).as_array()); + target.extend_from_slice(&cell.depth(i).to_le_bytes()); + } + + for i in 0..descriptor.reference_count() { + let cell = cell.reference(i).context("Child not found")?; + target.extend_from_slice(cell.repr_hash().as_array()); + } + + Ok(()) + } + + pub fn reference_raw(&self, index: u8) -> Option<&Arc> { + if index > 3 || index >= self.descriptor.reference_count() { + return None; + } + + let state = &self.reference_states[index as usize]; + let slot = self.reference_data[index as usize].get(); + + let current_state = 
state.load(Ordering::Acquire); + if current_state == Self::REF_STORAGE { + return Some(unsafe { &(*slot).storage_cell }); + } + + let mut res = Ok(()); + Self::initialize_inner(state, &mut || match self + .cell_storage + .load_cell(unsafe { (*slot).hash }) + { + Ok(cell) => unsafe { + *slot = StorageCellReferenceData { + storage_cell: ManuallyDrop::new(cell), + }; + true + }, + Err(err) => { + res = Err(err); + false + } + }); + + // TODO: just return none? + res.unwrap(); + + Some(unsafe { &(*slot).storage_cell }) + } + + // Note: this is intentionally monomorphic + #[inline(never)] + fn initialize_inner(state: &AtomicU8, init: &mut dyn FnMut() -> bool) { + struct Guard<'a> { + state: &'a AtomicU8, + new_state: u8, + } + + impl<'a> Drop for Guard<'a> { + fn drop(&mut self) { + self.state.store(self.new_state, Ordering::Release); + unsafe { + let key = self.state as *const AtomicU8 as usize; + parking_lot_core::unpark_all(key, parking_lot_core::DEFAULT_UNPARK_TOKEN); + } + } + } + + loop { + let exchange = state.compare_exchange_weak( + Self::REF_EMPTY, + Self::REF_RUNNING, + Ordering::Acquire, + Ordering::Acquire, + ); + match exchange { + Ok(_) => { + let mut guard = Guard { + state, + new_state: Self::REF_EMPTY, + }; + if init() { + guard.new_state = Self::REF_STORAGE; + } + return; + } + Err(Self::REF_STORAGE) => return, + Err(Self::REF_RUNNING) => unsafe { + let key = state as *const AtomicU8 as usize; + parking_lot_core::park( + key, + || state.load(Ordering::Relaxed) == Self::REF_RUNNING, + || (), + |_, _| (), + parking_lot_core::DEFAULT_PARK_TOKEN, + None, + ); + }, + Err(Self::REF_EMPTY) => (), + Err(_) => debug_assert!(false), + } + } + } +} + +impl CellImpl for StorageCell { + fn descriptor(&self) -> CellDescriptor { + self.descriptor + } + + fn data(&self) -> &[u8] { + &self.data + } + + fn bit_len(&self) -> u16 { + self.bit_len + } + + fn reference(&self, index: u8) -> Option<&DynCell> { + Some(self.reference_raw(index)?.as_ref()) + } + + fn 
reference_cloned(&self, index: u8) -> Option { + Some(Cell::from(self.reference_raw(index)?.clone() as Arc<_>)) + } + + fn virtualize(&self) -> &DynCell { + VirtualCellWrapper::wrap(self) + } + + fn hash(&self, level: u8) -> &HashBytes { + let i = self.descriptor.level_mask().hash_index(level); + &self.hashes[i as usize].0 + } + + fn depth(&self, level: u8) -> u16 { + let i = self.descriptor.level_mask().hash_index(level); + self.hashes[i as usize].1 + } + + fn take_first_child(&mut self) -> Option { + let state = self.reference_states[0].swap(Self::REF_EMPTY, Ordering::AcqRel); + let data = self.reference_data[0].get_mut(); + match state { + Self::REF_STORAGE => Some(unsafe { data.take_storage_cell() }), + Self::REF_REPLACED => Some(unsafe { data.take_replaced_cell() }), + _ => None, + } + } + + fn replace_first_child(&mut self, parent: Cell) -> std::result::Result { + let state = self.reference_states[0].load(Ordering::Acquire); + if state < Self::REF_STORAGE { + return Err(parent); + } + + self.reference_states[0].store(Self::REF_REPLACED, Ordering::Release); + let data = self.reference_data[0].get_mut(); + + let cell = match state { + Self::REF_STORAGE => unsafe { data.take_storage_cell() }, + Self::REF_REPLACED => unsafe { data.take_replaced_cell() }, + _ => return Err(parent), + }; + data.replaced_cell = ManuallyDrop::new(parent); + Ok(cell) + } + + fn take_next_child(&mut self) -> Option { + while self.descriptor.reference_count() > 1 { + self.descriptor.d1 -= 1; + let idx = (self.descriptor.d1 & CellDescriptor::REF_COUNT_MASK) as usize; + + let state = self.reference_states[idx].swap(Self::REF_EMPTY, Ordering::AcqRel); + let data = self.reference_data[idx].get_mut(); + + return Some(match state { + Self::REF_STORAGE => unsafe { data.take_storage_cell() }, + Self::REF_REPLACED => unsafe { data.take_replaced_cell() }, + _ => continue, + }); + } + + None + } +} + +impl Drop for StorageCell { + fn drop(&mut self) { + 
self.cell_storage.drop_cell(DynCell::repr_hash(self)); + for i in 0..4 { + let state = self.reference_states[i].load(Ordering::Acquire); + let data = self.reference_data[i].get_mut(); + + unsafe { + match state { + Self::REF_STORAGE => ManuallyDrop::drop(&mut data.storage_cell), + Self::REF_REPLACED => ManuallyDrop::drop(&mut data.replaced_cell), + _ => {} + } + } + } + } +} + +unsafe impl Send for StorageCell {} +unsafe impl Sync for StorageCell {} + +pub union StorageCellReferenceData { + /// Incplmete state. + hash: HashBytes, + /// Complete state. + storage_cell: ManuallyDrop>, + /// Replaced state. + replaced_cell: ManuallyDrop, +} + +impl StorageCellReferenceData { + unsafe fn take_storage_cell(&mut self) -> Cell { + Cell::from(ManuallyDrop::take(&mut self.storage_cell) as Arc<_>) + } + + unsafe fn take_replaced_cell(&mut self) -> Cell { + ManuallyDrop::take(&mut self.replaced_cell) + } +} + +struct RawCellsCache(Cache); + +impl RawCellsCache { + pub(crate) fn hit_ratio(&self) -> f64 { + (if self.0.hits() > 0 { + self.0.hits() as f64 / (self.0.hits() + self.0.misses()) as f64 + } else { + 0.0 + }) * 100.0 + } +} + +type RawCellsCacheItem = ThinArc; + +#[derive(Clone, Copy)] +pub struct CellSizeEstimator; +impl quick_cache::Weighter for CellSizeEstimator { + fn weight(&self, key: &HashBytes, val: &RawCellsCacheItem) -> u32 { + const STATIC_SIZE: usize = std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::() * 2; // ArcInner refs + HeaderWithLength length + + let len = key.0.len() + val.slice.len() + STATIC_SIZE; + len as u32 + } +} + +impl RawCellsCache { + fn new(size_in_bytes: u64) -> Self { + // Percentile 0.1% from 96 to 127 => 1725119 count + // Percentile 10% from 128 to 191 => 82838849 count + // Percentile 25% from 128 to 191 => 82838849 count + // Percentile 50% from 128 to 191 => 82838849 count + // Percentile 75% from 128 to 191 => 82838849 count + // Percentile 90% from 192 to 255 => 22775080 count + // Percentile 95% from 192 to 
255 => 22775080 count + // Percentile 99% from 192 to 255 => 22775080 count + // Percentile 99.9% from 256 to 383 => 484002 count + // Percentile 99.99% from 256 to 383 => 484002 count + // Percentile 99.999% from 256 to 383 => 484002 count + + // from 64 to 95 - 15_267 + // from 96 to 127 - 1_725_119 + // from 128 to 191 - 82_838_849 + // from 192 to 255 - 22_775_080 + // from 256 to 383 - 484_002 + + // we assume that 75% of cells are in range 128..191 + // so we can use use 192 as size for value in cache + + const MAX_CELL_SIZE: u64 = 192; + const KEY_SIZE: u64 = 32; + + let estimated_cell_cache_capacity = size_in_bytes / (KEY_SIZE + MAX_CELL_SIZE); + tracing::info!( + estimated_cell_cache_capacity, + max_cell_cache_size = %bytesize::ByteSize(size_in_bytes), + ); + + let raw_cache = Cache::with( + estimated_cell_cache_capacity as usize, + size_in_bytes, + CellSizeEstimator, + FastHasherState::default(), + DefaultLifecycle::default(), + ); + + Self(raw_cache) + } + + fn get_raw( + &self, + db: &Db, + key: &HashBytes, + ) -> Result, rocksdb::Error> { + use quick_cache::GuardResult; + + match self.0.get_value_or_guard(key, None) { + GuardResult::Value(value) => Ok(Some(value)), + GuardResult::Guard(g) => Ok(if let Some(value) = db.cells.get(key.as_slice())? 
{ + let (rc, data) = refcount::decode_value_with_rc(value.as_ref()); + data.map(|value| { + let value = RawCellsCacheItem::from_header_and_slice(AtomicI64::new(rc), value); + _ = g.insert(value.clone()); + value + }) + } else { + None + }), + GuardResult::Timeout => unreachable!(), + } + } + + fn get_raw_for_delete( + &self, + db: &Db, + key: &HashBytes, + refs_buffer: &mut Vec, + ) -> Result { + refs_buffer.clear(); + + // NOTE: `peek` here is used to avoid affecting a "hotness" of the value + if let Some(value) = self.0.peek(key) { + let rc = value.header.header.load(Ordering::Acquire); + if rc <= 0 { + return Err(CellStorageError::CellNotFound); + } + + StorageCell::deserialize_references(&value.slice, refs_buffer) + .then_some(rc) + .ok_or(CellStorageError::InvalidCell) + } else { + match db.cells.get(key.as_slice()) { + Ok(value) => { + if let Some(value) = value { + if let (rc, Some(value)) = refcount::decode_value_with_rc(&value) { + return StorageCell::deserialize_references(value, refs_buffer) + .then_some(rc) + .ok_or(CellStorageError::InvalidCell); + } + } + + Err(CellStorageError::CellNotFound) + } + Err(e) => Err(CellStorageError::Internal(e)), + } + } + } + + fn insert(&self, key: &HashBytes, refs: u32, value: &[u8]) { + let value = RawCellsCacheItem::from_header_and_slice(AtomicI64::new(refs as _), value); + self.0.insert(*key, value); + } + + fn add_refs(&self, key: &HashBytes, refs: u32) { + // NOTE: `peek` here is used to avoid affecting a "hotness" of the value + if let Some(v) = self.0.peek(key) { + v.header.header.fetch_add(refs as i64, Ordering::Release); + } + } + + fn remove_refs(&self, key: &HashBytes, refs: u32) { + // NOTE: `peek` here is used to avoid affecting a "hotness" of the value + if let Some(v) = self.0.peek(key) { + let old_refs = v.header.header.fetch_sub(refs as i64, Ordering::Release); + debug_assert!(old_refs >= refs as i64); + } + } +} diff --git a/storage/src/store/shard_state/entries_buffer.rs 
b/storage/src/store/shard_state/entries_buffer.rs new file mode 100644 index 000000000..a2f9896eb --- /dev/null +++ b/storage/src/store/shard_state/entries_buffer.rs @@ -0,0 +1,182 @@ +use everscale_types::cell::{CellType, LevelMask}; + +pub struct EntriesBuffer(Box<[[u8; HashesEntry::LEN]; 5]>); + +impl EntriesBuffer { + pub fn new() -> Self { + Self(Box::new([[0; HashesEntry::LEN]; 5])) + } + + pub fn current_entry_buffer(&mut self) -> &mut [u8; HashesEntry::LEN] { + &mut self.0[0] + } + + pub fn iter_child_buffers( + &mut self, + ) -> impl Iterator::LEN]> { + self.0.iter_mut().skip(1) + } + + pub fn split_children<'a, 'b>( + &'a mut self, + references: &'b [u32], + ) -> (HashesEntryWriter<'a>, EntriesBufferChildren<'b>) + where + 'a: 'b, + { + let [first, tail @ ..] = &mut *self.0; + ( + HashesEntryWriter(first), + EntriesBufferChildren(references, tail), + ) + } + + pub fn repr_hash(&self) -> &[u8; 32] { + let [first, ..] = &*self.0; + HashesEntry(first).hash(3) + } +} + +pub struct EntriesBufferChildren<'a>(&'a [u32], &'a [[u8; HashesEntry::LEN]]); + +impl EntriesBufferChildren<'_> { + pub fn iter(&self) -> impl Iterator)> { + self.0 + .iter() + .zip(self.1) + .map(|(index, item)| (index, HashesEntry(item))) + } +} + +pub struct HashesEntryWriter<'a>(&'a mut [u8; HashesEntry::LEN]); + +impl HashesEntryWriter<'_> { + pub fn as_reader(&self) -> HashesEntry<'_> { + HashesEntry(self.0) + } + + pub fn clear(&mut self) { + for byte in &mut *self.0 { + *byte = 0; + } + } + + pub fn set_level_mask(&mut self, level_mask: LevelMask) { + self.0[0] = level_mask.into(); + } + + pub fn set_cell_type(&mut self, cell_type: CellType) { + self.0[1] = cell_type.into(); + } + + pub fn set_tree_bits_count(&mut self, count: u64) { + self.0[4..12].copy_from_slice(&count.to_le_bytes()); + } + + pub fn set_tree_cell_count(&mut self, count: u64) { + self.0[12..20].copy_from_slice(&count.to_le_bytes()); + } + + pub fn get_tree_counters(&mut self) -> &[u8] { + &self.0[4..20] + } + + pub 
fn set_hash(&mut self, i: u8, hash: &[u8]) { + self.get_hash_slice(i).copy_from_slice(hash); + } + + pub fn get_hash_slice(&mut self, i: u8) -> &mut [u8; 32] { + let offset = HashesEntry::HASHES_OFFSET + 32 * i as usize; + unsafe { &mut *self.0.as_mut_ptr().add(offset).cast() } + } + + pub fn set_depth(&mut self, i: u8, depth: u16) { + self.get_depth_slice(i) + .copy_from_slice(&depth.to_le_bytes()); + } + + pub fn get_depth_slice(&mut self, i: u8) -> &mut [u8; 2] { + let offset = HashesEntry::DEPTHS_OFFSET + 2 * i as usize; + unsafe { &mut *self.0.as_mut_ptr().add(offset).cast() } + } +} + +pub struct HashesEntry<'a>(&'a [u8; HashesEntry::LEN]); + +impl<'a> HashesEntry<'a> { + // 4 bytes - info (1 byte level mask, 1 byte cell type, 2 bytes padding) + // 8 bytes - tree bits count + // 8 bytes - cell count + // 32 * 4 bytes - hashes + // 2 * 4 bytes - depths + pub const LEN: usize = 4 + 8 + 8 + 32 * 4 + 2 * 4; + pub const HASHES_OFFSET: usize = 4 + 8 + 8; + pub const DEPTHS_OFFSET: usize = 4 + 8 + 8 + 32 * 4; + + pub fn level_mask(&self) -> LevelMask { + // SAFETY: loaded from `set_level_mask` + unsafe { LevelMask::new_unchecked(self.0[0]) } + } + + pub fn cell_type(&self) -> CellType { + match self.0[1] { + 1 => CellType::PrunedBranch, + 2 => CellType::LibraryReference, + 3 => CellType::MerkleProof, + 4 => CellType::MerkleUpdate, + _ => CellType::Ordinary, + } + } + + pub fn tree_bits_count(&self) -> u64 { + u64::from_le_bytes(self.0[4..12].try_into().unwrap()) + } + + pub fn tree_cell_count(&self) -> u64 { + u64::from_le_bytes(self.0[12..20].try_into().unwrap()) + } + + pub fn hash(&self, n: u8) -> &'a [u8; 32] { + let offset = Self::HASHES_OFFSET + 32 * self.level_mask().hash_index(n) as usize; + unsafe { &*self.0.as_ptr().add(offset).cast() } + } + + pub fn depth(&self, n: u8) -> u16 { + let offset = Self::DEPTHS_OFFSET + 2 * self.level_mask().hash_index(n) as usize; + u16::from_le_bytes([self.0[offset], self.0[offset + 1]]) + } + + pub fn 
pruned_branch_hash<'b>(&self, n: u8, data: &'b [u8]) -> Option<&'b [u8; 32]> + where + 'a: 'b, + { + let level_mask = self.level_mask(); + let index = level_mask.hash_index(n) as usize; + let level = level_mask.level() as usize; + + Some(if index == level { + let offset = Self::HASHES_OFFSET; + unsafe { &*self.0.as_ptr().add(offset).cast() } + } else { + let offset = 1 + 1 + index * 32; + if data.len() < offset + 32 { + return None; + } + unsafe { &*data.as_ptr().add(offset).cast() } + }) + } + + pub fn pruned_branch_depth(&self, n: u8, data: &[u8]) -> u16 { + let level_mask = self.level_mask(); + let index = level_mask.hash_index(n) as usize; + let level = level_mask.level() as usize; + + if index == level { + let offset = Self::DEPTHS_OFFSET; + u16::from_le_bytes([self.0[offset], self.0[offset + 1]]) + } else { + let offset = 1 + 1 + level * 32 + index * 2; + u16::from_be_bytes([data[offset], data[offset + 1]]) + } + } +} diff --git a/storage/src/store/shard_state/mod.rs b/storage/src/store/shard_state/mod.rs new file mode 100644 index 000000000..f4a3bdeda --- /dev/null +++ b/storage/src/store/shard_state/mod.rs @@ -0,0 +1,371 @@ +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Instant; + +use anyhow::{Context, Result}; +use everscale_types::models::*; +use everscale_types::prelude::{Cell, HashBytes}; +use tycho_block_util::block::*; +use tycho_block_util::state::*; + +use self::cell_storage::*; +use self::replace_transaction::ShardStateReplaceTransaction; + +use crate::db::*; +use crate::util::*; +use crate::{models::BlockHandle, BlockHandleStorage, BlockStorage}; + +mod cell_storage; +mod entries_buffer; +mod replace_transaction; +mod shard_state_reader; + +const DOWNLOADS_DIR: &str = "downloads"; + +pub struct ShardStateStorage { + db: Arc, + downloads_dir: FileDb, + + block_handle_storage: Arc, + block_storage: Arc, + cell_storage: Arc, + + gc_lock: tokio::sync::Mutex<()>, + min_ref_mc_state: Arc, + max_new_mc_cell_count: 
AtomicUsize, + max_new_sc_cell_count: AtomicUsize, +} + +impl ShardStateStorage { + pub fn new( + db: Arc, + files_dir: &FileDb, + block_handle_storage: Arc, + block_storage: Arc, + cache_size_bytes: u64, + ) -> Result { + let downloads_dir = files_dir.subdir(DOWNLOADS_DIR); + downloads_dir.ensure_exists()?; + + let cell_storage = CellStorage::new(db.clone(), cache_size_bytes); + + let res = Self { + db, + block_handle_storage, + block_storage, + cell_storage, + downloads_dir, + gc_lock: Default::default(), + min_ref_mc_state: Arc::new(Default::default()), + max_new_mc_cell_count: AtomicUsize::new(0), + max_new_sc_cell_count: AtomicUsize::new(0), + }; + + // Done + Ok(res) + } + + pub fn metrics(&self) -> ShardStateStorageMetrics { + #[cfg(feature = "count-cells")] + let storage_cell = countme::get::(); + + ShardStateStorageMetrics { + #[cfg(feature = "count-cells")] + storage_cell_live_count: storage_cell.live, + #[cfg(feature = "count-cells")] + storage_cell_max_live_count: storage_cell.max_live, + max_new_mc_cell_count: self.max_new_mc_cell_count.swap(0, Ordering::AcqRel), + max_new_sc_cell_count: self.max_new_sc_cell_count.swap(0, Ordering::AcqRel), + } + } + + // TODO: implement metrics + /*pub fn cache_metrics(&self) -> CacheStats { + self.cell_storage.cache_stats() + }*/ + + pub fn min_ref_mc_state(&self) -> &Arc { + &self.min_ref_mc_state + } + + pub async fn store_state( + &self, + handle: &Arc, + state: &ShardStateStuff, + ) -> Result { + if handle.id() != state.block_id() { + return Err(ShardStateStorageError::BlockHandleIdMismatch.into()); + } + + if handle.meta().has_state() { + return Ok(false); + } + + let block_id = handle.id(); + let cell_id = state.root_cell().repr_hash(); + + let mut batch = weedb::rocksdb::WriteBatch::default(); + + let _gc_lock = self.gc_lock.lock().await; + + let len = self + .cell_storage + .store_cell(&mut batch, state.root_cell().clone())?; + + if block_id.shard.is_masterchain() { + self.max_new_mc_cell_count.fetch_max(len, 
Ordering::Release); + } else { + self.max_new_sc_cell_count.fetch_max(len, Ordering::Release); + } + + let mut value = [0; 32 * 3]; + value[..32].copy_from_slice(cell_id.as_slice()); + value[32..64].copy_from_slice(block_id.root_hash.as_slice()); + value[64..96].copy_from_slice(block_id.file_hash.as_slice()); + + batch.put_cf( + &self.db.shard_states.cf(), + BlockIdShort { + shard: block_id.shard, + seqno: block_id.seqno, + } + .to_vec(), + value, + ); + + self.db.raw().write(batch)?; + + Ok(if handle.meta().set_has_state() { + self.block_handle_storage.store_handle(handle)?; + true + } else { + false + }) + } + + pub async fn load_state(&self, block_id: &BlockId) -> Result> { + let cell_id = self.load_state_root(block_id.as_short_id())?; + let cell = self.cell_storage.load_cell(cell_id)?; + + ShardStateStuff::new( + *block_id, + Cell::from(cell as Arc<_>), + &self.min_ref_mc_state, + ) + .map(Arc::new) + } + + pub fn begin_replace(&'_ self, block_id: &BlockId) -> Result> { + ShardStateReplaceTransaction::new( + &self.db, + &self.downloads_dir, + &self.cell_storage, + &self.min_ref_mc_state, + block_id, + ) + } + + pub async fn remove_outdated_states(&self, mc_seqno: u32) -> Result { + let _compaction_guard = self.db.delay_compaction().await; + + // Compute recent block ids for the specified masterchain seqno + let top_blocks = self + .compute_recent_blocks(mc_seqno) + .await? 
+ .context("Recent blocks edge not found")?; + + tracing::info!( + block_id = %top_blocks.mc_block, + "starting shard states GC", + ); + let instant = Instant::now(); + + let raw = self.db.raw(); + + // Manually get required column factory and r/w options + let snapshot = raw.snapshot(); + let shard_states_cf = self.db.shard_states.get_unbounded_cf(); + let mut states_read_options = self.db.shard_states.new_read_config(); + states_read_options.set_snapshot(&snapshot); + + let cells_write_options = self.db.cells.write_config(); + + let mut alloc = bumpalo::Bump::new(); + + // Create iterator + let mut iter = raw.raw_iterator_cf_opt(&shard_states_cf.bound(), states_read_options); + iter.seek_to_first(); + + // Iterate all states and remove outdated + let mut removed_states = 0usize; + let mut removed_cells = 0usize; + loop { + let (key, value) = match iter.item() { + Some(item) => item, + None => match iter.status() { + Ok(()) => break, + Err(e) => return Err(e.into()), + }, + }; + + let block_id = BlockIdShort::deserialize(&mut std::convert::identity(key))?; + let root_hash = HashBytes::wrap(value.try_into().expect("invalid value")); + + // Skip blocks from zero state and top blocks + if block_id.seqno == 0 + || top_blocks.contains_shard_seqno(&block_id.shard, block_id.seqno) + { + iter.next(); + continue; + } + + alloc.reset(); + let mut batch = weedb::rocksdb::WriteBatch::default(); + { + let _guard = self.gc_lock.lock().await; + let total = self + .cell_storage + .remove_cell(&mut batch, &alloc, root_hash)?; + batch.delete_cf(&shard_states_cf.bound(), key); + raw.write_opt(batch, cells_write_options)?; + + removed_cells += total; + tracing::debug!( + removed_cells = total, + %block_id, + ); + } + + removed_states += 1; + iter.next(); + } + + // Done + tracing::info!( + removed_states, + removed_cells, + block_id = %top_blocks.mc_block, + elapsed_sec = instant.elapsed().as_secs_f64(), + "finished shard states GC", + ); + Ok(top_blocks) + } + + /// Searches for an 
edge with the least referenced masterchain block + /// + /// Returns `None` if all states are recent enough + pub async fn compute_recent_blocks(&self, mut mc_seqno: u32) -> Result> { + // 0. Adjust masterchain seqno with minimal referenced masterchain state + if let Some(min_ref_mc_seqno) = self.min_ref_mc_state.seqno() { + if min_ref_mc_seqno < mc_seqno { + mc_seqno = min_ref_mc_seqno; + } + } + + // 1. Find target block + + // Find block id using states table + let mc_block_id = match self + .find_mc_block_id(mc_seqno) + .context("Failed to find block id by seqno")? + { + Some(block_id) => block_id, + None => return Ok(None), + }; + + // Find block handle + let handle = match self.block_handle_storage.load_handle(&mc_block_id)? { + Some(handle) if handle.meta().has_data() => handle, + // Skip blocks without handle or data + _ => return Ok(None), + }; + + // 2. Find minimal referenced masterchain block from the target block + + let block_data = self.block_storage.load_block_data(&handle).await?; + let block_info = block_data + .block() + .load_info() + .context("Failed to read target block info")?; + + // Find full min masterchain reference id + let min_ref_mc_seqno = block_info.min_ref_mc_seqno; + let min_ref_block_id = match self.find_mc_block_id(min_ref_mc_seqno)? { + Some(block_id) => block_id, + None => return Ok(None), + }; + + // Find block handle + let min_ref_block_handle = match self + .block_handle_storage + .load_handle(&min_ref_block_id) + .context("Failed to find min ref mc block handle")? 
+ { + Some(handle) if handle.meta().has_data() => handle, + // Skip blocks without handle or data + _ => return Ok(None), + }; + + // Compute `TopBlocks` from block data + self.block_storage + .load_block_data(&min_ref_block_handle) + .await + .and_then(|block_data| TopBlocks::from_mc_block(&block_data)) + .map(Some) + } + + fn load_state_root(&self, block_id_short: BlockIdShort) -> Result { + let shard_states = &self.db.shard_states; + let shard_state = shard_states.get(block_id_short.to_vec())?; + match shard_state { + Some(root) => Ok(HashBytes::from_slice(&root[..32])), + None => Err(ShardStateStorageError::NotFound.into()), + } + } + + fn find_mc_block_id(&self, mc_seqno: u32) -> Result> { + let shard_states = &self.db.shard_states; + Ok(shard_states + .get( + BlockIdShort { + shard: ShardIdent::MASTERCHAIN, + seqno: mc_seqno, + } + .to_vec(), + )? + .and_then(|value| { + let value = value.as_ref(); + if value.len() < 96 { + return None; + } + + let root_hash: [u8; 32] = value[32..64].try_into().unwrap(); + let file_hash: [u8; 32] = value[64..96].try_into().unwrap(); + + Some(BlockId { + shard: ShardIdent::MASTERCHAIN, + seqno: mc_seqno, + root_hash: HashBytes(root_hash), + file_hash: HashBytes(file_hash), + }) + })) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct ShardStateStorageMetrics { + #[cfg(feature = "count-cells")] + pub storage_cell_live_count: usize, + #[cfg(feature = "count-cells")] + pub storage_cell_max_live_count: usize, + pub max_new_mc_cell_count: usize, + pub max_new_sc_cell_count: usize, +} + +#[derive(thiserror::Error, Debug)] +enum ShardStateStorageError { + #[error("Not found")] + NotFound, + #[error("Block handle id mismatch")] + BlockHandleIdMismatch, +} diff --git a/storage/src/store/shard_state/replace_transaction.rs b/storage/src/store/shard_state/replace_transaction.rs new file mode 100644 index 000000000..b33421ddc --- /dev/null +++ b/storage/src/store/shard_state/replace_transaction.rs @@ -0,0 +1,560 @@ +use std::fs::File; 
+use std::io::Write; +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{Context, Result}; +use everscale_types::cell::*; +use everscale_types::models::BlockId; + +use super::cell_storage::*; +use super::entries_buffer::*; +use super::shard_state_reader::*; +use crate::db::*; +use crate::util::*; + +use tycho_block_util::state::*; +use tycho_util::progress_bar::*; +use tycho_util::FastHashMap; + +pub struct ShardStateReplaceTransaction<'a> { + db: &'a Db, + cell_storage: &'a Arc, + min_ref_mc_state: &'a Arc, + reader: ShardStatePacketReader, + header: Option, + cells_read: u64, + file_ctx: FilesContext, +} + +impl<'a> ShardStateReplaceTransaction<'a> { + pub fn new( + db: &'a Db, + downloads_dir: &FileDb, + cell_storage: &'a Arc, + min_ref_mc_state: &'a Arc, + block_id: &BlockId, + ) -> Result { + let file_ctx = FilesContext::new(downloads_dir, block_id)?; + + Ok(Self { + db, + file_ctx, + cell_storage, + min_ref_mc_state, + reader: ShardStatePacketReader::new(), + header: None, + cells_read: 0, + }) + } + + pub fn header(&self) -> &Option { + &self.header + } + + pub fn process_packet( + &mut self, + packet: Vec, + progress_bar: &mut ProgressBar, + ) -> Result { + let cells_file = self.file_ctx.cells_file()?; + + self.reader.set_next_packet(packet); + + let header = loop { + if let Some(header) = &self.header { + break header; + } + + let header = match self.reader.read_header()? { + Some(header) => header, + None => { + return Ok(false); + } + }; + + tracing::debug!(?header); + progress_bar.set_total(header.cell_count); + + self.header = Some(header); + }; + + let mut chunk_size = 0u32; + let mut buffer = [0; 256]; // At most 2 + 128 + 4 * 4 + + while self.cells_read < header.cell_count { + let cell_size = match self.reader.read_cell(header.ref_size, &mut buffer)? 
{ + Some(cell_size) => cell_size, + None => break, + }; + + buffer[cell_size] = cell_size as u8; + cells_file.write_all(&buffer[..cell_size + 1])?; + + chunk_size += cell_size as u32 + 1; + self.cells_read += 1; + } + + progress_bar.set_progress(self.cells_read); + + if chunk_size > 0 { + tracing::debug!(chunk_size, "creating chunk"); + let bytes = cells_file.write(&chunk_size.to_le_bytes())?; + tracing::trace!(bytes, "writing cells to file"); + } + + if self.cells_read < header.cell_count { + return Ok(false); + } + + if header.has_crc && self.reader.read_crc()?.is_none() { + return Ok(false); + } + + progress_bar.complete(); + Ok(true) + } + + pub fn finalize( + mut self, + block_id: BlockId, + progress_bar: &mut ProgressBar, + ) -> Result> { + // 2^7 bits + 1 bytes + const MAX_DATA_SIZE: usize = 128; + const CELLS_PER_BATCH: u64 = 1_000_000; + + let header = match &self.header { + Some(header) => header, + None => { + return Err(ReplaceTransactionError::InvalidShardStatePacket) + .context("BOC header not found"); + } + }; + + let hashes_file = self + .file_ctx + .create_mapped_hashes_file(header.cell_count as usize * HashesEntry::LEN)?; + + let cells_file = self.file_ctx.create_mapped_cells_file()?; + + let raw = self.db.raw().as_ref(); + let write_options = self.db.cells.new_write_config(); + + let mut tail = [0; 4]; + let mut ctx = FinalizationContext::new(self.db); + + // Allocate on heap to prevent big future size + let mut chunk_buffer = Vec::with_capacity(1 << 20); + let mut data_buffer = vec![0u8; MAX_DATA_SIZE]; + + let total_size = cells_file.length(); + progress_bar.set_total(total_size as u64); + + let mut file_pos = total_size; + let mut cell_index = header.cell_count; + let mut batch_len = 0; + while file_pos >= 4 { + file_pos -= 4; + unsafe { cells_file.read_exact_at(file_pos, &mut tail) }; + + let mut chunk_size = u32::from_le_bytes(tail) as usize; + chunk_buffer.resize(chunk_size, 0); + + file_pos -= chunk_size; + unsafe { 
cells_file.read_exact_at(file_pos, &mut chunk_buffer) }; + + tracing::debug!(chunk_size, "processing chunk"); + + while chunk_size > 0 { + cell_index -= 1; + batch_len += 1; + let cell_size = chunk_buffer[chunk_size - 1] as usize; + chunk_size -= cell_size + 1; + + let cell = RawCell::from_stored_data( + &mut &chunk_buffer[chunk_size..chunk_size + cell_size], + header.ref_size, + header.cell_count as usize, + cell_index as usize, + &mut data_buffer, + )?; + + for (&index, buffer) in cell + .reference_indices + .iter() + .zip(ctx.entries_buffer.iter_child_buffers()) + { + // SAFETY: `buffer` is guaranteed to be in separate memory area + unsafe { hashes_file.read_exact_at(index as usize * HashesEntry::LEN, buffer) } + } + + ShardStateReplaceTransaction::finalize_cell(&mut ctx, cell_index as u32, cell)?; + + // SAFETY: `entries_buffer` is guaranteed to be in separate memory area + unsafe { + hashes_file.write_all_at( + cell_index as usize * HashesEntry::LEN, + ctx.entries_buffer.current_entry_buffer(), + ); + }; + + chunk_buffer.truncate(chunk_size); + } + + if batch_len > CELLS_PER_BATCH { + ctx.finalize_cell_usages(); + raw.write_opt(std::mem::take(&mut ctx.write_batch), &write_options)?; + batch_len = 0; + } + + progress_bar.set_progress((total_size - file_pos) as u64); + } + + if batch_len > 0 { + ctx.finalize_cell_usages(); + raw.write_opt(std::mem::take(&mut ctx.write_batch), &write_options)?; + } + + // Current entry contains root cell + let root_hash = ctx.entries_buffer.repr_hash(); + ctx.final_check(root_hash)?; + + let shard_state_key = block_id.as_short_id().to_vec(); + self.db.shard_states.insert(&shard_state_key, root_hash)?; + + progress_bar.complete(); + + // Load stored shard state + match self.db.shard_states.get(shard_state_key)? 
{ + Some(root) => { + let cell_id = HashBytes::from_slice(&root[..32]); + + let cell = self.cell_storage.load_cell(cell_id)?; + Ok(Arc::new(ShardStateStuff::new( + block_id, + Cell::from(cell as Arc<_>), + self.min_ref_mc_state, + )?)) + } + None => Err(ReplaceTransactionError::NotFound.into()), + } + } + + fn finalize_cell( + ctx: &mut FinalizationContext<'_>, + cell_index: u32, + cell: RawCell<'_>, + ) -> Result<()> { + use sha2::{Digest, Sha256}; + + let (mut current_entry, children) = + ctx.entries_buffer.split_children(&cell.reference_indices); + + current_entry.clear(); + + // Prepare mask and counters + let mut children_mask = LevelMask::new(0); + let mut tree_bits_count = cell.bit_len as u64; + let mut tree_cell_count = 1; + + for (_, child) in children.iter() { + children_mask |= child.level_mask(); + tree_bits_count += child.tree_bits_count(); + tree_cell_count += child.tree_cell_count(); + } + + let mut is_merkle_cell = false; + let mut is_pruned_cell = false; + let level_mask = match cell.descriptor.cell_type() { + CellType::Ordinary => children_mask, + CellType::PrunedBranch => { + is_pruned_cell = true; + cell.descriptor.level_mask() + } + CellType::LibraryReference => LevelMask::new(0), + CellType::MerkleProof | CellType::MerkleUpdate => { + is_merkle_cell = true; + children_mask.virtualize(1) + } + }; + + if cell.descriptor.level_mask() != level_mask.to_byte() { + return Err(ReplaceTransactionError::InvalidCell).context("Level mask mismatch"); + } + + // Save mask and counters + current_entry.set_level_mask(level_mask); + current_entry.set_cell_type(cell.descriptor.cell_type()); + current_entry.set_tree_bits_count(tree_bits_count); + current_entry.set_tree_cell_count(tree_cell_count); + + // Calculate hashes + let hash_count = if is_pruned_cell { + 1 + } else { + level_mask.level() + 1 + }; + + let mut max_depths = [0u16; 4]; + let mut temp_descriptor = cell.descriptor; + for i in 0..hash_count { + let mut hasher = Sha256::new(); + + let level_mask 
= if is_pruned_cell { + level_mask + } else { + LevelMask::from_level(i) + }; + + temp_descriptor.d1 &= !(CellDescriptor::LEVEL_MASK | CellDescriptor::STORE_HASHES_MASK); + temp_descriptor.d1 |= u8::from(level_mask) << 5; + hasher.update([temp_descriptor.d1, temp_descriptor.d2]); + + if i == 0 { + hasher.update(cell.data); + } else { + hasher.update(current_entry.get_hash_slice(i - 1)); + } + + for (index, child) in children.iter() { + let child_depth = if child.cell_type().is_pruned_branch() { + let child_data = ctx + .pruned_branches + .get(index) + .ok_or(ReplaceTransactionError::InvalidCell) + .context("Pruned branch data not found")?; + child.pruned_branch_depth(i, child_data) + } else { + child.depth(if is_merkle_cell { i + 1 } else { i }) + }; + hasher.update(child_depth.to_be_bytes()); + + let depth = &mut max_depths[i as usize]; + *depth = std::cmp::max(*depth, child_depth + 1); + + current_entry.set_depth(i, *depth); + } + + for (index, child) in children.iter() { + let child_hash = if child.cell_type().is_pruned_branch() { + let child_data = ctx + .pruned_branches + .get(index) + .ok_or(ReplaceTransactionError::InvalidCell) + .context("Pruned branch data not found")?; + child + .pruned_branch_hash(i, child_data) + .context("Invalid pruned branch")? 
+ } else { + child.hash(if is_merkle_cell { i + 1 } else { i }) + }; + hasher.update(child_hash); + } + + current_entry.set_hash(i, hasher.finalize().as_slice()); + } + + // Update pruned branches + if is_pruned_cell { + ctx.pruned_branches.insert(cell_index, cell.data.to_vec()); + } + + // Write cell data + let output_buffer = &mut ctx.output_buffer; + output_buffer.clear(); + + output_buffer.extend_from_slice(&[ + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + cell.descriptor.d1, + cell.descriptor.d2, + ]); + output_buffer.extend_from_slice(&cell.bit_len.to_le_bytes()); + output_buffer.extend_from_slice(cell.data); + + let hash_count = cell.descriptor.hash_count(); + for i in 0..hash_count { + output_buffer.extend_from_slice(current_entry.get_hash_slice(i)); + output_buffer.extend_from_slice(current_entry.get_depth_slice(i)); + } + + // Write cell references + for (index, child) in children.iter() { + let child_hash = if child.cell_type().is_pruned_branch() { + let child_data = ctx + .pruned_branches + .get(index) + .ok_or(ReplaceTransactionError::InvalidCell) + .context("Pruned branch data not found")?; + child + .pruned_branch_hash(MAX_LEVEL, child_data) + .context("Invalid pruned branch")? + } else { + child.hash(MAX_LEVEL) + }; + + *ctx.cell_usages.entry(*child_hash).or_default() += 1; + output_buffer.extend_from_slice(child_hash); + } + + // // Write counters + // output_buffer.extend_from_slice(current_entry.get_tree_counters()); + + // Save serialized data + let repr_hash = if is_pruned_cell { + current_entry + .as_reader() + .pruned_branch_hash(3, cell.data) + .context("Invalid pruned branch")? 
+ } else { + current_entry.as_reader().hash(MAX_LEVEL) + }; + + ctx.write_batch + .merge_cf(&ctx.cells_cf, repr_hash, output_buffer.as_slice()); + ctx.cell_usages.insert(*repr_hash, -1); + + // Done + Ok(()) + } +} + +struct FinalizationContext<'a> { + pruned_branches: FastHashMap>, + cell_usages: FastHashMap<[u8; 32], i32>, + entries_buffer: EntriesBuffer, + output_buffer: Vec, + cells_cf: BoundedCfHandle<'a>, + write_batch: rocksdb::WriteBatch, +} + +impl<'a> FinalizationContext<'a> { + fn new(db: &'a Db) -> Self { + Self { + pruned_branches: Default::default(), + cell_usages: FastHashMap::with_capacity_and_hasher(128, Default::default()), + entries_buffer: EntriesBuffer::new(), + output_buffer: Vec::with_capacity(1 << 10), + cells_cf: db.cells.cf(), + write_batch: rocksdb::WriteBatch::default(), + } + } + + fn finalize_cell_usages(&mut self) { + self.cell_usages.retain(|key, &mut rc| { + if rc > 0 { + self.write_batch.merge_cf( + &self.cells_cf, + key, + refcount::encode_positive_refcount(rc as u32), + ); + } + + rc < 0 + }); + } + + fn final_check(&self, root_hash: &[u8; 32]) -> Result<()> { + anyhow::ensure!( + self.cell_usages.len() == 1 && self.cell_usages.contains_key(root_hash), + "Invalid shard state cell" + ); + Ok(()) + } +} + +struct FilesContext { + cells_path: PathBuf, + hashes_path: PathBuf, + cells_file: Option, +} + +impl FilesContext { + pub fn new(downloads_dir: &FileDb, block_id: &BlockId) -> Result { + let block_id = format!( + "({},{:016x},{})", + block_id.shard.workchain(), + block_id.shard.prefix(), + block_id.seqno + ); + + let cells_file_name = format!("state_cells_{block_id}"); + let hashes_file_name = format!("state_hashes_{block_id}"); + + let cells_file = downloads_dir + .file(&cells_file_name) + .write(true) + .create(true) + .truncate(true) + .read(true) + .open()?; + + Ok(Self { + cells_path: downloads_dir.path().join(cells_file_name), + hashes_path: downloads_dir.path().join(hashes_file_name), + cells_file: Some(cells_file), + }) 
+ } + + pub fn cells_file(&mut self) -> Result<&mut File> { + match &mut self.cells_file { + Some(file) => Ok(file), + None => Err(FilesContextError::AlreadyFinalized.into()), + } + } + + pub fn create_mapped_hashes_file(&self, length: usize) -> Result { + let mapped_file = MappedFile::new(&self.hashes_path, length)?; + Ok(mapped_file) + } + + pub fn create_mapped_cells_file(&mut self) -> Result { + let file = match self.cells_file.take() { + Some(mut file) => { + file.flush()?; + file + } + None => return Err(FilesContextError::AlreadyFinalized.into()), + }; + + let mapped_file = MappedFile::from_existing_file(file)?; + Ok(mapped_file) + } +} + +impl Drop for FilesContext { + fn drop(&mut self) { + if let Err(e) = std::fs::remove_file(&self.cells_path) { + tracing::error!(file = ?self.cells_path, "failed to remove file: {e}"); + } + + if let Err(e) = std::fs::remove_file(&self.hashes_path) { + tracing::error!(file = ?self.cells_path, "failed to remove file: {e}"); + } + } +} + +#[derive(thiserror::Error, Debug)] +enum ReplaceTransactionError { + #[error("Not found")] + NotFound, + #[error("Invalid shard state packet")] + InvalidShardStatePacket, + #[error("Invalid cell")] + InvalidCell, +} + +#[derive(thiserror::Error, Debug)] +enum FilesContextError { + #[error("Already finalized")] + AlreadyFinalized, +} + +const MAX_LEVEL: u8 = 3; diff --git a/storage/src/store/shard_state/shard_state_reader.rs b/storage/src/store/shard_state/shard_state_reader.rs new file mode 100644 index 000000000..42ba73393 --- /dev/null +++ b/storage/src/store/shard_state/shard_state_reader.rs @@ -0,0 +1,527 @@ +use std::io::Read; + +use anyhow::{Context, Result}; +use crc::{Crc, CRC_32_ISCSI}; +use everscale_types::cell::{CellDescriptor, LevelMask}; +use smallvec::SmallVec; + +macro_rules! 
try_read { + ($expr:expr) => { + match $expr { + Ok(data) => data, + Err(e) if e.kind() == std::io::ErrorKind::UnexpectedEof => return Ok(None), + Err(e) => return Err(e.into()), + } + }; +} + +pub struct ShardStatePacketReader { + hasher: crc::Digest<'static, u32>, + has_crc: bool, + offset: usize, + current_packet: Vec, + next_packet: Vec, + bytes_to_skip: usize, +} + +impl ShardStatePacketReader { + pub fn new() -> Self { + Self { + hasher: CRC.digest(), + has_crc: true, + offset: 0, + current_packet: Default::default(), + next_packet: Default::default(), + bytes_to_skip: 0, + } + } + + pub fn read_header(&mut self) -> Result> { + const BOC_INDEXED_TAG: u32 = 0x68ff65f3; + const BOC_INDEXED_CRC32_TAG: u32 = 0xacc3a728; + const BOC_GENERIC_TAG: u32 = 0xb5ee9c72; + + if self.process_skip() == ReaderAction::Incomplete { + return Ok(None); + } + + let mut src = self.begin(); + let mut total_size = 0u64; + + let magic = try_read!(src.read_be_u32()); + total_size += 4; + + let first_byte = try_read!(src.read_byte()); + total_size += 1; + + let index_included; + let mut has_crc = false; + let ref_size; + + match magic { + BOC_INDEXED_TAG => { + ref_size = first_byte as usize; + index_included = true; + } + BOC_INDEXED_CRC32_TAG => { + ref_size = first_byte as usize; + index_included = true; + has_crc = true; + } + BOC_GENERIC_TAG => { + index_included = first_byte & 0b1000_0000 != 0; + has_crc = first_byte & 0b0100_0000 != 0; + ref_size = (first_byte & 0b0000_0111) as usize; + } + _ => { + return Err(ShardStateParserError::InvalidShardStateHeader).context("Invalid flags") + } + } + + src.reader.has_crc = has_crc; + + if ref_size == 0 || ref_size > 4 { + return Err(ShardStateParserError::InvalidShardStateHeader) + .context("Ref size must be in range [1;4]"); + } + + let offset_size = try_read!(src.read_byte()) as u64; + total_size += 1; + if offset_size == 0 || offset_size > 8 { + return Err(ShardStateParserError::InvalidShardStateHeader) + .context("Offset size must be 
in range [1;8]"); + } + + let cell_count = try_read!(src.read_be_uint(ref_size)); + total_size += ref_size as u64; + + let root_count = try_read!(src.read_be_uint(ref_size)); + total_size += ref_size as u64; + + try_read!(src.read_be_uint(ref_size)); // skip absent + total_size += ref_size as u64; + + if root_count != 1 { + return Err(ShardStateParserError::InvalidShardStateHeader) + .context("Expected one root cell"); + } + if root_count > cell_count { + return Err(ShardStateParserError::InvalidShardStateHeader) + .context("Root count is greater then cell count"); + } + + total_size += try_read!(src.read_be_uint(offset_size as usize)); // total cells size + total_size += offset_size; + + let root_index = if magic == BOC_GENERIC_TAG { + let root_index = try_read!(src.read_be_uint(ref_size)); + total_size += ref_size as u64; + root_index + } else { + 0 + }; + + src.end(); + + if index_included { + let index_size = cell_count * offset_size; + total_size += index_size; + self.set_skip(index_size as usize); + } + + if has_crc { + total_size += 4; + } + + Ok(Some(BocHeader { + root_index, + index_included, + has_crc, + ref_size, + offset_size, + cell_count, + total_size, + })) + } + + pub fn read_cell(&mut self, ref_size: usize, buffer: &mut [u8]) -> Result> { + if self.process_skip() == ReaderAction::Incomplete { + return Ok(None); + } + + let mut src = self.begin(); + + let d1 = try_read!(src.read_byte()); + let l = d1 >> 5; + let h = (d1 & 0b0001_0000) != 0; + let r = (d1 & 0b0000_0111) as usize; + let absent = r == 0b111 && h; + + buffer[0] = d1; + + let size = if absent { + let data_size = 32 * ((LevelMask::new(l).level() + 1) as usize); + try_read!(src.read_exact(&mut buffer[1..1 + data_size])); + + tracing::info!("ABSENT"); + + // 1 byte of d1 + fixed data size of absent cell + 1 + data_size + } else { + if r > 4 { + tracing::error!("CELLS: {r}"); + return Err(ShardStateParserError::InvalidShardStateCell) + .context("Cell must contain at most 4 references"); + } 
+ + let d2 = try_read!(src.read_byte()); + buffer[1] = d2; + + // Skip optional precalculated hashes + let hash_count = LevelMask::new(l).level() as usize + 1; + if h && !src.skip(hash_count * (32 + 2)) { + return Ok(None); + } + + let data_size = ((d2 >> 1) + u8::from(d2 & 1 != 0)) as usize; + try_read!(src.read_exact(&mut buffer[2..2 + data_size + r * ref_size])); + + // 2 bytes for d1 and d2 + data size + total references size + 2 + data_size + r * ref_size + }; + + src.end(); + + Ok(Some(size)) + } + + pub fn read_crc(&mut self) -> Result> { + if self.process_skip() == ReaderAction::Incomplete { + return Ok(None); + } + + let current_crc = std::mem::replace(&mut self.hasher, CRC.digest()).finalize(); + + let mut src = self.begin(); + let target_crc = try_read!(src.read_le_u32()); + src.end(); + + if current_crc == target_crc { + Ok(Some(())) + } else { + Err(ShardStateParserError::CrcMismatch.into()) + } + } + + pub fn set_next_packet(&mut self, packet: Vec) { + self.next_packet = packet; + } + + fn begin(&'_ mut self) -> ShardStatePacketReaderTransaction<'_> { + let offset = self.offset; + ShardStatePacketReaderTransaction { + reader: self, + reading_next_packet: false, + offset, + } + } + + fn set_skip(&mut self, n: usize) { + self.bytes_to_skip = n; + } + + fn process_skip(&mut self) -> ReaderAction { + if self.bytes_to_skip == 0 { + return ReaderAction::Complete; + } + + let mut n = std::mem::take(&mut self.bytes_to_skip); + + let remaining = self.current_packet.len() - self.offset; + match n.cmp(&remaining) { + std::cmp::Ordering::Less => { + self.hasher + .update(&self.current_packet[self.offset..self.offset + n]); + self.offset += n; + ReaderAction::Complete + } + std::cmp::Ordering::Equal => { + self.hasher.update(&self.current_packet[self.offset..]); + self.offset = 0; + self.current_packet = std::mem::take(&mut self.next_packet); + ReaderAction::Complete + } + std::cmp::Ordering::Greater => { + n -= remaining; + 
self.hasher.update(&self.current_packet[self.offset..]); + self.offset = 0; + self.current_packet = std::mem::take(&mut self.next_packet); + + if n > self.current_packet.len() { + n -= self.current_packet.len(); + self.hasher.update(&self.current_packet); + self.current_packet = Vec::new(); + self.bytes_to_skip = n; + ReaderAction::Incomplete + } else { + self.offset = n; + self.hasher.update(&self.current_packet[..self.offset]); + ReaderAction::Complete + } + } + } + } +} + +static CRC: Crc = Crc::::new(&CRC_32_ISCSI); + +#[derive(Debug)] +pub struct BocHeader { + pub root_index: u64, + pub index_included: bool, + pub has_crc: bool, + pub ref_size: usize, + pub offset_size: u64, + pub cell_count: u64, + pub total_size: u64, +} + +pub struct RawCell<'a> { + pub descriptor: CellDescriptor, + pub data: &'a [u8], + pub bit_len: u16, + pub reference_indices: SmallVec<[u32; 4]>, +} + +impl<'a> RawCell<'a> { + pub fn from_stored_data( + src: &mut R, + ref_size: usize, + cell_count: usize, + cell_index: usize, + data_buffer: &'a mut [u8], + ) -> Result + where + R: Read, + { + let mut descriptor = [0u8; 2]; + src.read_exact(&mut descriptor)?; + let descriptor = CellDescriptor::new(descriptor); + let byte_len = descriptor.byte_len() as usize; + let ref_count = descriptor.reference_count() as usize; + + anyhow::ensure!(!descriptor.is_absent(), "Absent cells are not supported"); + + let data = &mut data_buffer[0..byte_len]; + src.read_exact(&mut data[..byte_len])?; + + let mut reference_indices = SmallVec::with_capacity(ref_count); + for _ in 0..ref_count { + let index = src.read_be_uint(ref_size)? 
as usize; + if index > cell_count || index <= cell_index { + return Err(ShardStateParserError::InvalidShardStateCell) + .context("Reference index out of range"); + } else { + reference_indices.push(index as u32); + } + } + + let bit_len = if descriptor.is_aligned() { + (byte_len * 8) as u16 + } else if let Some(data) = data.last() { + byte_len as u16 * 8 - data.trailing_zeros() as u16 - 1 + } else { + 0 + }; + + Ok(RawCell { + descriptor, + data, + bit_len, + reference_indices, + }) + } +} + +#[derive(Copy, Clone, Eq, PartialEq)] +pub enum ReaderAction { + Incomplete, + Complete, +} + +pub struct ShardStatePacketReaderTransaction<'a> { + reader: &'a mut ShardStatePacketReader, + reading_next_packet: bool, + offset: usize, +} + +impl<'a> ShardStatePacketReaderTransaction<'a> { + pub fn skip(&mut self, mut n: usize) -> bool { + loop { + let current_packet = match self.reading_next_packet { + // Reading non-empty current packet + false if self.offset < self.reader.current_packet.len() => { + &self.reader.current_packet + } + + // Current packet is empty - retry and switch to next + false => { + self.reading_next_packet = true; + self.offset = 0; + continue; + } + + // Reading non-empty next packet + true if self.offset < self.reader.next_packet.len() => &self.reader.next_packet, + + // Reading next packet which is empty + true => return false, + }; + + let skipped = std::cmp::min(current_packet.len() - self.offset, n); + n -= skipped; + self.offset += skipped; + + if n == 0 { + return true; + } + } + } + + pub fn end(self) { + if self.reading_next_packet { + if self.reader.has_crc { + // Write to the hasher until the end of current packet + self.reader + .hasher + .update(&self.reader.current_packet[self.reader.offset..]); + + // Write to the hasher current bytes + self.reader + .hasher + .update(&self.reader.next_packet[..self.offset]); + } + + // Replace current packet + self.reader.current_packet = std::mem::take(&mut self.reader.next_packet); + } else if 
self.reader.has_crc { + // Write to the hasher current bytes + self.reader + .hasher + .update(&self.reader.current_packet[self.reader.offset..self.offset]); + } + + // Bump offset + self.reader.offset = self.offset; + } +} + +impl<'a> Read for ShardStatePacketReaderTransaction<'a> { + fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result { + let mut result = 0; + + loop { + let current_packet = match self.reading_next_packet { + // Reading non-empty current packet + false if self.offset < self.reader.current_packet.len() => { + &self.reader.current_packet + } + + // Current packet is empty - retry and switch to next + false => { + self.reading_next_packet = true; + self.offset = 0; + continue; + } + + // Reading non-empty next packet + true if self.offset < self.reader.next_packet.len() => &self.reader.next_packet, + + // Reading next packet which is empty + true => { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "packet buffer underflow", + )) + } + }; + + let n = std::cmp::min(current_packet.len() - self.offset, buf.len()); + for i in 0..n { + buf[i] = current_packet[self.offset + i]; + } + + result += n; + self.offset += n; + + let tmp = buf; + buf = &mut tmp[n..]; + + if buf.is_empty() { + return Ok(result); + } + } + } +} + +#[derive(thiserror::Error, Debug)] +enum ShardStateParserError { + #[error("Invalid shard state header")] + InvalidShardStateHeader, + #[error("Invalid shard state cell")] + InvalidShardStateCell, + #[error("Crc mismatch")] + CrcMismatch, +} + +pub trait ByteOrderRead { + fn read_be_uint(&mut self, bytes: usize) -> std::io::Result; + fn read_byte(&mut self) -> std::io::Result; + fn read_be_u32(&mut self) -> std::io::Result; + fn read_le_u32(&mut self) -> std::io::Result; +} + +impl ByteOrderRead for T { + fn read_be_uint(&mut self, bytes: usize) -> std::io::Result { + match bytes { + 1 => { + let mut buf = [0]; + self.read_exact(&mut buf)?; + Ok(buf[0] as u64) + } + 2 => { + let mut buf = [0; 2]; + 
self.read_exact(&mut buf)?; + Ok(u16::from_be_bytes(buf) as u64) + } + 3..=4 => { + let mut buf = [0; 4]; + self.read_exact(&mut buf[4 - bytes..])?; + Ok(u32::from_be_bytes(buf) as u64) + } + 5..=8 => { + let mut buf = [0; 8]; + self.read_exact(&mut buf[8 - bytes..])?; + Ok(u64::from_be_bytes(buf)) + } + _ => Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "too many bytes to read in u64", + )), + } + } + + fn read_byte(&mut self) -> std::io::Result { + self.read_be_uint(1).map(|value| value as u8) + } + + fn read_be_u32(&mut self) -> std::io::Result { + self.read_be_uint(4).map(|value| value as u32) + } + + fn read_le_u32(&mut self) -> std::io::Result { + let mut buf = [0; 4]; + self.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) + } +} diff --git a/storage/src/util/stored_value.rs b/storage/src/util/stored_value.rs new file mode 100644 index 000000000..c37374412 --- /dev/null +++ b/storage/src/util/stored_value.rs @@ -0,0 +1,246 @@ +use bytes::Buf; +use smallvec::SmallVec; + +use anyhow::Result; +use everscale_types::cell::HashBytes; +use everscale_types::models::{BlockId, BlockIdShort, ShardIdent}; + +/// A trait for writing or reading data from a stack-allocated buffer +pub trait StoredValue { + /// On-stack buffer size hint + const SIZE_HINT: usize; + + /// On-stack buffer type (see [`smallvec::SmallVec`]) + type OnStackSlice: smallvec::Array; + + /// Serializes the data to the buffer + fn serialize(&self, buffer: &mut T); + + /// Deserializes the data from the buffer. + /// + /// In case of successful deserialization it is guaranteed that `reader` will be + /// moved to the end of the deserialized data. + /// + /// NOTE: `reader` should not be used after this call in case of an error + fn deserialize(reader: &mut &[u8]) -> Result + where + Self: Sized; + + /// Deserializes the data from the buffer. 
+ /// + /// [`StoredValue::deserialize`] + #[inline(always)] + fn from_slice(mut data: &[u8]) -> Result + where + Self: Sized, + { + Self::deserialize(&mut data) + } + + /// Constructs on-stack buffer with the serialized object + fn to_vec(&self) -> SmallVec { + let mut result = SmallVec::with_capacity(Self::SIZE_HINT); + self.serialize(&mut result); + result + } +} + +/// A trait for simple buffer-based serialization +pub trait StoredValueBuffer { + fn write_byte(&mut self, byte: u8); + fn write_raw_slice(&mut self, data: &[u8]); +} + +impl StoredValueBuffer for Vec { + #[inline(always)] + fn write_byte(&mut self, byte: u8) { + self.push(byte); + } + + #[inline(always)] + fn write_raw_slice(&mut self, data: &[u8]) { + self.extend_from_slice(data); + } +} + +impl StoredValueBuffer for SmallVec +where + T: smallvec::Array, +{ + #[inline(always)] + fn write_byte(&mut self, byte: u8) { + self.push(byte); + } + + #[inline(always)] + fn write_raw_slice(&mut self, data: &[u8]) { + self.extend_from_slice(data); + } +} + +impl StoredValue for BlockId { + /// 4 bytes workchain, + /// 8 bytes shard, + /// 4 bytes seqno, + /// 32 bytes root hash, + /// 32 bytes file hash + const SIZE_HINT: usize = ShardIdent::SIZE_HINT + 4 + 32 + 32; + + type OnStackSlice = [u8; Self::SIZE_HINT]; + + fn serialize(&self, buffer: &mut T) { + self.shard.serialize(buffer); + buffer.write_raw_slice(&self.seqno.to_be_bytes()); + buffer.write_raw_slice(self.root_hash.as_slice()); + buffer.write_raw_slice(self.file_hash.as_slice()); + } + + fn deserialize(reader: &mut &[u8]) -> Result + where + Self: Sized, + { + debug_assert!(reader.remaining() >= Self::SIZE_HINT); + + let shard = ShardIdent::deserialize(reader)?; + let seqno = reader.get_u32(); + + let mut root_hash = HashBytes::default(); + root_hash.0.copy_from_slice(&reader[..32]); + let mut file_hash = HashBytes::default(); + file_hash.0.copy_from_slice(&reader[32..]); + + Ok(Self { + shard, + seqno, + root_hash, + file_hash, + }) + } +} + 
+impl StoredValue for ShardIdent { + /// 4 bytes workchain + /// 8 bytes shard + const SIZE_HINT: usize = 4 + 8; + + type OnStackSlice = [u8; Self::SIZE_HINT]; + + #[inline(always)] + fn serialize(&self, buffer: &mut T) { + buffer.write_raw_slice(&self.workchain().to_be_bytes()); + buffer.write_raw_slice(&self.prefix().to_be_bytes()); + } + + fn deserialize(reader: &mut &[u8]) -> Result + where + Self: Sized, + { + debug_assert!(reader.remaining() >= ShardIdent::SIZE_HINT); + + let workchain = reader.get_u32() as i32; + let prefix = reader.get_u64(); + Ok(unsafe { Self::new_unchecked(workchain, prefix) }) + } +} + +impl StoredValue for BlockIdShort { + /// 12 bytes shard ident + /// 4 bytes seqno + const SIZE_HINT: usize = ShardIdent::SIZE_HINT + 4; + + type OnStackSlice = [u8; Self::SIZE_HINT]; + + #[inline(always)] + fn serialize(&self, buffer: &mut T) { + self.shard.serialize(buffer); + buffer.write_raw_slice(&self.seqno.to_be_bytes()); + } + + fn deserialize(reader: &mut &[u8]) -> Result + where + Self: Sized, + { + debug_assert!(reader.remaining() >= BlockIdShort::SIZE_HINT); + + let shard = ShardIdent::deserialize(reader)?; + let seqno = reader.get_u32(); + Ok(Self { shard, seqno }) + } +} + +/// Writes `BlockIdExt` in little-endian format +pub fn write_block_id_le(block_id: &BlockId) -> [u8; 80] { + let mut bytes = [0u8; 80]; + bytes[..4].copy_from_slice(&block_id.shard.workchain().to_le_bytes()); + bytes[4..12].copy_from_slice(&block_id.shard.prefix().to_le_bytes()); + bytes[12..16].copy_from_slice(&block_id.seqno.to_le_bytes()); + bytes[16..48].copy_from_slice(block_id.root_hash.as_slice()); + bytes[48..80].copy_from_slice(block_id.file_hash.as_slice()); + bytes +} + +/// Reads `BlockId` in little-endian format +pub fn read_block_id_le(data: &[u8]) -> Option { + if data.len() < 80 { + return None; + } + + let mut workchain = [0; 4]; + workchain.copy_from_slice(&data[0..4]); + let workchain = i32::from_le_bytes(workchain); + + let mut shard = [0; 8]; + 
shard.copy_from_slice(&data[4..12]); + let shard = u64::from_le_bytes(shard); + + let mut seqno = [0; 4]; + seqno.copy_from_slice(&data[12..16]); + let seqno = u32::from_le_bytes(seqno); + + let mut root_hash = [0; 32]; + root_hash.copy_from_slice(&data[16..48]); + + let mut file_hash = [0; 32]; + file_hash.copy_from_slice(&data[48..80]); + + let shard = unsafe { ShardIdent::new_unchecked(workchain, shard) }; + + Some(BlockId { + shard, + seqno, + root_hash: root_hash.into(), + file_hash: file_hash.into(), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn fully_on_stack() { + assert!(!BlockId::default().to_vec().spilled()); + assert!(!BlockId::default().to_vec().spilled()); + } + + #[test] + fn correct_block_id_le_serialization() { + const SERIALIZED: [u8; 80] = [ + 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 128, 123, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + ]; + + let block_id = BlockId { + shard: ShardIdent::MASTERCHAIN, + seqno: 123, + root_hash: [1u8; 32].into(), + file_hash: [2u8; 32].into(), + }; + + let serialized = write_block_id_le(&block_id); + assert_eq!(serialized, SERIALIZED); + + assert_eq!(read_block_id_le(&serialized).unwrap(), block_id); + } +} diff --git a/storage/tests/everscale_zerostate.boc b/storage/tests/everscale_zerostate.boc new file mode 100644 index 000000000..6cea5582d Binary files /dev/null and b/storage/tests/everscale_zerostate.boc differ diff --git a/storage/tests/mod.rs b/storage/tests/mod.rs new file mode 100644 index 000000000..06ccfc2c8 --- /dev/null +++ b/storage/tests/mod.rs @@ -0,0 +1,166 @@ +use std::str::FromStr; + +use anyhow::Result; +use bytesize::ByteSize; +use everscale_types::boc::Boc; +use everscale_types::cell::{Cell, DynCell}; +use everscale_types::models::{BlockId, ShardState}; +use tycho_block_util::state::ShardStateStuff; +use 
tycho_storage::{BlockMetaData, Db, DbOptions, Storage}; + +#[derive(Clone)] +struct ShardStateCombined { + cell: Cell, + state: ShardState, +} + +impl ShardStateCombined { + fn from_file(path: impl AsRef) -> Result { + let bytes = std::fs::read(path.as_ref())?; + let cell = Boc::decode(&bytes)?; + let state = cell.parse()?; + Ok(Self { cell, state }) + } + + fn gen_utime(&self) -> Option { + match &self.state { + ShardState::Unsplit(s) => Some(s.gen_utime), + ShardState::Split(_) => None, + } + } + + fn min_ref_mc_seqno(&self) -> Option { + match &self.state { + ShardState::Unsplit(s) => Some(s.min_ref_mc_seqno), + ShardState::Split(_) => None, + } + } +} + +fn compare_cells(orig_cell: &DynCell, stored_cell: &DynCell) { + assert_eq!(orig_cell.repr_hash(), stored_cell.repr_hash()); + + let l = orig_cell.descriptor(); + let r = stored_cell.descriptor(); + + assert_eq!(l.d1, r.d1); + assert_eq!(l.d2, r.d2); + assert_eq!(orig_cell.data(), stored_cell.data()); + + for (orig_cell, stored_cell) in std::iter::zip(orig_cell.references(), stored_cell.references()) + { + compare_cells(orig_cell, stored_cell); + } +} + +#[tokio::test] +async fn persistent_storage_everscale() -> Result<()> { + tracing_subscriber::fmt::try_init().ok(); + + let tmp_dir = tempfile::tempdir()?; + let root_path = tmp_dir.path(); + + // Init rocksdb + let db_options = DbOptions { + rocksdb_lru_capacity: ByteSize::kb(1024), + cells_cache_size: ByteSize::kb(1024), + }; + let db = Db::open(root_path.join("db_storage"), db_options)?; + + // Init storage + let storage = Storage::new( + db, + root_path.join("file_storage"), + db_options.cells_cache_size.as_u64(), + )?; + assert!(storage.node_state().load_init_mc_block_id().is_err()); + + // Read zerostate + let zero_state_raw = ShardStateCombined::from_file("tests/everscale_zerostate.boc")?; + + // Parse block id + let block_id = 
BlockId::from_str("-1:8000000000000000:0:58ffca1a178daff705de54216e5433c9bd2e7d850070d334d38997847ab9e845:d270b87b2952b5ba7daa70aaf0a8c361befcf4d8d2db92f9640d5443070838e4")?; + + // Write zerostate to db + let (handle, _) = storage.block_handle_storage().create_or_load_handle( + &block_id, + BlockMetaData::zero_state(zero_state_raw.gen_utime().unwrap()), + )?; + + let zerostate = ShardStateStuff::new( + block_id, + zero_state_raw.cell.clone(), + storage.shard_state_storage().min_ref_mc_state(), + )?; + + storage + .shard_state_storage() + .store_state(&handle, &zerostate) + .await?; + + // Check seqno + let min_ref_mc_state = storage.shard_state_storage().min_ref_mc_state(); + assert_eq!(min_ref_mc_state.seqno(), zero_state_raw.min_ref_mc_seqno()); + + // Load zerostate from db + let loaded_state = storage + .shard_state_storage() + .load_state(zerostate.block_id()) + .await?; + + assert_eq!(zerostate.state(), loaded_state.state()); + assert_eq!(zerostate.block_id(), loaded_state.block_id()); + assert_eq!(zerostate.root_cell(), loaded_state.root_cell()); + + compare_cells( + zerostate.root_cell().as_ref(), + loaded_state.root_cell().as_ref(), + ); + + // Write persistent state to file + let persistent_state_keeper = storage.runtime_storage().persistent_state_keeper(); + assert!(persistent_state_keeper.current().is_none()); + + storage + .persistent_state_storage() + .prepare_persistent_states_dir(&zerostate.block_id())?; + + storage + .persistent_state_storage() + .save_state( + &zerostate.block_id(), + &zerostate.block_id(), + zero_state_raw.cell.repr_hash(), + ) + .await?; + + // Check if state exists + let exist = storage + .persistent_state_storage() + .state_exists(&zerostate.block_id(), &zerostate.block_id()); + assert_eq!(exist, true); + + // Read persistent state + let offset = 0u64; + let max_size = 1_000_000u64; + + let persistent_state_storage = storage.persistent_state_storage(); + let persistent_state_data = persistent_state_storage + .read_state_part( 
+ &zerostate.block_id(), + &zerostate.block_id(), + offset, + max_size, + ) + .await + .unwrap(); + + // Check state + let cell = Boc::decode(&persistent_state_data)?; + assert_eq!(&cell, zerostate.root_cell()); + + // Clear files for test + tmp_dir.close()?; + + Ok(()) +} diff --git a/util/Cargo.toml b/util/Cargo.toml index f2b0a7fa5..53152926c 100644 --- a/util/Cargo.toml +++ b/util/Cargo.toml @@ -12,10 +12,12 @@ dashmap = "5.4" futures-util = "0.3" hex = "0.4" humantime = "2" +libc = "0.2" rand = "0.8" serde = { version = "1.0", features = ["derive"] } thiserror = "1.0" tokio = { version = "1", default-features = false, features = ["time", "sync", "rt"] } +tracing = "0.1" [dev-dependencies] tokio = { version = "1", default-features = false, features = [ diff --git a/util/src/lib.rs b/util/src/lib.rs index 0fb010594..40cbda55e 100644 --- a/util/src/lib.rs +++ b/util/src/lib.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::collections::HashSet; +pub mod progress_bar; pub mod serde_helpers; pub mod time; @@ -29,6 +30,7 @@ pub type FastDashMap = dashmap::DashMap; pub type FastDashSet = dashmap::DashSet; pub type FastHashMap = HashMap; pub type FastHashSet = HashSet; +pub type FastHasherState = ahash::RandomState; /// # Example /// diff --git a/util/src/progress_bar.rs b/util/src/progress_bar.rs new file mode 100644 index 000000000..a4d59ed15 --- /dev/null +++ b/util/src/progress_bar.rs @@ -0,0 +1,130 @@ +pub struct ProgressBar { + name: &'static str, + percentage_step: u64, + current: u64, + total: Option, + exact_unit: Option<&'static str>, + mapper: Box String + Send + 'static>, +} + +impl ProgressBar { + pub fn builder(name: &'static str) -> ProgressBarBuilder { + ProgressBarBuilder::new(name) + } + + pub fn set_total(&mut self, total: impl Into) { + self.total = Some(total.into()); + } + + pub fn set_progress(&mut self, current: impl Into) { + let old = self.compute_current_progress(); + self.current = current.into(); + let new = 
self.compute_current_progress(); + + if matches!( + (old, new), + (Some(old), Some(new)) if old / self.percentage_step != new / self.percentage_step + ) { + self.progress_message(); + } + } + + pub fn complete(&self) { + self.message("complete"); + } + + #[inline(always)] + fn progress_message(&self) { + let total = match self.total { + Some(total) if total > 0 => total, + _ => return, + }; + + let percent = self.current * 100 / total; + let current = (self.mapper)(self.current); + let total = (self.mapper)(total); + + match self.exact_unit { + Some(exact_unit) => self.message(format_args!( + "{percent}% ({current} / {total} {exact_unit})", + )), + None => self.message(format_args!("{percent}%")), + } + } + + #[inline(always)] + fn message(&self, text: impl std::fmt::Display) { + tracing::info!("{}... {text}", self.name); + } + + fn compute_current_progress(&self) -> Option { + self.total + .filter(|&total| total > 0) + .map(|total| self.current * 100u64 / total) + .map(From::from) + } +} + +pub struct ProgressBarBuilder { + name: &'static str, + percentage_step: u64, + total: Option, + exact_unit: Option<&'static str>, + mapper: Option String + Send + 'static>>, +} + +impl ProgressBarBuilder { + pub fn new(name: &'static str) -> Self { + Self { + name, + percentage_step: PERCENTAGE_STEP, + total: None, + exact_unit: None, + mapper: None, + } + } + + pub fn with_mapper(mut self, mapper: F) -> Self + where + F: Fn(u64) -> String + Send + 'static, + { + self.mapper = Some(Box::new(mapper)); + self + } + + pub fn percentage_step(mut self, step: u64) -> Self { + self.percentage_step = std::cmp::max(step, 1); + self + } + + pub fn total(mut self, total: impl Into) -> Self { + self.total = Some(total.into()); + self + } + + pub fn exact_unit(mut self, unit: &'static str) -> Self { + self.exact_unit = Some(unit); + self + } + + pub fn build(self) -> ProgressBar { + let pg = ProgressBar { + name: self.name, + percentage_step: self.percentage_step, + current: 0, + total: 
self.total, + exact_unit: self.exact_unit, + mapper: self.mapper.unwrap_or_else(|| Box::new(|x| x.to_string())), + }; + + if self.total.is_some() { + pg.progress_message(); + } else { + pg.message("estimating total"); + } + + pg + } +} + +const PERCENTAGE_STEP: u64 = 5;