Skip to content

Commit

Permalink
Site updated: 2024-05-13 16:16:40
Browse files Browse the repository at this point in the history
  • Loading branch information
username committed May 13, 2024
1 parent 7b10017 commit 39c899e
Show file tree
Hide file tree
Showing 32 changed files with 1,008 additions and 143 deletions.
2 changes: 1 addition & 1 deletion 2023/10/27/driver_develop/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ <h1 id="I2C"><a href="#I2C" class="headerlink" title="I2C"></a>I2C</h1><p>Wait t
</section>
<section class="post-nav">

<a class="prev" rel="prev" href="/2023/11/18/compilation_principle/">编译原理</a>
<a class="prev" rel="prev" href="/2023/11/11/Indexing/">Indexing</a>


<a class="next" rel="next" href="/2023/10/19/open-source-9.19-10.19/">开源的第一个月</a>
Expand Down
283 changes: 283 additions & 0 deletions 2023/11/11/Indexing/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
<!DOCTYPE html>
<html lang="zh-CN">

<head>
<meta charset="UTF-8">
<meta name="viewport"
content="width=device-width, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">

<meta name="author" content="修年">





<title>Indexing | 修年</title>



<link rel="icon" href="/favicon.ico">




<!-- stylesheets list from _config.yml -->

<link rel="stylesheet" href="/css/style.css">




<!-- scripts list from _config.yml -->

<script src="/js/script.js"></script>

<script src="/js/tocbot.min.js"></script>










<!-- live2d demo assets (charset and title are already declared once above) -->
<script src="https://apps.bdimg.com/libs/jquery/2.1.4/jquery.min.js"></script>
<!-- Live2DCubismCore -->
<script src="https://cdn.jsdelivr.net/gh/litstronger/live2d-moc3@master/js/frame/live2dcubismcore.min.js"></script>
<!-- Include Pixi. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pixi.js/4.6.1/pixi.min.js"></script>
<!-- Include Cubism Components. -->
<script src="https://cdn.jsdelivr.net/gh/litstronger/live2d-moc3@master/js/live2dcubismframework.js"></script>
<script src="https://cdn.jsdelivr.net/gh/litstronger/live2d-moc3@master/js/live2dcubismpixi.js"></script>
<!-- User's Script -->
<script src="https://cdn.jsdelivr.net/gh/litstronger/live2d-moc3@master/js/l2d.js"></script>
<script src="https://cdn.jsdelivr.net/gh/litstronger/live2d-moc3@master/js/main.js"></script>
<style>
</style>
<meta name="generator" content="Hexo 5.4.2"></head>

<body>
<script>
// Applies the stored theme to <body> as early as possible, before the rest of
// the page is parsed, so the page does not start with the wrong theme class.
(() => {
  // Theme preference persisted under the 'theme' key; anything other than
  // 'dark' (including a missing value) is treated as the light theme.
  const currentTheme = window.localStorage && window.localStorage.getItem('theme') || '';
  const isDark = currentTheme === 'dark';
  const pagebody = document.body;

  if (isDark) {
    pagebody.classList.add('dark-theme');
  } else {
    pagebody.classList.remove('dark-theme');
  }

  // This script runs as the first child of <body>, i.e. before the mobile
  // navbar containing #mobile-toggle-theme has been parsed, so the lookup
  // returns null here. Guard it instead of throwing a TypeError on every load.
  // NOTE(review): the label is therefore left at its markup default at this
  // point; presumably /js/script.js syncs it later - confirm.
  const mobileToggle = document.getElementById("mobile-toggle-theme");
  if (mobileToggle) {
    mobileToggle.innerText = isDark ? "· Dark" : "· Light";
  }
})();
</script>

<div class="wrapper">
<header>
<nav class="navbar">
<div class="container">
<div class="navbar-header header-logo"><a href="/">Xiunian&#39;s Blog</a></div>
<div class="menu navbar-right">

<a class="menu-item" href="/about">About</a>

<a class="menu-item" href="/archives">Posts</a>

<a class="menu-item" href="/tag">Tags</a>

<input id="switch_default" type="checkbox" class="switch_default">
<label for="switch_default" class="toggleBtn"></label>
</div>
</div>
</nav>


<nav class="navbar-mobile" id="nav-mobile">
<div class="container">
<div class="navbar-header">
<div>
<a href="/">Xiunian&#39;s Blog</a><a id="mobile-toggle-theme">·&nbsp;Light</a>
</div>
<div class="menu-toggle" onclick="mobileBtn()">&#9776; Menu</div>
</div>
<div class="menu" id="mobile-menu">

<a class="menu-item" href="/about">About</a>

<a class="menu-item" href="/archives">Posts</a>

<a class="menu-item" href="/tag">Tags</a>

</div>
</div>
</nav>

</header>
<script>
// Click handler for the mobile "Menu" button: opens the mobile menu when it is
// closed and closes it when it is open, keeping the button and the menu in the
// same "active" state.
var mobileBtn = function f() {
  var menuButton = document.getElementsByClassName("menu-toggle")[0];
  var menuPanel = document.getElementById("mobile-menu");

  // The button's own state decides the direction for both elements.
  var action = menuButton.classList.contains("active") ? "remove" : "add";

  menuButton.classList[action]("active");
  menuPanel.classList[action]("active");
};
</script>
<div class="main">
<div class="container">


<div class="post-toc" style="right: -4em;">
<div class="tocbot-list">
</div>
<div class="tocbot-list-menu">
<a class="tocbot-toc-expand" onclick="expand_toc()">Expand all</a>
<a onclick="go_top()">Back to top</a>
<a onclick="go_bottom()">Go to bottom</a>
</div>
</div>

<script>
// Handle of the pending setTimeout created by extend_click(); kept global so a
// new click can cancel the previous pending refresh.
var tocbot_timer;
var DEPTH_MAX = 6; // when collapseDepth is 6, everything is expanded
// Base tocbot options shared by every init/refresh call in this script.
// Callers pass this same object to obj_merge() together with their overrides.
var tocbot_default_config = {
// Element the table of contents is rendered into.
tocSelector: '.tocbot-list',
// Element whose headings are collected.
contentSelector: '.post-content',
// Heading levels included in the table of contents.
headingSelector: 'h1, h2, h3, h4, h5',
orderedList: false,
scrollSmooth: true,
// Called by tocbot when a table-of-contents link is clicked.
onClick: extend_click,
};

// tocbot onClick hook: cancels any pending refresh and schedules a new
// tocbot.refresh() with hasInnerContainers enabled.
function extend_click() {
  // This value comes from the scrollSmoothDuration defined in tocbot's source.
  var refreshDelayMs = 420;

  var refreshToc = function() {
    var options = obj_merge(tocbot_default_config, {
      hasInnerContainers: true
    });
    tocbot.refresh(options);
  };

  clearTimeout(tocbot_timer);
  tocbot_timer = setTimeout(refreshToc, refreshDelayMs);
}

// Registers a callback that initializes tocbot with the shared default config
// plus collapseDepth 1.
// NOTE(review): document.ready is not a standard DOM API; presumably it is
// defined by /js/script.js, which is loaded in <head> - confirm.
document.ready(function() {
tocbot.init(obj_merge(tocbot_default_config, {
collapseDepth: 1
}));
});

// Click handler for the "Expand all" / "Collapse all" link. The current state
// is stored in the link's data-expanded attribute; each click flips it,
// refreshes tocbot with the matching collapseDepth and updates the link text.
function expand_toc() {
  var toggleLink = document.querySelector('.tocbot-toc-expand');
  var wasExpanded = toggleLink.getAttribute('data-expanded');
  var nextDepth;
  var nextLabel;

  if (wasExpanded) {
    // Currently expanded: collapse back to depth 1.
    toggleLink.removeAttribute('data-expanded');
    nextDepth = 1;
    nextLabel = 'Expand all';
  } else {
    // Currently collapsed: expand every level.
    toggleLink.setAttribute('data-expanded', true);
    nextDepth = DEPTH_MAX;
    nextLabel = 'Collapse all';
  }

  tocbot.refresh(obj_merge(tocbot_default_config, {
    collapseDepth: nextDepth
  }));
  toggleLink.innerText = nextLabel;
}

// Click handler for "Back to top": scrolls the window to the top-left corner.
function go_top() {
  var left = 0;
  var top = 0;
  window.scrollTo(left, top);
}

// Click handler for "Go to bottom": scrolls the window down by the full
// scroll height of <body>.
function go_bottom() {
  var pageBottom = document.body.scrollHeight;
  window.scrollTo(0, pageBottom);
}

/**
 * Copies the own enumerable properties of `source` onto `target`.
 *
 * `target` is mutated in place and returned (it is NOT cloned). Callers in
 * this script pass the shared default config as `target`, so overrides
 * persist across calls.
 *
 * @param {Object} target - object that receives the properties.
 * @param {Object} [source] - object whose own properties are copied; a
 *   null/undefined source copies nothing.
 * @returns {Object} the same `target` object.
 */
function obj_merge(target, source) {
  // Borrow hasOwnProperty from Object.prototype so that sources without a
  // prototype (Object.create(null)) or with a shadowed `hasOwnProperty` key
  // are handled instead of throwing or giving wrong answers.
  var hasOwn = Object.prototype.hasOwnProperty;
  for (var key in source) {
    if (hasOwn.call(source, key)) {
      target[key] = source[key];
    }
  }
  return target;
}
</script>



<article class="post-wrap">
<header class="post-header">
<h1 class="post-title">Indexing</h1>

<div class="post-meta">

Author: <a itemprop="author" rel="author" href="/">修年</a>



<span class="post-time">
Date: <a href="#">十一月 11, 2023&nbsp;&nbsp;14:25:29</a>
</span>


</div>

</header>

<div class="post-content">
<h2 id="sparce-indexing"><a href="#sparce-indexing" class="headerlink" title="sparse indexing"></a>sparse indexing</h2><p>基于chunks的去重都要求使用full index,而这RAM一般承受不起,但是纯用disk io就太慢了。所以它利用了数据局部性:</p>
<p>If two pieces of backup streams share any chunks, they are likely to share many chunks. <u>如果两个segment共享了某个chunk,那么它们很有可能共享很多chunks。</u></p>
<p><strong>是这样的流程:</strong></p>
<ol>
<li>分段为segment;</li>
<li>计算该segment的每个chunk的fp,然后对每个chunk查询其对应的sparse indexing table: &lt;fp, segment_id&gt;,记录<u>可能跟它共享很多chunk的segment</u>的segment_id;</li>
<li>读取这些segment_id对应的segment的chunk indexing table(存储在disk中);</li>
<li>for every chunk: 重复,copy entry;不重复,add to new container</li>
<li>最后再将该segment的信息写入磁盘,填写sparse indexing表。</li>
</ol>
<p>而sparse indexing表最一开始,由对input segment进行chunks的随机抽样得出(或者逐渐构建起来,反正大概是这个意思)</p>
<p>可以看到,它将segment info保留在disk中,在RAM中只保留fp2seg_id的映射,每次只需简单从磁盘中读取几个segment info即可,利用数据局部性极大地降低了磁盘IO次数。</p>
<p>Odess采用的就是类似这种capping+sparse indexing的方法。</p>
<p>将sparse indexing从原来的&lt;fp, seg_id&gt;改为&lt;fp, cid&gt;,并且每次只取top T个包含sample chunks最多的容器,从而将对segment进行cap修改为对container进行cap。仔细想想,这样确实依然保证了原算法的核心思想,也属于是segment size = container size的特例了。</p>
<p>不这个&lt;fp, cid&gt;不就是Odess中的recipe(或者说是全局指纹表)吗?乐。Odess也确实体现了这种capping+sparse indexing结合的方法【只不过进行简化了,每个chunk固定取其第一个container】。</p>

</div>


<section class="post-tags">
<div>
<span>Tag(s):</span>
<span class="tag">

</span>
</div>
<div>
<a href="javascript:window.history.back();">back</a>
<span>· </span>
<a href="/">home</a>
</div>
</section>
<section class="post-nav">

<a class="prev" rel="prev" href="/2023/11/18/compilation_principle/">编译原理</a>


<a class="next" rel="next" href="/2023/10/27/driver_develop/">驱动开发小记</a>

</section>


</article>
</div>

</div>
<footer id="footer" class="footer">
<div class="copyright">
<span>© 修年 | Powered by <a href="https://hexo.io" target="_blank">Hexo</a> & <a href="https://github.com/Siricee/hexo-theme-Chic" target="_blank">Chic</a></span>
</div>
</footer>

</div>
</body>

</html>
2 changes: 1 addition & 1 deletion 2023/11/18/compilation_principle/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,7 @@ <h1 id="拓展阅读"><a href="#拓展阅读" class="headerlink" title="拓展
<a class="prev" rel="prev" href="/2023/11/26/database/">数据库原理</a>


<a class="next" rel="next" href="/2023/10/27/driver_develop/">驱动开发小记</a>
<a class="next" rel="next" href="/2023/11/11/Indexing/">Indexing</a>

</section>

Expand Down
2 changes: 1 addition & 1 deletion 2023/12/10/deduplication_overview/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ <h3 id="dictionary-model-based-coding"><a href="#dictionary-model-based-coding"
<h3 id="delta-compression"><a href="#delta-compression" class="headerlink" title="delta compression"></a>delta compression</h3><p>它的提出是针对于小文件/相似chunk的。它的思想感觉有点类似密码学,大概是这样:</p>
<figure class="highlight css"><table><tr><td class="gutter"><pre><span class="line">1</span><br><span class="line">2</span><br><span class="line">3</span><br></pre></td><td class="code"><pre><span class="line">given file <span class="selector-tag">A</span>,<span class="selector-tag">B</span></span><br><span class="line">calc △ab,</span><br><span class="line">我们就可以通过△ab和<span class="selector-tag">B</span>来恢复出一个<span class="selector-tag">A</span></span><br></pre></td></tr></table></figure>

<p>目前正在尝试把它纳入到deduplication system中。</p>
<p>目前正在尝试把它纳入到deduplication system中。不过目前的瓶颈似乎是这样的,delta compression是要求要将当前chunk同base chunk进行对比,所以怎么找到base chunk就成了问题。</p>
<h3 id="Deduplication"><a href="#Deduplication" class="headerlink" title="Deduplication"></a>Deduplication</h3><p>总之,在compression byte-by-byte识别redundant data这样粒度太小的劣势下,通过计算“cryptographically secure hash-based fingerprints”来识别redundant data的chunk-level的deduplication优势就来了!</p>
<h3 id="Overview"><a href="#Overview" class="headerlink" title="Overview"></a>Overview</h3><p><img src="/2023/12/10/deduplication_overview/image-20231210223322234.png" alt="image-20231210223322234"></p>
<p>这里也是给了一张很棒的图来总结了上文。</p>
Expand Down
21 changes: 1 addition & 20 deletions 2023/12/10/deduplication_system_articles/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -228,26 +228,7 @@ <h1 class="post-title">Deduplication System相关文章</h1>
<blockquote>
<p>各个超链接导向对应的文章分链接。</p>
</blockquote>
<h1 id="Deduplication"><a href="#Deduplication" class="headerlink" title="Deduplication"></a>Deduplication</h1><h2 id="综述"><a href="#综述" class="headerlink" title="综述"></a><a href="/2023/12/10/deduplication_overview">综述</a></h2><h2 id="Indexing"><a href="#Indexing" class="headerlink" title="Indexing"></a>Indexing</h2><h3 id="sparce-indexing"><a href="#sparce-indexing" class="headerlink" title="sparce indexing"></a>sparce indexing</h3><p>基于chunks的去重都要求使用full index,而这RAM一般承受不起,但是纯用disk io就太慢了。所以它利用了数据局部性:</p>
<p>If two pieces of backup streams share any chunks, they are likely to share many chunks. <u>如果两个segment共享了某个chunk,那么它们很有可能共享很多chunks。</u></p>
<p><strong>是这样的流程:</strong></p>
<ol>
<li>分段为segment;</li>
<li>计算该segment的每个chunk的fp,然后对每个chunk查询其对应的sparce indexing table: &lt;fp, segment_id&gt;,记录<u>可能跟它共享很多chunk的segment</u>的segment_id;</li>
<li>读取这些segment_id对应的segment的chunk indexing table(存储在disk中);</li>
<li>for every chunks: 重复,copy entry ;不重复,add to new container</li>
<li>最后再将该segment的信息写入磁盘,填写sparce indexing表。</li>
</ol>
<p>而sparce indexing表最一开始,由对input segment进行chunks的随机抽样得出(或者逐渐构建起来,反正大概是这个意思)</p>
<p>可以看到,它将segment info保留在disk中,在RAM中只保留fp2seg_id的映射,每次只需简单从磁盘中读取几个segment info即可,利用数据局部性极大地降低了磁盘IO次数。</p>
<p>Odess采用的就是类似这种capping+sparce indexing的方法。</p>
<p>将sparce indexing从原来的&lt;fp, seg_id&gt;改为&lt;fp, cid&gt;,并且每次只取top T个包含sample chunks最多的容器,从而将对segment进行cap修改为对container进行cap。仔细想想,这样确实依然保证了原算法的核心思想,也属于是segment size = container size的特种了。</p>
<p>不这个&lt;fp, cid&gt;不就是Odess中的recipe(或者说是全局指纹表)吗?乐。Odess也确实体现了这种capping+sparce indexing结合的方法【只不过进行简化了,每个chunk固定取其第一个container】。</p>
<h2 id="Chunking"><a href="#Chunking" class="headerlink" title="Chunking"></a>Chunking</h2><h3 id="FastCDC"><a href="#FastCDC" class="headerlink" title="FastCDC"></a><a href="/2023/12/08/fastcdc">FastCDC</a></h3><h2 id="Fragment"><a href="#Fragment" class="headerlink" title="Fragment"></a>Fragment</h2><h3 id="data-layout"><a href="#data-layout" class="headerlink" title="data layout"></a>data layout</h3><h4 id="MFDedup"><a href="#MFDedup" class="headerlink" title="MFDedup"></a><a href="/2023/10/11/MFDedup">MFDedup</a></h4><p>有机会可以再看看代码实现。</p>
<h3 id="rewrite"><a href="#rewrite" class="headerlink" title="rewrite"></a>rewrite</h3><h4 id="capping"><a href="#capping" class="headerlink" title="capping"></a><a href="/2024/01/07/Capping">capping</a></h4><p>这篇文章的测试做得很友好很完善,值得精读。</p>
<p>对stream进行分段为segment;限制每个版本的容器数(主要是指引用的旧容器数);将那些包含重复块rate较小的容器所包含的重复块视为unique block进行rewrite。</p>
<h4 id="SMR-amp-amp-DePFC"><a href="#SMR-amp-amp-DePFC" class="headerlink" title="SMR &amp;&amp; DePFC"></a><a href="/2024/05/11/SMR_DePFC">SMR &amp;&amp; DePFC</a></h4><p>非常impressive的两个方法</p>
<h1 id="Restore"><a href="#Restore" class="headerlink" title="Restore"></a>Restore</h1><h2 id="cache"><a href="#cache" class="headerlink" title="cache"></a>cache</h2><h2 id="recipe"><a href="#recipe" class="headerlink" title="recipe"></a>recipe</h2><h3 id="forward-assembly"><a href="#forward-assembly" class="headerlink" title="forward-assembly"></a>forward-assembly</h3><h3 id="OdessStorage"><a href="#OdessStorage" class="headerlink" title="OdessStorage"></a>OdessStorage</h3><h1 id="GC"><a href="#GC" class="headerlink" title="GC"></a><a href="/2023/12/09/deduplication_GC">GC</a></h1>
<h1 id="综述"><a href="#综述" class="headerlink" title="综述"></a><a href="/2023/12/10/deduplication_overview">综述</a></h1><h1 id="Indexing"><a href="#Indexing" class="headerlink" title="Indexing"></a><a href="/2023/11/11/Indexing">Indexing</a></h1><h1 id="Chunking"><a href="#Chunking" class="headerlink" title="Chunking"></a>Chunking</h1><h2 id="FastCDC"><a href="#FastCDC" class="headerlink" title="FastCDC"></a><a href="/2023/12/08/fastcdc">FastCDC</a></h2><h1 id="Fragment"><a href="#Fragment" class="headerlink" title="Fragment"></a>Fragment</h1><h2 id="data-layout"><a href="#data-layout" class="headerlink" title="data layout"></a>data layout</h2><h3 id="MFDedup"><a href="#MFDedup" class="headerlink" title="MFDedup"></a><a href="/2023/10/11/MFDedup">MFDedup</a></h3><h2 id="Rewriting"><a href="#Rewriting" class="headerlink" title="Rewriting"></a><a href="/2024/05/11/Rewriting">Rewriting</a></h2><h1 id="GC"><a href="#GC" class="headerlink" title="GC"></a><a href="/2023/12/09/deduplication_GC">GC</a></h1>
</div>


Expand Down
2 changes: 1 addition & 1 deletion 2024/04/27/algorithm_questions/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ <h2 id="栈和队列"><a href="#栈和队列" class="headerlink" title="栈和
</section>
<section class="post-nav">

<a class="prev" rel="prev" href="/2024/05/11/SMR_DePFC/">SMR</a>
<a class="prev" rel="prev" href="/2024/05/11/Rewriting/">Rewriting</a>


<a class="next" rel="next" href="/2024/04/21/userspace_scheduling_framework/">COS: A User-space Scheduling Framework</a>
Expand Down
File renamed without changes
File renamed without changes
Loading

0 comments on commit 39c899e

Please sign in to comment.